mirror of https://github.com/msberends/AMR
51 changed files with 432 additions and 1055 deletions
@ -1,682 +0,0 @@
@@ -1,682 +0,0 @@
|
||||
# ==================================================================== # |
||||
# TITLE # |
||||
# Antimicrobial Resistance (AMR) Analysis # |
||||
# # |
||||
# SOURCE # |
||||
# https://gitlab.com/msberends/AMR # |
||||
# # |
||||
# LICENCE # |
||||
# (c) 2018-2020 Berends MS, Luz CF et al. # |
||||
# # |
||||
# This R package is free software; you can freely use and distribute # |
||||
# it for both personal and commercial purposes under the terms of the # |
||||
# GNU General Public License version 2.0 (GNU GPL-2), as published by # |
||||
# the Free Software Foundation. # |
||||
# # |
||||
# We created this package for both routine data analysis and academic # |
||||
# research and it was publicly released in the hope that it will be # |
||||
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. # |
||||
# Visit our website for more info: https://msberends.gitlab.io/AMR. # |
||||
# ==================================================================== # |
||||
|
||||
# --------------------------------------------------------------------------------- |
||||
# Reproduction of the `microorganisms` data set |
||||
# --------------------------------------------------------------------------------- |
||||
# Data retrieved from: |
||||
# |
||||
# [1] Catalogue of Life (CoL) through the Encyclopaedia of Life |
||||
# https://opendata.eol.org/dataset/catalogue-of-life/ |
||||
# * Download the resource file with a name like "Catalogue of Life yyyy-mm-dd" |
||||
# * Extract "taxon.tab" |
||||
# |
||||
# [2] Global Biodiversity Information Facility (GBIF) |
||||
# https://doi.org/10.15468/39omei |
||||
# * Extract "Taxon.tsv" |
||||
# |
||||
# [3] Deutsche Sammlung von Mikroorganismen und Zellkulturen (DSMZ) |
||||
# https://www.dsmz.de/support/bacterial-nomenclature-up-to-date-downloads.html |
||||
# * Download the latest "Complete List" as xlsx file (DSMZ_bactnames.xlsx) |
||||
# --------------------------------------------------------------------------------- |
||||
|
||||
library(dplyr) |
||||
library(AMR) |
||||
|
||||
# Read the three raw sources (paths are relative to the working directory).
data_col <- data.table::fread(input = "Documents/taxon.tab")
data_gbif <- data.table::fread(input = "Documents/Taxon.tsv")

# the DSMZ list is an xlsx file (only around 2.5 MB):
data_dsmz <- readxl::read_xlsx(path = "Downloads/DSMZ_bactnames.xlsx")

# Kingdom distribution of the CoL data (over 3.7M rows):
data_col %>% freq(kingdom)
#      Item          Count   Percent   Cum. Count   Cum. Percent
# ---  ----------  ----------  --------  -----------  -------------
# 1    Animalia    2,225,627     59.1%    2,225,627          59.1%
# 2    Plantae     1,177,412     31.3%    3,403,039          90.4%
# 3    Fungi         290,145      7.7%    3,693,184          98.1%
# 4    Chromista      47,126      1.3%    3,740,310          99.3%
# 5    Bacteria       14,478      0.4%    3,754,788          99.7%
# 6    Protozoa        6,060      0.2%    3,760,848          99.9%
# 7    Viruses         3,827      0.1%    3,764,675         100.0%
# 8    Archaea           610      0.0%    3,765,285         100.0%

# Kingdom distribution of the GBIF data (over 5.8M rows):
data_gbif %>% freq(kingdom)
#      Item               Count   Percent   Cum. Count   Cum. Percent
# ---  ---------------  ----------  --------  -----------  -------------
# 1    Animalia         3,264,138     55.7%    3,264,138          55.7%
# 2    Plantae          1,814,962     31.0%    5,079,100          86.7%
# 3    Fungi              538,086      9.2%    5,617,186          95.9%
# 4    Chromista          181,374      3.1%    5,798,560          99.0%
# 5    Bacteria            24,048      0.4%    5,822,608          99.4%
# 6    Protozoa            15,138      0.3%    5,837,746          99.7%
# 7    incertae sedis       9,995      0.2%    5,847,741          99.8%
# 8    Viruses              9,630      0.2%    5,857,371         100.0%
# 9    Archaea                771      0.0%    5,858,142         100.0%
||||
|
||||
|
||||
# Clean up helper function ------------------------------------------------ |
||||
# Shared by clean_new() and clean_old(): derive a short "Author, year"
# reference from the raw authorship column `ref` and make all text ASCII.
#
# Expects a data frame with a character column `ref`. Returns a tibble in
# which every column is character (a consequence of the gsub()/iconv() passes
# over all columns) and with these columns added or replaced:
#   authors2 - transliterated authorship without enclosing brackets
#   lastyear - trailing 4-digit year of `authors2`, NA if absent or in the future
#   authors  - first author's surname, with " et al." when more were listed
#   ref      - "<authors>, <lastyear>", or just <authors> without a year
clean_references <- function(data) {
  this_year <- as.integer(format(Sys.Date(), "%Y"))
  data %>%
    mutate(
      authors2 = iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT"),
      # remove leading and trailing brackets
      authors2 = gsub("^[(](.*)[)]$", "\\1", authors2),
      # only take part after brackets if there's a name
      authors2 = ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2),
                        gsub(".*[)] (.*)", "\\1", authors2),
                        authors2),
      # get year from last 4 digits (texts without a trailing year become NA,
      # with a coercion warning)
      lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2)),
      # can never be later than now
      lastyear = ifelse(lastyear > this_year, NA, lastyear),
      # get authors without last year
      authors = gsub("(.*)[0-9]{4}$", "\\1", authors2),
      # remove nonsense characters from names
      authors = gsub("[^a-zA-Z,'& -]", "", authors),
      # remove trailing and leading spaces
      authors = trimws(authors),
      # only keep first author and replace all others by 'et al'
      authors = gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors),
      # et al. always with ending dot
      authors = gsub(" et al\\.?", " et al.", authors),
      authors = gsub(" ?,$", "", authors),
      # don't start with 'sensu' or 'ehrenb'
      authors = gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE),
      # no initials, only surname
      authors = gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE),
      # combine author and year if year is available
      ref = ifelse(!is.na(lastyear),
                   paste0(authors, ", ", lastyear),
                   authors),
      # fix beginning and ending
      ref = gsub(", $", "", ref),
      ref = gsub("^, ", "", ref)) %>%
    # remove text if it contains 'Not assigned' like phylum in viruses
    mutate_all(~gsub("Not assigned", "", .)) %>%
    # Remove non-ASCII characters (these are not allowed by CRAN)
    lapply(iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>%
    # tibbles never turn strings into factors, so no `stringsAsFactors` here
    as_tibble()
}

# Keep the records that are current names of relevant microorganisms, clean
# their references and derive `fullname` from the rank.
#
# `new` must contain the columns col_id_new, kingdom, phylum, class, order,
# family, genus, species, subspecies, rank and ref.
clean_new <- function(new) {
  # not all fungi: Aspergillus, Candida, Trichophyton and Pneumocystis are the
  # most important, so only these orders are kept from the fungi
  fungal_orders <- c("Eurotiales", "Saccharomycetales", "Schizosaccharomycetales",
                     "Tremellales", "Onygenales", "Pneumocystales")
  # genera we found in our hospitals last decades (Northern Netherlands, 2002-2018)
  # NOTE(review): several entries look like misspellings that can never match
  # a taxonomic source (e.g. "Balantidum", "Bilophilia", "Brochontrix",
  # "Novospingobium", "Opistorchis") - confirm before correcting, since this
  # changes which records are kept.
  observed_genera <- c("Absidia", "Acremonium", "Actinotignum", "Alternaria", "Anaerosalibacter", "Ancylostoma", "Anisakis", "Apophysomyces",
                       "Arachnia", "Ascaris", "Aureobacterium", "Aureobasidium", "Balantidum", "Bilophilia", "Branhamella", "Brochontrix",
                       "Brugia", "Calymmatobacterium", "Catabacter", "Chilomastix", "Chryseomonas", "Cladophialophora", "Cladosporium",
                       "Clonorchis", "Cordylobia", "Curvularia", "Demodex", "Dermatobia", "Diphyllobothrium", "Dracunculus", "Echinococcus",
                       "Enterobius", "Euascomycetes", "Exophiala", "Fasciola", "Fusarium", "Hendersonula", "Hymenolepis", "Kloeckera",
                       "Koserella", "Larva", "Leishmania", "Lelliottia", "Loa", "Lumbricus", "Malassezia", "Metagonimus", "Molonomonas",
                       "Mucor", "Nattrassia", "Necator", "Novospingobium", "Onchocerca", "Opistorchis", "Paragonimus", "Paramyxovirus",
                       "Pediculus", "Phoma", "Phthirus", "Pityrosporum", "Pseudallescheria", "Pulex", "Rhizomucor", "Rhizopus", "Rhodotorula",
                       "Salinococcus", "Sanguibacteroides", "Schistosoma", "Scopulariopsis", "Scytalidium", "Sporobolomyces", "Stomatococcus",
                       "Strongyloides", "Syncephalastraceae", "Taenia", "Torulopsis", "Trichinella", "Trichobilharzia", "Trichomonas",
                       "Trichosporon", "Trichuris", "Trypanosoma", "Wuchereria")
  new %>%
    # only the ones that have no new ID to refer to a newer name
    filter(is.na(col_id_new)) %>%
    filter(
      (
        # we only want all MICROorganisms and no viruses
        !kingdom %in% c("Animalia", "Chromista", "Plantae", "Viruses")
        # and from the fungi only the orders listed above
        & !(kingdom == "Fungi" & !order %in% fungal_orders)
      )
      # or the genus is one that was observed in our hospitals
      | genus %in% observed_genera) %>%
    clean_references() %>%
    # the full name is the taxon itself for higher ranks, otherwise the
    # genus/species/subspecies combination
    mutate(fullname = trimws(case_when(rank == "family" ~ family,
                                       rank == "order" ~ order,
                                       rank == "class" ~ class,
                                       rank == "phylum" ~ phylum,
                                       rank == "kingdom" ~ kingdom,
                                       TRUE ~ paste(genus, species, subspecies))))
}

# Build the table of old names: records of `old` that point (through
# `col_id_new`) to a record in `new`, i.e. to the output of clean_new().
#
# Returns a tibble with the columns fullname (old name, authorship removed),
# ref and fullname_new, restricted to rows where both names are known and differ.
clean_old <- function(old, new) {
  old %>%
    # only the ones that exist in the new data set
    filter(col_id_new %in% new$col_id) %>%
    clean_references() %>%
    select(col_id_new, fullname, ref, authors2) %>%
    left_join(new %>% select(col_id, fullname_new = fullname), by = c(col_id_new = "col_id")) %>%
    mutate(
      # strip the authorship from the scientific name
      # NOTE(review): rows with a missing or empty `authors2` may end up with
      # an NA `fullname` here and are then dropped below - confirm this is intended.
      fullname = stringr::str_replace(string = fullname,
                                      pattern = stringr::fixed(authors2),
                                      replacement = ""),
      # drop everything from the first remaining opening bracket
      fullname = gsub("(.*)[(].*", "\\1", fullname),
      # drop the infraspecific rank markers
      fullname = gsub(" (var|f|subsp)[.]", "", fullname),
      fullname = trimws(fullname)) %>%
    select(-c("col_id_new", "authors2")) %>%
    filter(!is.na(fullname), !is.na(fullname_new)) %>%
    filter(fullname != fullname_new, !fullname %like% "^[?]")
}
||||
|
||||
# clean CoL and GBIF ----
# CoL: keep only the columns we need, under our own names, and tag the source
data_col <- as_tibble(data_col) %>%
  transmute(col_id = taxonID,
            col_id_new = acceptedNameUsageID,
            fullname = scientificName,
            kingdom,
            phylum,
            class,
            order,
            family,
            genus,
            species = specificEpithet,
            subspecies = infraspecificEpithet,
            rank = taxonRank,
            ref = scientificNameAuthorship,
            species_id = furtherInformationURL,
            source = "CoL")

# split into records without a newer name ("new") and records that refer to
# one of those ("old"); the raw table is no longer needed afterwards
data_col.new <- clean_new(data_col)
data_col.old <- clean_old(data_col, new = data_col.new)
rm(data_col)
||||
|
||||
# GBIF: drop uncertain taxonomic placements, then keep the same column set as
# for CoL and tag the source
data_gbif <- as_tibble(data_gbif) %>%
  filter(taxonRemarks != "doubtful"
         & kingdom != "incertae sedis"
         & taxonRank != "unranked") %>%
  transmute(col_id = taxonID,
            col_id_new = acceptedNameUsageID,
            fullname = scientificName,
            kingdom,
            phylum,
            class,
            order,
            family,
            genus,
            species = specificEpithet,
            subspecies = infraspecificEpithet,
            rank = taxonRank,
            ref = scientificNameAuthorship,
            species_id = as.character(parentNameUsageID),
            source = "GBIF")

# split into records without a newer name ("new") and records that refer to
# one of those ("old"); the raw table is no longer needed afterwards
data_gbif.new <- clean_new(data_gbif)
data_gbif.old <- clean_old(data_gbif, new = data_gbif.new)
rm(data_gbif)
||||
|
||||
# put CoL and GBIF together ----
# Per full name only one record is kept: the one whose pasted taxonomy
# (kingdom ... subspecies) is the longest text. CoL rows come first in the
# binding.
MOs.new <- data_col.new %>%
  bind_rows(data_gbif.new) %>%
  mutate(taxonomic_tree_length = nchar(trimws(paste(kingdom, phylum, class, order,
                                                    family, genus, species, subspecies)))) %>%
  arrange(desc(taxonomic_tree_length)) %>%
  distinct(fullname, .keep_all = TRUE) %>%
  # helper columns are not part of the final data set
  select(-col_id_new, -authors2, -authors, -lastyear, -taxonomic_tree_length) %>%
  arrange(fullname)

# old names: first occurrence per full name, CoL before GBIF
MOs.old <- data_col.old %>%
  bind_rows(data_gbif.old) %>%
  distinct(fullname, .keep_all = TRUE) %>%
  arrange(fullname)
||||
|
||||
# clean up DSMZ ---
# Bring the DSMZ list into the same shape as the CoL/GBIF tables. It has no
# IDs and no taxonomy above genus; kingdom to family are added further below.
data_dsmz <- data_dsmz %>%
  as_tibble() %>%
  transmute(col_id = NA_integer_,
            col_id_new = NA_integer_,
            fullname = "",
            genus = ifelse(is.na(GENUS), "", GENUS),
            species = ifelse(is.na(SPECIES), "", SPECIES),
            subspecies = ifelse(is.na(SUBSPECIES), "", SUBSPECIES),
            # uses the `species` column created just above
            rank = ifelse(species == "", "genus", "species"),
            ref = AUTHORS,
            species_id = as.character(RECORD_NO),
            source = "DSMZ")

# DSMZ only contains genus/(sub)species, so look up the higher taxonomy per
# genus in the combined CoL/GBIF data. Rows without a family are removed
# BEFORE reducing to one row per genus; the other way around, a genus whose
# first row lacks a family would be lost even if a later row has one.
ref_taxonomy <- MOs.new %>%
  filter(family != "") %>%
  filter(genus %in% data_dsmz$genus) %>%
  distinct(genus, .keep_all = TRUE) %>%
  select(kingdom, phylum, class, order, family, genus)

data_dsmz <- data_dsmz %>%
  left_join(ref_taxonomy, by = "genus") %>%
  # every DSMZ record is labelled as Bacteria, also when no taxonomy was found
  mutate(kingdom = "Bacteria")

data_dsmz.new <- data_dsmz %>%
  clean_new() %>%
  distinct(fullname, .keep_all = TRUE) %>%
  # same columns, in the same order, as the combined CoL/GBIF table
  select(colnames(MOs.new)) %>%
  arrange(fullname)
||||
|
||||
# combine everything ----
# CoL/GBIF records take precedence over DSMZ records with the same full name.
MOs <- MOs.new %>%
  bind_rows(data_dsmz.new) %>%
  distinct(fullname, .keep_all = TRUE) %>%
  # not the ones that are old, and not the ones without a name
  filter(!fullname %in% MOs.old$fullname,
         fullname != "") %>%
  arrange(fullname) %>%
  # only IDs that come from CoL are kept as `col_id`
  mutate(col_id = ifelse(source != "CoL", NA_integer_, col_id))

# intermediate objects are no longer needed
rm(data_col.new, data_col.old)
rm(data_gbif.new, data_gbif.old)
rm(data_dsmz, data_dsmz.new)
rm(ref_taxonomy, MOs.new)

# backup of the combined table before the manual edits below
MOs.bak <- MOs
||||
|
||||
# Trichomonas trick ----
# for species in Trypanosoma and Trichomonas we observe a lot of taxonomic info missing
MOs %>% filter(genus %in% c("Trypanosoma", "Trichomonas")) %>% View()

# copy the higher taxonomy of the genus record (the row whose full name is the
# genus itself) to every record of that genus
for (genus_name in c("Trypanosoma", "Trichomonas")) {
  for (taxon_level in c("kingdom", "phylum", "class", "order", "family")) {
    MOs[which(MOs$genus == genus_name), taxon_level] <-
      MOs[which(MOs$fullname == genus_name), ][[taxon_level]]
  }
}
rm(genus_name, taxon_level)
||||
|
||||
# label taxonomic properties that are missing (NA or empty text) as unknown
MOs <- MOs %>%
  mutate(phylum = ifelse(is.na(phylum) | phylum == "", "(unknown phylum)", phylum),
         class = ifelse(is.na(class) | class == "", "(unknown class)", class),
         order = ifelse(is.na(order) | order == "", "(unknown order)", order),
         family = ifelse(is.na(family) | family == "", "(unknown family)", family))
||||
|
||||
# Abbreviations ----
# Add abbreviations so we can easily know which ones are which ones.
# These will become valid and unique microbial IDs for the AMR package.

# abbreviate() with the settings shared by all calls below
abbreviate_taxon <- function(x, minlength, use.classes = TRUE) {
  abbreviate(x,
             minlength = minlength,
             use.classes = use.classes,
             method = "both.sides",
             strict = FALSE)
}

MOs <- MOs %>%
  arrange(kingdom, fullname) %>%
  # abbreviations are determined per kingdom, because the kingdom letter is
  # part of the final code
  group_by(kingdom) %>%
  # ranks above genus get a bracketed rank tag plus the abbreviated taxon name
  mutate(abbr_other = case_when(
    rank == "family" ~ paste0("[FAM]_", abbreviate_taxon(family, minlength = 8)),
    rank == "order" ~ paste0("[ORD]_", abbreviate_taxon(order, minlength = 8)),
    rank == "class" ~ paste0("[CLS]_", abbreviate_taxon(class, minlength = 8)),
    rank == "phylum" ~ paste0("[PHL]_", abbreviate_taxon(phylum, minlength = 8)),
    rank == "kingdom" ~ paste0("[KNG]_", kingdom),
    TRUE ~ NA_character_
  )) %>%
  mutate(abbr_genus = abbreviate_taxon(genus, minlength = 7)) %>%
  ungroup() %>%
  group_by(genus) %>%
  # species abbreviations may be the same between genera
  # because the genus abbreviation is part of the abbreviation
  mutate(abbr_species = abbreviate_taxon(stringr::str_to_title(species),
                                         minlength = 3,
                                         use.classes = FALSE)) %>%
  ungroup() %>%
  group_by(genus, species) %>%
  mutate(abbr_subspecies = abbreviate_taxon(stringr::str_to_title(subspecies),
                                            minlength = 3,
                                            use.classes = FALSE)) %>%
  ungroup() %>%
  # build the code and remove trailing underscores
  mutate(mo = gsub("_+$", "",
                   toupper(paste(
                     # first character(s): kingdom
                     ifelse(kingdom %in% c("Animalia", "Plantae"),
                            substr(kingdom, 1, 2),
                            substr(kingdom, 1, 1)),
                     # next: genus, species, subspecies - or the higher rank
                     ifelse(is.na(abbr_other),
                            paste(abbr_genus,
                                  abbr_species,
                                  abbr_subspecies,
                                  sep = "_"),
                            abbr_other),
                     sep = "_")))) %>%
  mutate(
    # 1 for the first occurrence of a code, 2 for the second, ...
    mo_occurrence = ave(seq_along(mo), mo, FUN = seq_along),
    # repeated codes must be unique too: the 2nd occurrence gets suffix "1",
    # the 3rd gets "2", ... (one fixed suffix would leave the 2nd and 3rd
    # occurrence identical)
    mo = ifelse(mo_occurrence > 1,
                paste0(mo, mo_occurrence - 1L),
                mo),
    fullname = ifelse(fullname == "",
                      trimws(paste(genus, species, subspecies)),
                      fullname)) %>%
  # put `mo` in front, followed by the rest, without the helper columns
  select(mo, everything(),
         -abbr_other, -abbr_genus, -abbr_species, -abbr_subspecies, -mo_occurrence)
||||
|
||||
# add non-taxonomic entries
# The helpers live inside local() so they do not stay behind in the workspace.
MOs <- local({
  # a row without any known taxonomy below kingdom
  placeholder <- function(code, name, kingdom_name, genus_name, rank_name) {
    data.frame(mo = code,
               col_id = NA_integer_,
               fullname = name,
               kingdom = kingdom_name,
               phylum = "(unknown phylum)",
               class = "(unknown class)",
               order = "(unknown order)",
               family = "(unknown family)",
               genus = genus_name,
               species = "(unknown species)",
               subspecies = "(unknown subspecies)",
               rank = rank_name,
               ref = NA_character_,
               species_id = "",
               source = "manually added",
               stringsAsFactors = FALSE)
  }

  # first record of the current table with this genus and species
  first_record <- function(genus_name, species_name) {
    hits <- filter(MOs, genus == genus_name, species == species_name)
    hits[1, ]
  }

  # reuse the taxonomy of `template` for an entry that is not a real taxon
  manual <- function(template, code, species_name, name) {
    mutate(template,
           mo = code,
           col_id = NA_integer_,
           species = species_name,
           fullname = name,
           ref = NA_character_,
           species_id = "",
           source = "manually added")
  }

  staphylococcus <- first_record("Staphylococcus", "")
  agalactiae <- first_record("Streptococcus", "agalactiae")
  # code of the genus Streptococcus, the prefix of all Streptococcus entries below
  strep_code <- MOs[MOs$fullname == "Streptococcus", ]$mo
  strep_group <- function(letter) {
    paste(strep_code, paste0("GR", letter), sep = "_")
  }

  bind_rows(
    MOs,
    # Unknowns
    placeholder("UNKNOWN", "(unknown name)", "(unknown kingdom)",
                "(unknown genus)", "(unknown rank)"),
    placeholder("B_GRAMN", "(unknown Gram-negatives)", "Bacteria",
                "(unknown Gram-negatives)", "species"),
    placeholder("B_GRAMP", "(unknown Gram-positives)", "Bacteria",
                "(unknown Gram-positives)", "species"),
    # CoNS
    manual(staphylococcus,
           paste(staphylococcus$mo, "CNS", sep = "_"),
           "coagulase-negative",
           "Coagulase-negative Staphylococcus (CoNS)") %>%
      mutate(rank = "species"),
    # CoPS
    manual(staphylococcus,
           paste(staphylococcus$mo, "CPS", sep = "_"),
           "coagulase-positive",
           "Coagulase-positive Staphylococcus (CoPS)") %>%
      mutate(rank = "species"),
    # Streptococci groups A, B, C, D, F, G, H, K
    # we can keep all other details, since S. pyogenes is the only member of group A
    first_record("Streptococcus", "pyogenes") %>%
      mutate(mo = strep_group("A"),
             species = "group A",
             fullname = "Streptococcus group A"),
    # we can keep all other details, since S. agalactiae is the only member of group B
    agalactiae %>%
      mutate(mo = strep_group("B"),
             species = "group B",
             fullname = "Streptococcus group B"),
    # group C borrows the taxonomy of S. dysgalactiae
    manual(first_record("Streptococcus", "dysgalactiae"),
           strep_group("C"), "group C", "Streptococcus group C"),
    # the remaining groups borrow the taxonomy of S. agalactiae
    manual(agalactiae, strep_group("D"), "group D", "Streptococcus group D"),
    manual(agalactiae, strep_group("F"), "group F", "Streptococcus group F"),
    manual(agalactiae, strep_group("G"), "group G", "Streptococcus group G"),
    manual(agalactiae, strep_group("H"), "group H", "Streptococcus group H"),
    manual(agalactiae, strep_group("K"), "group K", "Streptococcus group K"),
    # Beta-haemolytic Streptococci
    manual(agalactiae,
           paste(strep_code, "HAE", sep = "_"),
           "beta-haemolytic",
           "Beta-haemolytic Streptococcus")
  )
})
||||
|
||||
|
||||
# everything distinct? prints the number of repeated `mo` codes
MOs$mo %>% duplicated() %>% sum()
# and the column names, for a visual check
names(MOs)
||||
|
||||
# set prevalence per species: 1, 2 or 3, where the first matching rule wins
MOs <- MOs %>%
  mutate(prevalence = case_when(
    # group 1
    class %in% "Gammaproteobacteria"
    | genus %in% c("Enterococcus", "Staphylococcus", "Streptococcus")
    | mo %in% c("UNKNOWN", "B_GRAMN", "B_GRAMP")
    ~ 1,
    # group 2
    phylum %in% c("Proteobacteria", "Firmicutes", "Actinobacteria", "Sarcomastigophora")
    | genus %in% c("Aspergillus", "Bacteroides", "Candida", "Capnocytophaga",
                   "Chryseobacterium", "Cryptococcus", "Elisabethkingia",
                   "Flavobacterium", "Fusobacterium", "Giardia", "Leptotrichia",
                   "Mycoplasma", "Prevotella", "Rhodotorula", "Treponema",
                   "Trichophyton", "Trichomonas", "Ureaplasma")
    | rank %in% c("kingdom", "phylum", "class", "order", "family")
    ~ 2,
    # group 3: everything else
    TRUE ~ 3
  ))
||||
|
||||
# sort by name and turn both tables into plain data frames
MOs <- MOs %>%
  arrange(fullname) %>%
  as.data.frame(stringsAsFactors = FALSE)
MOs.old <- as.data.frame(MOs.old, stringsAsFactors = FALSE)

# final column types: integer IDs and the class "mo" for the codes
MOs$col_id <- as.integer(MOs$col_id)
class(MOs$mo) <- "mo"
||||
|
||||
# get differences in MO codes between this data and the package version

# Package records whose code + name combination no longer exists, with the
# code that the same full name has in the new data. `mo_new` and
# `fullname_new` are NA when the name is not in the new data at all.
# The lookup is on `fullname`: the rows kept here are exactly the ones whose
# pasted code + name has no counterpart in MOs, so a join on that pasted text
# could never find a match.
MO_diff <- AMR::microorganisms %>%
  mutate(pastedtext = paste(mo, fullname)) %>%
  filter(!pastedtext %in% paste(MOs$mo, MOs$fullname)) %>%
  select(mo_old = mo, fullname) %>%
  left_join(MOs %>%
              transmute(mo_new = mo, fullname, fullname_new = fullname),
            by = "fullname") %>%
  select(mo_old, mo_new, fullname_new)

# names present in both versions whose code changed, leaving out old codes
# that contain a bracketed rank tag
mo_diff2 <- AMR::microorganisms %>%
  select(mo, fullname) %>%
  left_join(MOs %>%
              select(mo, fullname),
            by = "fullname",
            suffix = c("_old", "_new")) %>%
  filter(mo_old != mo_new,
         #!mo_new %in% mo_old,
         !mo_old %like% "\\[")

# codes from the package's internal translation table, with the full name
# they have in the package and the code of that name in the new data
mo_diff3 <- local({
  # fetched once and used for both columns
  trans_tbl <- AMR:::make_trans_tbl()
  tibble(previous_old = names(trans_tbl),
         previous_new = trans_tbl) %>%
    left_join(AMR::microorganisms %>% select(mo, fullname), by = c(previous_new = "mo")) %>%
    left_join(MOs %>% select(mo_new = mo, fullname), by = "fullname")
})
||||
|
||||
# which genera gained the most names compared to the package version?
MOs %>%
  filter(!fullname %in% AMR::microorganisms$fullname) %>%
  freq(genus)

# which genera lost the most names? (Chromista left out; a name only counts
# as lost when it is neither a current nor an old name in the new data)
AMR::microorganisms %>%
  filter(kingdom != "Chromista",
         !fullname %in% MOs$fullname,
         !fullname %in% MOs.old$fullname) %>%
  freq(genus)
||||
|
||||
|
||||
# save
saveRDS(MOs, "microorganisms.rds")
saveRDS(MOs.old, "microorganisms.old.rds")

# on the server, do:
# Read the two files written above first. Without these assignments the names
# `microorganisms` and `microorganisms.old` are not defined by this script, and
# with the AMR package attached `microorganisms` would resolve to the data set
# that ships with the package instead of the new one.
# NOTE(review): assumes both .rds files are in the working directory - adjust
# the paths if they were copied elsewhere.
microorganisms <- readRDS("microorganisms.rds")
microorganisms.old <- readRDS("microorganisms.old.rds")
usethis::use_data(microorganisms, overwrite = TRUE, version = 2)
usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2)
rm(microorganisms)
rm(microorganisms.old)
||||
|
||||
# TO DO AFTER THIS |
||||
# * Update the year and dim()s in R/data.R |
||||
# * Rerun data-raw/reproduction_of_rsi_translation.R |
||||
# * Run unit tests |
Binary file not shown.
Binary file not shown.