mirror of https://github.com/msberends/AMR
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
476 lines
20 KiB
476 lines
20 KiB
# ==================================================================== # |
|
# TITLE # |
|
# Antimicrobial Resistance (AMR) Data Analysis for R # |
|
# # |
|
# SOURCE # |
|
# https://github.com/msberends/AMR # |
|
# # |
|
# LICENCE # |
|
# (c) 2018-2021 Berends MS, Luz CF et al. # |
|
# Developed at the University of Groningen, the Netherlands, in # |
|
# collaboration with non-profit organisations Certe Medical # |
|
# Diagnostics & Advice, and University Medical Center Groningen. # |
|
# # |
|
# This R package is free software; you can freely use and distribute # |
|
# it for both personal and commercial purposes under the terms of the # |
|
# GNU General Public License version 2.0 (GNU GPL-2), as published by # |
|
# the Free Software Foundation. # |
|
# We created this package for both routine data analysis and academic # |
|
# research and it was publicly released in the hope that it will be # |
|
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. # |
|
# # |
|
# Visit our website for the full manual and a complete tutorial about # |
|
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ # |
|
# ==================================================================== # |
|
|
|
# Go to https://lpsn.dsmz.de/downloads (register first) and download the latest CSV file. |
|
|
|
file_location <- "data-raw/taxonomy.csv" |
|
|
|
library(tidyverse) |
|
library(AMR) |
|
|
|
# these should still work after this update |
|
test_fullname <- microorganisms$fullname |
|
test_mo <- microorganisms$mo |
|
|
|
|
|
# Helper functions -------------------------------------------------------- |
|
|
|
get_author_year <- function(ref) { |
|
# Only keep first author, e.g. transform 'Smith, Jones, 2011' to 'Smith et al., 2011' |
|
|
|
authors2 <- iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT") |
|
authors2 <- gsub(" ?\\(Approved Lists [0-9]+\\) ?", " () ", authors2) |
|
authors2 <- gsub(" [)(]+ $", "", authors2) |
|
# remove leading and trailing brackets |
|
authors2 <- trimws(gsub("^[(](.*)[)]$", "\\1", authors2)) |
|
# only take part after brackets if there's a name |
|
authors2 <- ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2), |
|
gsub(".*[)] (.*)", "\\1", authors2), |
|
authors2) |
|
# get year from last 4 digits |
|
lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2)) |
|
# can never be later than now |
|
lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")), |
|
NA, |
|
lastyear) |
|
# get authors without last year |
|
authors <- gsub("(.*)[0-9]{4}$", "\\1", authors2) |
|
# remove nonsense characters from names |
|
authors <- gsub("[^a-zA-Z,'& -]", "", authors) |
|
# remove trailing and leading spaces |
|
authors <- trimws(authors) |
|
# only keep first author and replace all others by 'et al' |
|
authors <- gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors) |
|
# et al. always with ending dot |
|
authors <- gsub(" et al\\.?", " et al.", authors) |
|
authors <- gsub(" ?,$", "", authors) |
|
# don't start with 'sensu' or 'ehrenb' |
|
authors <- gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE) |
|
# no initials, only surname |
|
authors <- gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE) |
|
# combine author and year if year is available |
|
ref <- ifelse(!is.na(lastyear), |
|
paste0(authors, ", ", lastyear), |
|
authors) |
|
# fix beginning and ending |
|
ref <- gsub(", $", "", ref) |
|
ref <- gsub("^, ", "", ref) |
|
ref <- gsub("^(emend|et al.,?)", "", ref) |
|
ref <- trimws(ref) |
|
ref <- gsub("'", "", ref) |
|
|
|
# a lot start with a lowercase character - fix that |
|
ref[!grepl("^d[A-Z]", ref)] <- gsub("^([a-z])", "\\U\\1", ref[!grepl("^d[A-Z]", ref)], perl = TRUE) |
|
# specific one for the French that are named dOrbigny |
|
ref[grepl("^d[A-Z]", ref)] <- gsub("^d", "d'", ref[grepl("^d[A-Z]", ref)]) |
|
ref <- gsub(" +", " ", ref) |
|
ref |
|
} |
|
|
|
df_remove_nonASCII <- function(df) { |
|
# Remove non-ASCII characters (these are not allowed by CRAN) |
|
df %>% |
|
mutate_if(is.character, iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>% |
|
# also remove invalid characters |
|
mutate_if(is.character, ~gsub("[\"'`]+", "", .)) %>% |
|
AMR:::dataset_UTF8_to_ASCII() |
|
} |
|
|
|
abbreviate_mo <- function(x, minlength = 5, prefix = "", ...) { |
|
# keep a starting Latin ae |
|
suppressWarnings( |
|
gsub("^ae", "\u00E6\u00E6", x, ignore.case = TRUE) %>% |
|
abbreviate(minlength = minlength, |
|
use.classes = TRUE, |
|
method = "both.sides", ...) %>% |
|
paste0(prefix, .) %>% |
|
toupper() %>% |
|
gsub("(\u00C6|\u00E6)+", "AE", .) |
|
) |
|
} |
|
|
|
# Read data --------------------------------------------------------------- |
|
|
|
taxonomy <- read_csv(file_location) |
|
|
|
# Create synonyms --------------------------------------------------------- |
|
|
|
new_synonyms <- taxonomy %>% |
|
left_join(taxonomy, |
|
by = c("record_lnk" = "record_no"), |
|
suffix = c("", ".new")) %>% |
|
filter(!is.na(record_lnk)) %>% |
|
mutate_all(~ifelse(is.na(.), "", .)) %>% |
|
transmute(fullname = trimws(paste(genus_name, sp_epithet, subsp_epithet)), |
|
fullname_new = trimws(paste(genus_name.new, sp_epithet.new, subsp_epithet.new)), |
|
ref = get_author_year(authors), |
|
prevalence = 0) %>% |
|
distinct(fullname, .keep_all = TRUE) %>% |
|
filter(fullname != fullname_new) %>% |
|
# this part joins this table to itself to correct for entries that had >1 renames, |
|
# such as: |
|
# Bacteroides tectum -> Bacteroides tectus -> Bacteroides pyogenes |
|
left_join(., ., |
|
by = c("fullname_new" = "fullname"), |
|
suffix = c("", ".2")) %>% |
|
mutate(fullname_new = ifelse(!is.na(fullname_new.2), fullname_new.2, fullname_new), |
|
ref = ifelse(!is.na(ref.2), ref.2, ref)) %>% |
|
select(-ends_with(".2")) |
|
|
|
mo_became_synonym <- microorganisms %>% |
|
filter(fullname %in% new_synonyms$fullname) |
|
|
|
updated_microorganisms <- taxonomy %>% |
|
filter(is.na(record_lnk)) %>% |
|
mutate_all(~ifelse(is.na(.), "", .)) %>% |
|
transmute(mo = "", |
|
fullname = trimws(paste(genus_name, sp_epithet, subsp_epithet)), |
|
kingdom = "Bacteria", |
|
phylum = "", |
|
class = "", |
|
order = "", |
|
family = "", |
|
genus = trimws(genus_name), |
|
species = trimws(replace_na(sp_epithet, "")), |
|
subspecies = trimws(replace_na(subsp_epithet, "")), |
|
rank = case_when(subspecies == "" & species == "" ~ "genus", |
|
subspecies == "" ~ "species", |
|
TRUE ~ "subsp."), |
|
ref = get_author_year(authors), |
|
species_id = as.character(record_no), |
|
source = "LPSN", |
|
prevalence = 0, |
|
snomed = NA) |
|
|
|
new_microorganisms <- updated_microorganisms %>% |
|
filter(!fullname %in% microorganisms$fullname) |
|
|
|
genera_with_mo_code <- updated_microorganisms %>% |
|
filter(genus %in% (microorganisms %>% filter(kingdom == "Bacteria", rank == "genus") %>% pull(genus))) %>% |
|
distinct(genus) %>% |
|
left_join(microorganisms %>% filter(kingdom == "Bacteria", rank == "genus") %>% select(mo, genus), |
|
by = "genus") |
|
|
|
genera_without_mo_code <- updated_microorganisms %>% |
|
filter(!genus %in% genera_with_mo_code$genus) %>% |
|
pull(genus) %>% |
|
unique() |
|
|
|
genera_without_mo_code_abbr <- genera_without_mo_code %>% abbreviate_mo(5, prefix = "B_") |
|
genera_without_mo_code_abbr[genera_without_mo_code_abbr %in% microorganisms$mo] <- abbreviate_mo(genera_without_mo_code[genera_without_mo_code_abbr %in% microorganisms$mo], 6, prefix = "B_") |
|
genera_without_mo_code_abbr[genera_without_mo_code_abbr %in% microorganisms$mo] <- abbreviate_mo(genera_without_mo_code[genera_without_mo_code_abbr %in% microorganisms$mo], 7, prefix = "B_") |
|
# all unique?? |
|
sum(genera_without_mo_code_abbr %in% microorganisms$mo) == 0 |
|
|
|
genus_abb <- tibble(genus = genera_without_mo_code, |
|
abbr = genera_without_mo_code_abbr) %>% |
|
bind_rows(microorganisms %>% |
|
filter(kingdom == "Bacteria", rank == "genus", !genus %in% genera_without_mo_code) %>% |
|
transmute(genus, abbr = as.character(mo))) %>% |
|
arrange(genus) |
|
|
|
|
|
# Update taxonomy --------------------------------------------------------- |
|
|
|
# fill in the taxonomy of new genera |
|
updated_taxonomy <- tibble(phylum = character(0), |
|
class = character(0), |
|
order = character(0), |
|
family = character(0), |
|
genus = character(0)) |
|
for (page in LETTERS) { |
|
message("Downloading page ", page, "... ", appendLF = FALSE) |
|
url <- paste0("https://lpsn.dsmz.de/genus?page=", page) |
|
|
|
x <- xml2::read_html(url) %>% |
|
rvest::html_node(".main-list") %>% |
|
# evety list element with a set <id> attribute |
|
rvest::html_nodes("li[id]") |
|
for (i in seq_len(length(x))) { |
|
txt <- x %>% |
|
magrittr::extract2(i) %>% |
|
rvest::html_text() %>% |
|
gsub("\\[[A-Za-z]+, no [a-z]+\\]", "NA", .) %>% |
|
gsub("Candidatus ", "", ., fixed = TRUE) %>% |
|
gsub("[ \t\r\n\"]+", "|", .) %>% |
|
gsub("\\|ShowHide.*", "", .) %>% |
|
gsub("[\\[\\]]", "", ., fixed = TRUE) %>% |
|
gsub("^\\|", "", .) %>% |
|
strsplit("|", fixed = TRUE) %>% |
|
unlist() |
|
txt[txt == "NA"] <- "" |
|
txt <- gsub("[^A-Za-z]+", "", txt) |
|
updated_taxonomy <- updated_taxonomy %>% |
|
bind_rows(tibble(phylum = txt[2], |
|
class = txt[3], |
|
order = txt[4], |
|
family = txt[5], |
|
genus = txt[6])) |
|
} |
|
message(length(x), " entries (total ", nrow(updated_taxonomy), ")") |
|
} |
|
|
|
# Create new microorganisms ----------------------------------------------- |
|
|
|
new_microorganisms <- new_microorganisms %>% |
|
left_join(genus_abb, by = "genus") %>% |
|
group_by(genus) %>% |
|
mutate(species_abb = abbreviate_mo(species, 4)) %>% |
|
group_by(genus, species) %>% |
|
mutate(subspecies_abb = abbreviate_mo(subspecies, 4)) %>% |
|
ungroup() %>% |
|
mutate(mo = paste(abbr, species_abb, subspecies_abb, sep = "_"), |
|
mo = gsub("_+$", "", mo)) %>% |
|
select(-matches("abb")) |
|
|
|
# add taxonomy new microorganisms |
|
MOs <- microorganisms %>% |
|
mutate(mo = as.character(mo)) %>% |
|
bind_rows(new_microorganisms) %>% |
|
arrange(fullname) |
|
|
|
# unique MO codes |
|
MOs$mo[which(duplicated(MOs$mo))] <- paste0(MOs$mo[which(duplicated(MOs$mo))], 1) |
|
# all unique? |
|
!any(duplicated(MOs$mo)) |
|
|
|
MOs <- MOs %>% |
|
# remove entries that are now a synonym |
|
filter(!fullname %in% new_synonyms$fullname) %>% |
|
# update the taxonomy |
|
left_join(updated_taxonomy, by = "genus", suffix = c("", ".new")) %>% |
|
mutate(phylum = ifelse(!is.na(phylum.new), phylum.new, phylum), |
|
class = ifelse(!is.na(class.new), class.new, class), |
|
order = ifelse(!is.na(order.new), order.new, order), |
|
family = ifelse(!is.na(family.new), family.new, family)) %>% |
|
select(-ends_with(".new")) %>% |
|
# update prevalence based on taxonomy (Berends et al., 2021) |
|
mutate(prevalence = case_when( |
|
class == "Gammaproteobacteria" |
|
| genus %in% c("Enterococcus", "Staphylococcus", "Streptococcus") |
|
~ 1, |
|
kingdom %in% c("Archaea", "Bacteria", "Chromista", "Fungi") |
|
& (phylum %in% c("Proteobacteria", |
|
"Firmicutes", |
|
"Actinobacteria", |
|
"Sarcomastigophora") |
|
| genus %in% c("Absidia", "Acremonium", "Actinotignum", "Alternaria", "Anaerosalibacter", "Apophysomyces", |
|
"Arachnia", "Aspergillus", "Aureobacterium", "Aureobasidium", "Bacteroides", "Basidiobolus", |
|
"Beauveria", "Blastocystis", "Branhamella", "Calymmatobacterium", "Candida", "Capnocytophaga", |
|
"Catabacter", "Chaetomium", "Chryseobacterium", "Chryseomonas", "Chrysonilia", "Cladophialophora", |
|
"Cladosporium", "Conidiobolus", "Cryptococcus", "Curvularia", "Exophiala", "Exserohilum", |
|
"Flavobacterium", "Fonsecaea", "Fusarium", "Fusobacterium", "Hendersonula", "Hypomyces", |
|
"Koserella", "Lelliottia", "Leptosphaeria", "Leptotrichia", "Malassezia", "Malbranchea", |
|
"Mortierella", "Mucor", "Mycocentrospora", "Mycoplasma", "Nectria", "Ochroconis", |
|
"Oidiodendron", "Phoma", "Piedraia", "Pithomyces", "Pityrosporum", "Prevotella", "Pseudallescheria", |
|
"Rhizomucor", "Rhizopus", "Rhodotorula", "Scolecobasidium", "Scopulariopsis", "Scytalidium", |
|
"Sporobolomyces", "Stachybotrys", "Stomatococcus", "Treponema", "Trichoderma", "Trichophyton", |
|
"Trichosporon", "Tritirachium", "Ureaplasma") |
|
| rank %in% c("kingdom", "phylum", "class", "order", "family")) |
|
~ 2, |
|
TRUE ~ 3 |
|
)) |
|
|
|
# add all mssing genera, families and orders |
|
MOs <- MOs %>% |
|
bind_rows( |
|
MOs %>% |
|
arrange(genus, species) %>% |
|
distinct(genus, .keep_all = TRUE) %>% |
|
filter(rank == "species", source != "manually added") %>% |
|
mutate(mo = gsub("^([A-Z]_[A-Z]+)_.*", "\\1", mo), |
|
fullname = genus, |
|
species = "", |
|
subspecies = "", |
|
rank = "genus", |
|
species_id = "", |
|
snomed = NA, |
|
ref = NA_character_), |
|
MOs %>% |
|
group_by(family) %>% |
|
filter(!any(rank == "family") & n() > 1) %>% |
|
ungroup() %>% |
|
arrange(family) %>% |
|
distinct(family, .keep_all = TRUE) %>% |
|
filter(!family %in% c("", NA), source != "manually added") %>% |
|
mutate(mo = paste0(substr(kingdom, 1, 1), "_[FAM]_", |
|
abbreviate(family, |
|
minlength = 8, |
|
use.classes = TRUE, |
|
method = "both.sides", |
|
strict = FALSE)), |
|
mo = toupper(mo), |
|
fullname = family, |
|
genus = "", |
|
species = "", |
|
subspecies = "", |
|
rank = "family", |
|
species_id = "", |
|
snomed = NA, |
|
ref = NA_character_), |
|
MOs %>% |
|
group_by(order) %>% |
|
filter(!any(rank == "order") & n() > 1) %>% |
|
ungroup() %>% |
|
arrange(order) %>% |
|
distinct(order, .keep_all = TRUE) %>% |
|
filter(!order %in% c("", NA), source != "manually added") %>% |
|
mutate(mo = paste0(substr(kingdom, 1, 1), "_[ORD]_", |
|
abbreviate(order, |
|
minlength = 8, |
|
use.classes = TRUE, |
|
method = "both.sides", |
|
strict = FALSE)), |
|
mo = toupper(mo), |
|
fullname = order, |
|
family = "", |
|
genus = "", |
|
species = "", |
|
subspecies = "", |
|
rank = "order", |
|
species_id = "", |
|
snomed = NA, |
|
ref = NA_character_) |
|
) %>% |
|
arrange(fullname) |
|
|
|
# clean up |
|
MOs <- MOs %>% |
|
df_remove_nonASCII() |
|
|
|
# Add LPSN record IDs ----------------------------------------------------- |
|
|
|
records_ids <- taxonomy %>% |
|
mutate(across(1:3, function(x) { x[is.na(x)] <- ""; x}), |
|
fullname = trimws(paste(genus_name, sp_epithet, subsp_epithet))) %>% |
|
transmute(fullname, species_id = as.numeric(record_no)) %>% |
|
arrange(fullname, species_id) %>% |
|
distinct(fullname, .keep_all = TRUE) |
|
message("Adding ", sum(records_ids$fullname %in% microorganisms$fullname), " LPSN record IDs") |
|
MOs <- MOs %>% |
|
select(-species_id) %>% |
|
left_join(records_ids, by = "fullname") %>% |
|
relocate(species_id, .after = ref) %>% |
|
mutate(source = case_when(!is.na(species_id) ~ "LPSN", |
|
source %unlike% "manual" ~ "CoL", |
|
TRUE ~ source)) |
|
|
|
# Merge synonyms ---------------------------------------------------------- |
|
|
|
# remove synonyms that are now valid names |
|
MOs.old <- microorganisms.old %>% |
|
# add new synonyms |
|
bind_rows(new_synonyms) %>% |
|
filter(!fullname %in% MOs$fullname) %>% |
|
arrange(fullname) %>% |
|
distinct(fullname, fullname_new, .keep_all = TRUE) %>% |
|
# add prevalence to old taxonomic names |
|
select(-prevalence) %>% |
|
left_join(MOs %>% select(fullname, prevalence), by = c("fullname_new" = "fullname")) %>% |
|
# clean up |
|
df_remove_nonASCII() |
|
|
|
message("microorganisms new: ", sum(!MOs$fullname %in% c(microorganisms$fullname, MOs.old$fullname))) |
|
message("microorganisms renamed: ", sum(!MOs.old$fullname %in% microorganisms.old$fullname)) |
|
|
|
|
|
# Save -------------------------------------------------------------------- |
|
|
|
# class <mo> |
|
class(MOs$mo) <- c("mo", "character") |
|
|
|
microorganisms <- MOs |
|
microorganisms.old <- MOs.old |
|
|
|
# --- Moraxella catarrhalis was named Branhamella catarrhalis (Catlin, 1970), but this is unaccepted in clinical microbiology |
|
# we keep them both |
|
microorganisms <- microorganisms %>% |
|
bind_rows(microorganisms %>% |
|
filter(fullname == "Branhamella catarrhalis") %>% |
|
mutate(mo = "B_MRXLL_CTRR", |
|
fullname = "Moraxella catarrhalis", |
|
genus = "Moraxella", |
|
ref = "Henriksen et al., 1968", |
|
species_id = "a374f6f0868e05f9c0f5077b60ee0a6c", |
|
snomed = as.list(24226003))) %>% |
|
arrange(fullname) %>% |
|
df_remove_nonASCII() |
|
microorganisms.old <- microorganisms.old %>% |
|
filter(fullname != "Moraxella catarrhalis") |
|
# --- |
|
|
|
# (this would be a great moment to run data-raw/snomed.R as well) |
|
|
|
# on the server, do: |
|
usethis::use_data(microorganisms, overwrite = TRUE, version = 2, compress = "xz") |
|
usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2, compress = "xz") |
|
rm(microorganisms) |
|
rm(microorganisms.old) |
|
|
|
# DON'T FORGET TO UPDATE R/globals.R! |
|
|
|
# load new data sets |
|
devtools::load_all(".") |
|
|
|
# reset previously changed mo codes |
|
rsi_translation$mo <- as.mo(rsi_translation$mo, language = NULL) |
|
usethis::use_data(rsi_translation, overwrite = TRUE, version = 2, compress = "xz") |
|
rm(rsi_translation) |
|
|
|
microorganisms.codes$mo <- as.mo(microorganisms.codes$mo, language = NULL) |
|
# new NAs introduced? |
|
any(is.na(microorganisms.codes$mo)) |
|
usethis::use_data(microorganisms.codes, overwrite = TRUE, version = 2, compress = "xz") |
|
rm(microorganisms.codes) |
|
|
|
example_isolates$mo <- as.mo(example_isolates$mo, language = NULL) |
|
usethis::use_data(example_isolates, overwrite = TRUE, version = 2) |
|
rm(example_isolates) |
|
|
|
intrinsic_resistant$microorganism <- suppressMessages(mo_name(intrinsic_resistant$microorganism)) |
|
usethis::use_data(intrinsic_resistant, overwrite = TRUE, version = 2) |
|
rm(intrinsic_resistant) |
|
|
|
# load new data sets again |
|
devtools::load_all(".") |
|
source("data-raw/_internals.R") |
|
devtools::load_all(".") |
|
|
|
|
|
# Test updates ------------------------------------------------------------ |
|
|
|
# and check: these codes should not be missing (will otherwise throw a unit test error): |
|
AMR::microorganisms.codes %>% filter(!mo %in% MOs$mo) |
|
AMR::rsi_translation %>% filter(!mo %in% MOs$mo) |
|
AMR:::microorganisms.translation %>% filter(!mo_new %in% MOs$mo) |
|
AMR::example_isolates %>% filter(!mo %in% MOs$mo) |
|
|
|
# Don't forget to add SNOMED codes! (data-raw/snomed.R) |
|
|
|
# run the unit tests |
|
Sys.setenv(NOT_CRAN = "true") |
|
testthat::test_file("tests/testthat/test-data.R") |
|
testthat::test_file("tests/testthat/test-mo.R") |
|
testthat::test_file("tests/testthat/test-mo_property.R")
|
|
|