##################################### # Extracting species names from text # Using ITIS ###################################### ## This code connects to an already downloaded ITIS database and uses it to search # our papers for taxonomic names and matches those names to their taxa. # Load packages library(dplyr) library(dbplyr) library(RSQLite) library(cowplot) ############################# # Variables ############################# # Load taxonomy functions source("R/taxonomy_fun.R") # Load papers papers <- read.csv('Data/ProcessedQueries/References/cleaned_papers_all_years_simple.csv',stringsAsFactors = FALSE) # Connect to ITIS database path <- "/home/matt/Downloads/itisSqlite022618/" taxonomy <- DBI::dbConnect(RSQLite::SQLite(), paste0(path,"ITIS.sqlite")) # search variables # Our human search words (in rexpr) # Some words were not included because they were problematic # like human (ex human dominated landscape) # Tourist (ex tourist areas) # driver (obvious) humans <- c("players?", "patients?", "\\bchild\\b", "children", "teenagers?", "\\bpeople\\b", "students?", "fishermen", "\\bperson\\b", "tourists?", "visitors?", "\\bhunters?\\b", "customers?", "\\brunners?", "participants?", "\\bcyclists?", "employees?", "hikers?", "athletes?", "\\bboys?\\b", "girls?", "woman", "women", "\\bman\\b", "\\bmen\\b", "adolescents?") # Our problematic common names, exact match not reg ex. These match the database common vernacular. # And was built as we found more problematic names and words bad_names <- c('scales','whites','animals','scats','blues','coppers','Blues','fliers','grunts','scavengers', 'soles','spot','spots','sole', 'permit','permits','rainbow','rainbows','Whites','whites','white','ray','lion','Lion','Ray','Here','here','Mexico','mexico','delta','Delta', 'prominents','infant','Infant','purple','purples','emperors','emperor', 'underwings','underwing','Alabama','Alabama','Car', 'Goes','Corolla', 'corolla','Helix','helix','Planes','plane','Fota','brother','scat', 'Gyros','gyros','gyro','gyros','runners','cuckoos','Ego','Gila') #################################### # Start #################################### # Get table of taxonomic units tax_units <- tbl(taxonomy, "taxonomic_units") %>% collect() # Seperate titles, keywords, and abstracts papers$keywords[is.na(papers$keywords)]<-'' papers$extra_keywords[is.na(papers$extra_keywords)]<-'' titles <- papers$title keywords <- paste0(papers$keywords,'; ',papers$extra_keywords) abstracts <- papers$abstract # Only keep species level names in Animalia animalia <- tax_units %>% filter(kingdom_id==5) %>% filter(rank_id==220) %>% filter(name_usage=="valid") %>% collect() # Exclude humans animalia <- animalia[!animalia$complete_name == "Homo sapiens",] # Get vernaculars table vernaculars <- tbl(taxonomy, "vernaculars") # Select common names in English and tsn common_names <- vernaculars %>% filter(language=="English") %>% select(tsn, vernacular_name) %>% collect() # Join to table of scientific names by tsn animals <- left_join(animalia, common_names, by="tsn") %>% select(tsn, complete_name, vernacular_name, parent_tsn, rank_id) # Create the search vector to loop through. We're going to combine # gensu_species names as well as their common vernaculars. # Take out commonly bad common names. animals <- animals %>% filter(!vernacular_name %in% bad_names) complete <- animals$complete_name vernacular_name <- animals$vernacular_name # We're using regexpr here to make everything much easier search <- paste0('\\b',complete,'s?\\b|\\b',vernacular_name,'s?\\b') search[is.na(vernacular_name)] <- paste0('\\b',complete[is.na(vernacular_name)],'s?\\b') search <- gsub('ss\\?','s?', search) ids <- animals$tsn # Run the function! #list2 <- lapply(ids, search_function, data = ids) all_sn <- search_function(data = search, ids = ids) colnames(all_sn)[colnames(all_sn)=='id'] <- 'tsn' ############################## # None genus_species level names # These codes can be combined for simplicity # if necessary. # Select common names in English and tsn common_names <- vernaculars %>% filter(language=="English") %>% select(tsn, vernacular_name) %>% collect() high_taxa <- tax_units %>% filter(kingdom_id==5) %>% filter(rank_id!=220) %>% filter(name_usage=="valid") %>% collect() htaxa <- pull(high_taxa, complete_name) # Join to table of scientific names by tsn high <- left_join(high_taxa, common_names, by="tsn") %>% select(tsn, complete_name, vernacular_name, parent_tsn, rank_id) # Exclude some common names that are problematic high <- high %>% filter(!vernacular_name %in% bad_names) ids <- high$tsn complete <- high$complete_name vernacular_name <- high$vernacular_name search <- paste0('\\b',complete,'s?\\b|\\b',vernacular_name,'s?\\b') search[is.na(vernacular_name)] <- paste0('\\b',complete[is.na(vernacular_name)],'s?\\b') search <- gsub('ss\\?','s?', search) #list3 <- lapply(ids, search_function, data = ids) all_ht <- search_function(data = search[2300:3000], ids = ids) colnames(all_ht)[colnames(all_ht)=='id'] <- 'tsn' ################################ # Putting it all together # Species level all_sn$level <- 'genus_sp' all_ht$level <- 'higher' all_all <- bind_rows(all_sn, all_ht) %>% filter(!is.na(paper_id)) %>% distinct() taxa_tsn <- all_all %>% distinct(tsn) # We now need to derive the class and order of each name ll <- lapply(taxa_tsn$tsn[],getClassOrder) taxa_class_order <- as.data.frame(do.call(rbind,ll)) all_all <- merge(all_all,taxa_class_order, by='tsn',all.x=T) all_all <- merge(all_all,common_names, by='tsn',all.x=T) # Filter out duplicate rows all_all <- all_all %>% distinct(tsn, paper_id,section,level,.keep_all=T) all_all$doi2 <- papers$doi2[all_all$paper_id] # Write it out write.csv(all_all, 'animals_in_papers.csv',row.names = FALSE ) # human search # This is exponentially faster human_df <- search_function(data=humans, terms=T) human_df <- human_df[!is.na(human_df$paper_id), ] human_df$doi2 <- papers$doi2[human_df$paper_id] write.csv(human_df, 'humans_in_papers.csv',row.names = FALSE ) ##########