library(tidyverse)
library(RColorBrewer)

# Arguments
path <- "./Data/ProcessedQueries/References/"
# path.plots <- "./Rocio/Plots/"
path_processed_dictionaries <- "./Data/Dictionary/Papers-Term/"
path_dictionary_info <- "./Data/Dictionary/"

source("R/methods_cat_analysis.R")

papers <- read.csv(file = paste0(path,"cleaned_papers_all_years_simple.csv"),stringsAsFactors = FALSE)

data_decade <- papers %>% 
  filter(pubyear > 2008 & pubyear < 2019)


dictionary <- "Methods"

load(paste0(path_processed_dictionaries,"paper-term-dictionary-",dictionary,".RData"))
# both matrices have the keywords ordered alphabetically:

total_useful_papers <- apply(matrix_CatTerm[2:ncol(matrix_CatTerm)],1,sum)
matrix_CatTerm <- matrix_CatTerm[total_useful_papers>0,]

keywords <- unlist(lapply(strsplit(colnames(matrix_CatTerm), " : ")[-1],"[[",2))
colnames(matrix_CatTerm) <- c("doi",keywords)
rownames(matrix_CatTerm) <- matrix_CatTerm$doi

matrix_CatTerm <-
  matrix_CatTerm[,-1] %>%
  select(sort(colnames(.))) # sort columns by alphabetical order

synonyms_keywords <- read_csv(paste0(path_dictionary_info,"Synonyms-Methods.csv"))

res <- methods_cat_analysis(matrix_CatTerm = matrix_CatTerm, synonyms_keywords, col_cat = 3, data_decade)
## print(res[[1]]) gives the output of the Table "Percentage of papers using each type of statistical method."

# Internally, I'm filtering out tests and other stuff:
filter_out_methods <- unique(c(which(synonyms_keywords$meaning2 == "test" | 
    synonyms_keywords$meaning2 == "model selection" | 
    synonyms_keywords$meaning2 == "simulation" |
    is.na(synonyms_keywords$meaning2) == TRUE |
    synonyms_keywords$meaning3 == "other"),
  grep("likelihood",synonyms_keywords$keyword)))
synonyms_keywords <- as.data.frame(synonyms_keywords[-filter_out_methods,])

col_cat=3
ext_subcat <- unique(synonyms_keywords[ , col_cat + 1])

matrix_Ext_SubCat <- matrix(0,ncol=length(ext_subcat),nrow=dim(matrix_CatTerm))
colnames(matrix_Ext_SubCat) <- ext_subcat
rownames(matrix_Ext_SubCat) <- rownames(matrix_CatTerm)

for (i in 1:length(ext_subcat)){ # for each subcategory
  terms_ext <- synonyms_keywords$keyword[which(synonyms_keywords[ , col_cat + 1] == ext_subcat[i])] # which keywords correspond to the subcategory
  col_matrix <- which(colnames(matrix_CatTerm) %in% terms_ext) # which columns in matrix_CatTerm does that represent
  # now check with rows have at least one "1" among those categories
  # first sum by row. it should be >= 1
  ind_matrix <- which(apply(matrix_CatTerm[,col_matrix],1,sum) > 0)
  matrix_Ext_SubCat[ind_matrix,i] <- 1
}


total_ext <- sum(rowSums(matrix_Ext_SubCat)>0)

table_sub <- sort(round(apply(matrix_Ext_SubCat,2,sum)/total_ext*100,1),decreasing = TRUE)

# Now, join with year

df_Ext_SubCat <- as.data.frame(matrix_Ext_SubCat) %>% 
  mutate(doi = rownames(matrix_Ext_SubCat))

joined_df <- df_Ext_SubCat %>% left_join(data_decade, by = "doi") %>% select(ext_subcat,pubyear)

# And counting by category by year

joined_df_year <- joined_df %>% group_by(pubyear) %>% summarise_all(sum)

joined_df_nozero <- joined_df[apply(joined_df[,ext_subcat],1,sum) > 0,]

sum_year <- joined_df_nozero %>% group_by(pubyear) %>% tally()

joined_df_prop_year <- joined_df_year[,ext_subcat]/matrix(rep(sum_year$n,each=length(ext_subcat)),ncol=length(ext_subcat),byrow=TRUE)

joined_df_prop_year <- joined_df_prop_year %>% mutate(year = joined_df_year$pubyear) 

df_plot_prop <- joined_df_prop_year %>% gather(key = subcategory, value = prop_papers, -year)

plot_df <- df_plot_prop
head(plot_df)
plot_df$subcategory <- rep(c('Generic','Spatial','Movement','Time-series','Social','Spatial-temporal'), each = 10)

# Run a quick linear model to measure which trend lines are positive or negative
# we'll reference this when we choose our colors
here <- by(plot_df, plot_df$subcategory, function(x)
  lm(x$prop_papers ~ x$year)$coefficients[2]
)
plot_df$subcategory <- factor(plot_df$subcategory, levels= names(sort(here)))
# Create a grouping variable based on this value

grouping <- data.frame(subcategory = c(names(here)[here<=0.003 & here>=(-0.003)],names(here)[here<(-0.003)],names(here)[here>0.003]))
grouping$group <- seq_along(grouping$subcategory)

plot_df <- merge(plot_df,grouping, by='subcategory')

# Now to make our aesthetic features which will be added with scale_*_manual()
# Colors
# Make a color ramp where the amount of 'grays' will determine the highlighted categories
Tol_muted <- c('#88CCEE', '#44AA99', '#117733', '#332288', '#DDCC77', '#999933','#CC6677', '#882255', '#AA4499', '#DDDDDD')

#Okabe_Ito <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#000000")
colorz <- Tol_muted[1:(length(here)) %% (length(Tol_muted))]
# change problematic colors to gray60
# gray 60 "#7f7f7f"
# black "#000000"
names(colorz) <- names(sort(here))
colorz
colorz[6] <- "#7f7f7f"


# line types
# just need to spread linetypes out enough so that the color and alpha can help distinguish as well
# manual 5 = dash, 3 = dotted, 1 = solid
linetypez <- c(5,3,3,3,3,1)
# or random
# linetypez <- rep(1:6,times=ceiling(length(levels(plot_df$Topic))/6))
# linetypez <- linetypez[seq_along(levels(plot_df$Topic))]
names(linetypez) <- names(sort(here))

# alpha
# Changing alpha will help to make the important categories pop.
# Create a gradient of alphas from 1 -> .2 -> so none trend lines are grayed out.
nz <- length(here)
# automatically
#alphaz <- c((1*nz/2):(.2*nz/2)/nz*2,(.2*nz/2):(1*nz/2)/nz*2,ifelse(nz%%2==0,NULL,1))
# or manually

alphaz <- c(1,.7,.4,.4,.4,1)
names(alphaz) <- names(sort(here))

# line width
sizez <-  c(2,1,1,1,1,2)
#names(sizez) <- names(here)
sizez
sizez <- rep(sizez, each = 10)
sizez
# You have to include color, linetype, and alpha in the mapping even if youre going to override it anyway.


p <- ggplot(
  data  = plot_df) +
  geom_line(size=1.5, 
            mapping = aes(x = year, y = prop_papers, color = subcategory, group = group, linetype = subcategory, alpha = subcategory)
  ) +
  scale_color_manual(name='Methods',values = colorz) +
  scale_linetype_manual(name='Methods',values = linetypez) +
  scale_alpha_manual(name='Methods',values = alphaz)+
  theme_classic()+xlab("") + ylab("Proportion of articles in a year") +
  theme(axis.text.x = element_text(angle = 15, hjust = 1,size=16),axis.text.y = element_text(size=16),
        legend.position = "none", legend.justification = "right",legend.text=element_text(size=15),
        axis.title.y = element_text(margin = margin(r=10),size=17), 
        axis.title.x = element_text(margin = margin(t=10)),
        legend.key.size = unit(2,"line"),
        legend.title=element_text(size=16))

start_pos <- plot_df %>% group_by(subcategory) %>% summarise(y = last(prop_papers)) %>% mutate(x = 2018)
start_pos$colorz <- colorz
start_pos

start_pos$x_new <- start_pos$x + 0.1
start_pos$y_new <- start_pos$y + c(0,0,0,0,0,0)
p + geom_text(data = start_pos, aes(x =x_new ,y=y_new, label = subcategory), color=colorz,hjust=0,size=5)+
  coord_cartesian(xlim = c(2009, 2018),clip = 'off') + 
  theme(plot.margin = unit(c(1,10,1,1), "lines"))

# ggsave("Manuscript/Images/method_ts1.png", width=12,height=8)