#########################################
# "Making word cloud graphs with ggplot"
#########################################
# This script makes wordcloud graphs using ggwordcloud. In most cases it will work for any
# other input you'd like, with some minor modifications.

### Read in libraries and arguments
# Arguments are as follows:
# **path_data** : path to the cleaned WoS reference csv
# **path.plots** : folder path to save plots
# **path_topics** : folder of topic-model rds outputs
# **path_dict_tools** : folder of auxiliary functions for text mining
# **data** : df with words (term column) and frequencies (beta column) by topic (topic column)
# **values_thresh** : lower limit for word frequencies
# **max_size_area** : size used to plot each wordcloud. Depends on the number of words. If too large,
#                     there can be a lot of empty space between wordclouds; if too small, some words
#                     may not appear in the wordcloud.
# **height_file_par** and **width_file_par** : height and width (in inches) of the file where plots are saved
# **filename** : name of the pdf with the wordclouds
# **eccentricity_par** : horizontal proportion of the wordcloud with respect to the vertical. Default 1 (sphere- or square-like shape).
# **rm_outside_par** : removes text that would not fit. TRUE avoids overlap, but try not to lose text. Default FALSE.

# libraries
library(tidyverse)
library(topicmodels)
library(ggwordcloud)
library(tidytext)  # to use unnest_tokens and tidy
library(textstem)  # to use lemmatize_words
library(tm)        # to use stopwords
library(stringr)   # to use str_detect

# paths
path_data <- "./Data/Rocio-temporal/cleaned_papers_all_years_simple.csv"
path.plots <- "./Rocio/Plots/"
path_topics <- "./Data/Topics/"
path_dict_tools <- "./Data/Dictionary/AuxiliaryTextMining/"

# calling auxiliary functions
source("./R/Americanizing.R")
source("./R/cleaning_words_abstract.R")

# arguments
height_file_par <- 30
width_file_par <- 30
max_size_area <- 30
values_thresh <- 0.003
rm_outside_par <- FALSE
eccentricity_par <- 1
filename <- paste0("wordcloud_II.pdf")

### Now read in the data and prepare it
# main dataset
papers <- read.csv(file = paste0(path_data), stringsAsFactors = FALSE)
data_decade <- papers %>% filter(pubyear > 2008 & pubyear < 2019)
data_decade_summ <- data_decade %>% select(doi, pubyear)

# LOAD LDA
N_topics <- 15
alpha_par <- 1  # NULL
method_par <- "VEM"
modk <- readRDS(file = paste0(path_topics, "NewBestTopicModel", N_topics, "_alpha_", alpha_par,
                              "_method_", method_par, "_filtered_II.rds"))

############ NUMBER OF PAPERS RELATED TO EACH TOPIC #################
# This "gamma" part serves only to get the prevalence of papers and rank them,
# so that we can use this ranking to order the wordclouds.
papers_gamma <- tidytext::tidy(modk, matrix = "gamma")
head(papers_gamma)
# Each of these values is an estimated proportion of the words in that document that are generated
# from that topic. For example, the model estimates that only about 12.3% of the words in
# document 2 were generated from topic 1.
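# Optional sanity check (a minimal sketch added here, not part of the original workflow):
# since each gamma is a per-document proportion, the gammas of one document should sum
# to roughly 1 across the N_topics topics.
gamma_check <- papers_gamma %>%
  group_by(document) %>%
  dplyr::summarise(total_gamma = sum(gamma))
summary(gamma_check$total_gamma)  # expect values very close to 1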
# strength of the total connection of topics to papers
gamma_topic <- papers_gamma %>%
  group_by(topic) %>%
  dplyr::summarise(gamma = sum(gamma)) %>%
  arrange(desc(gamma))
gamma_topic$percentage <- gamma_topic$gamma / length(unique(papers_gamma$document)) * 100
new_order <- gamma_topic %>% select(topic)
new_label_for_order <- 1:N_topics
# relabel topics so that topic 1 is the most prevalent, topic 2 the second most prevalent, etc.
papers_gamma$topic_lab <- plyr::mapvalues(papers_gamma$topic, from = t(as.data.frame(new_order)), to = new_label_for_order)
papers_gamma_onlylab <- select(papers_gamma, -topic)  # keep only the reordered labels
papers_gamma_short <- spread(papers_gamma_onlylab, key = topic_lab, value = gamma)

##############################################################
# Create and save the word clouds of abstracts of each topic #
##############################################################
## Step 1: select papers that have gamma > 0.75 for a topic
## Step 2: join with abstracts
## Step 3: write a function that, for each set of abstracts (topic), processes words and computes frequencies.
##         Note that raw word counts in a topic scale with the number of abstracts assigned to that topic,
##         whereas the topic wordclouds are based on the probability of a word occurring in an abstract given the topic.
##         So we use the relative frequency: the number of times each unique word occurs in the topic
##         divided by the total number of words in the topic.
## Step 4: run the function on each topic, get a df from each one, and put them together
## Step 5: draw wordclouds
##
## Step 1 and 2
papers_summ <- cbind.data.frame(doi = as.character(papers_gamma_short$document),
                                topic_max = apply(papers_gamma_short[, 2:ncol(papers_gamma_short)], 1, which.max),
                                gamma_max = apply(papers_gamma_short[, 2:ncol(papers_gamma_short)], 1, max))
papers_summ$top_75 <- NA
ind_top_75 <- which(papers_summ$gamma_max > 0.75)
papers_summ$top_75[ind_top_75] <- papers_summ$topic_max[ind_top_75]
papers_75 <- papers_summ[!is.na(papers_summ$top_75), ]
papers_summ_abstract_75 <- papers_75 %>%
  left_join(data_decade, by = "doi") %>%
  select(doi, top_75, abstract)

## Step 3
## get the set of abstracts for one topic and compute word frequencies
word_freq_topic <- function(data, topic_num, path_dict_tools){
  # data : the whole data.frame
  # topic_num : the topic we focus on
  # path_dict_tools : directory with auxiliary data files
  papers_topic <- data %>% filter(top_75 == topic_num)
  # getting a list of clean words used by paper
  df_words <- cleaning_words_abstract(papers_topic, path_dict_tools)
  # now compute frequencies!
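  # Relative frequency = count of each lemma divided by the total number of word tokens in the
  # topic's abstracts (nrow(df_words)), so topics with many abstracts stay comparable to topics
  # with few. For instance (illustrative numbers only), a lemma occurring 30 times among 10,000
  # tokens gets prop = 0.003, right at the values_thresh cutoff used below.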
  word_freq <- df_words %>% dplyr::count(word_am_lem)
  word_freq <- word_freq %>% mutate(prop = n / nrow(df_words))
  word_freq$topic <- rep(topic_num, nrow(word_freq))
  return(word_freq)
}

# Step 4
# compute the relative frequencies by topic and put them all in one large data frame
freq_word_top_df <- do.call(rbind.data.frame,
                            lapply(1:N_topics, function(x) word_freq_topic(papers_summ_abstract_75, x, path_dict_tools)))

# Step 5
topic_sample <- freq_word_top_df %>% filter(prop > values_thresh)  #%>% select(term

plot_w <- ggplot(topic_sample, aes(label = word_am_lem, size = prop, color = prop)) +
  geom_text_wordcloud_area(rm_outside = rm_outside_par, eccentricity = eccentricity_par, grid_margin = 0) +  #area_corr_power = 1,
  scale_size_area(max_size = max_size_area) +
  theme_bw() +
  scale_colour_gradientn(colors = c('#253494', 'palegoldenrod', 'orangered'), values = c(0, .25, 1)) +
  facet_wrap(~topic, scales = "free", shrink = F) +
  theme(strip.text = element_text(size = 45),
        plot.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
        strip.background = element_rect(fill = 'white'))

ggsave(plot = plot_w, filename = filename, height = height_file_par, width = width_file_par, units = "in")
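# Optional follow-up (a minimal sketch, not part of the original workflow): report how many words
# survive the frequency threshold in each topic. This can help tune values_thresh and max_size_area
# before re-plotting, since very uneven word counts make the facets look unbalanced.
words_per_topic <- topic_sample %>%
  dplyr::count(topic, name = "n_words") %>%
  arrange(desc(n_words))
print(words_per_topic)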