# PATHS
path_data <- "./Data/ProcessedQueries/References/"           # folder with the summary information of the papers returned by the Web of Knowledge (WoK) queries, after processing
                                                              # that file is called "cleaned_papers_all_years_simple.csv"
path_dict_tools <- "./Data/Dictionary/AuxiliaryTextMining/"   # folder with auxiliary files for text mining

###########
# LIBRARIES
library(tidyverse)
library(tidytext)
library(stringr)
library(tm)        # stopword lists (used to remove Spanish-language words)
# if (!require("pacman")) install.packages("pacman")
# pacman::p_load_gh("trinker/textstem")
library(textstem)  # lemmatizing
# Note: installing textstem failed on statnet.common under R 3.4, so statnet.common 4.1.4 was used instead.

# auxiliary functions
source("./R/Americanizing.R")
source("./R/cleaning_words_abstract.R")

############

papers <- read.csv(paste0(path_data, "cleaned_papers_all_years_simple.csv"),
                   stringsAsFactors = FALSE)

# keep papers published between 2009 and 2018
datosOriginales <- papers %>%
  filter(pubyear > 2008 & pubyear < 2019)

# DATA OF INTEREST --------------------------------------------------------

# 1. Keep the columns of interest: the DOI (used as the paper identifier) and the abstract
test <- datosOriginales %>%
  mutate(paper = doi) %>%
  select(paper, abstract)

# 2. Clean the abstracts with the sourced helper functions
df_words <- cleaning_words_abstract(test, path_dict_tools)

# 3. Count how often each word appears, so the very infrequent ones
#    (those appearing the minimum number of times, typically once) can be filtered out
word_freq <- df_words %>%
  count(word_am_lem) %>%
  mutate(prop = n / sum(n))

# words whose proportion equals the minimum (within a 1e-10 tolerance), i.e. the rarest words
terms_extract <- data.frame(
  word_am_lem = word_freq$word_am_lem[which(word_freq$prop < round(min(word_freq$prop), 10) + 1e-10)],
  stringsAsFactors = FALSE
)

df_words <- df_words %>%
  anti_join(terms_extract, by = "word_am_lem")

# FINALLY
# TIDY DATA
datosFinales <- df_words %>%
  select(paper, word_am_lem)
names(datosFinales) <- c("paper", "word")

# Save the tidy data in the project
write.csv(x = datosFinales,
          file = paste0(path_data, "cleaned_filtered_TidyData_TopicModeling.csv"),
          row.names = FALSE)
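
# --------------------------------------------------------------------------
# ILLUSTRATION ONLY: the helper cleaning_words_abstract() is defined in
# ./R/cleaning_words_abstract.R and is not shown here. The sketch below is a
# hypothetical reconstruction of the kind of cleaning such a helper performs
# (tokenize the abstracts, drop numeric tokens and stopwords, lemmatize),
# inferred only from the output column name word_am_lem and the libraries
# loaded above. The real helper also americanizes spellings (Americanizing.R)
# and uses the auxiliary dictionaries in path_dict_tools; both are omitted
# here, and this sketch function is never called.
cleaning_words_abstract_sketch <- function(df) {
  df %>%
    unnest_tokens(word, abstract) %>%                 # one row per (paper, token)
    filter(!str_detect(word, "[0-9]")) %>%            # drop tokens containing digits
    anti_join(tidytext::stop_words, by = "word") %>%  # remove English stopwords
    filter(!word %in% tm::stopwords("spanish")) %>%   # remove Spanish stopwords
    mutate(word_am_lem = lemmatize_words(word)) %>%   # lemmatize (americanization omitted)
    select(paper, word_am_lem)
}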
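
# --------------------------------------------------------------------------
# ILLUSTRATION ONLY: the file written above is, per its name
# (cleaned_filtered_TidyData_TopicModeling.csv), the input to a topic model,
# but that step is not part of this script. A minimal sketch of one common
# route, assuming the topicmodels package and an arbitrary choice of k; the
# modelling code and number of topics actually used in the project may differ.
# library(topicmodels)
# dtm <- datosFinales %>%
#   count(paper, word) %>%                                  # term counts per paper
#   cast_dtm(document = paper, term = word, value = n)      # tidy table -> document-term matrix
# lda_fit <- LDA(dtm, k = 10, control = list(seed = 1234))  # k = 10 is purely illustrative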