# Load packages & data ----
library(tidyverse)
library(tm)

ReferenceList <- read_csv("BibliometricAnalysis.csv")

# Summary of 'Publication Year' ----
# How recent the citations are: frequency table and numeric summary.
sort(table(ReferenceList$`Publication Year`), decreasing = TRUE)
summary(ReferenceList$`Publication Year`)

# The oldest cited article (published 1900).
ReferenceList[ReferenceList$`Publication Year` == 1900, ]

# Graphical summary of publication years.
# Map the column inside aes() instead of reaching into the data frame with
# `$`, so ggplot evaluates it in the context of the `data` argument.
ggplot(ReferenceList, aes(x = as.numeric(`Publication Year`))) +
  geom_histogram()

# Most frequently cited authors ----
# Standardize author strings so they can be compared: keep only the first
# author (drop everything after the first ',' or ';'), transliterate to
# ASCII, strip punctuation, and lowercase.
FirstAuthor <- ReferenceList$Author %>%
  sub("\\,.*", "", .) %>%
  sub("\\;.*", "", .) %>%
  iconv(to = "ASCII//TRANSLIT") %>%
  removePunctuation() %>%
  tolower()

# Twenty most cited first authors / author organizations.
FirstAuthor %>%
  table() %>%
  sort(decreasing = TRUE) %>%
  head(n = 20)

# Number of authors cited more than 10 times.
# Subtract 5 to manually drop allen, lee, schafer, smith, and wang: each of
# these surnames covers several distinct authors, none of whom individually
# had more than 10 cited publications (checked manually). Webb is kept
# because one of the two Webbs in the data set does exceed 10 publications.
sum(table(FirstAuthor) > 10) - 5

# The most common publications for the most popular authors (the 10 most
# cited authors plus the most cited organization) are tabulated in the next
# section. Titles were sifted through manually so that near-duplicate
# strings were collapsed (e.g. Kane's 'Validating the Interpretations and
# Uses of Test Scores' appears both in title case and ALL CAPS; R treats
# these as different strings even though the difference is only in how the
# metadata was reported).
# Match standardized first authors back to their publications ----
MostCommonTitle <- cbind(ReferenceList, FirstAuthor)

# Most common titles for the 10 most cited authors and the most cited
# organization. One frequency table per author, via a single lapply()
# instead of eleven copy-pasted table() calls.
top_authors <- c(
  "kane", "abedi", "hambleton", "mattern", "cizek", "brennan", "linn",
  "sackett", "cohen", "raymond",
  "american educational research association" # organization as author
)
lapply(
  setNames(top_authors, top_authors),
  function(a) table(MostCommonTitle$Title[FirstAuthor == a])
)

# Journals cited ----
sort(table(ReferenceList$`Item Type`), decreasing = TRUE)
JournalArticles <- ReferenceList[ReferenceList$`Item Type` == "journalArticle", ]

# Ten most cited journals.
head(sort(table(JournalArticles$`Publication Title`), decreasing = TRUE), n = 10)

# Number of distinct journals cited.
# (The original printed the unique() values themselves even though the
# comment asked for a count; length() gives the actual number.)
length(unique(JournalArticles$`Publication Title`))

# Number of journals cited more than 10 times.
sum(table(JournalArticles$`Publication Title`) > 10)

# Journal articles cited ----
# Standardize article titles (as with author names) so near-duplicate
# metadata renderings of the same title compare equal.
compareJournalTitle <- ReferenceList$Title %>%
  subset(ReferenceList$`Item Type` == "journalArticle") %>%
  iconv(to = "ASCII//TRANSLIT") %>%
  removePunctuation() %>%
  tolower()

# The thirty most cited journal articles.
compareJournalTitle %>%
  table() %>%
  sort(decreasing = TRUE) %>%
  head(n = 30)

# Number of distinct cited journal articles.
compareJournalTitleTable <- data.frame(table(compareJournalTitle))
nrow(compareJournalTitleTable)

# Number of articles cited exactly once.
sum(compareJournalTitleTable[, 2] == 1)

### The most cited Book.
# Many iterations of the title were tested; count occurrences of
# 'Educational Measurement' in both the Publication Title and Title fields.
# (This prose was previously an uncommented line — a syntax error.)
sum(ReferenceList$`Publication Title` == "Educational Measurement" |
      ReferenceList$`Publication Title` == "Educational measurement",
    na.rm = TRUE) +
  sum(ReferenceList$`Title` == "Educational Measurement" |
        ReferenceList$`Title` == "Educational measurement",
      na.rm = TRUE)

# Recent developments: webpages and computer programs ----
webPages <- ReferenceList[ReferenceList$`Item Type` == "webpage", ]
table(webPages$`Publication Year`)

# BUG FIX: computerProgram was tabulated before it was created; define the
# subset first, then tabulate its publication years.
computerProgram <- ReferenceList[ReferenceList$`Item Type` == "computerProgram", ]
table(computerProgram$`Publication Year`)

# Reports cited ----
Reports <- ReferenceList[ReferenceList$`Item Type` == "report", ]

# Who wrote the most reports that were cited.
head(sort(table(Reports$Author), decreasing = TRUE), n = 10)

# Standardize report titles for comparison, as with journal articles.
compareReport <- ReferenceList$Title %>%
  subset(ReferenceList$`Item Type` == "report") %>%
  iconv(to = "ASCII//TRANSLIT") %>%
  removePunctuation() %>%
  tolower()

# The thirty most cited reports.
compareReport %>%
  table() %>%
  sort(decreasing = TRUE) %>%
  head(n = 30)

# Number of unique reports cited.
compareReportTable <- data.frame(table(compareReport))
nrow(compareReportTable)

# Number of reports cited just once.
sum(compareReportTable[, 2] == 1)

########
#### Extraneous code (this analysis is not included in the final paper) ####
########

# Summary of publication type by year ----
# Proportion of each year's citations that are of each item type.
# BUG FIX: rowSums(PubTypeByYear) was called before PubTypeByYear was
# defined; the contingency table is now built first.
PubTypeByYear <- table(ReferenceList$`Item Type`, ReferenceList$`Publication Year`)
rowSums(PubTypeByYear)

PropPubTypeByYear <- prop.table(PubTypeByYear, 2)
names <- rownames(PropPubTypeByYear)
rownames(PropPubTypeByYear) <- NULL

# Limited to the largest 6 item-type groups (50+ citations overall).
PropPubTypeByYear2 <- as.data.frame(
  cbind(names, PropPubTypeByYear)
)[rowSums(PubTypeByYear) >= 50, ]

# NOTE: gather() is superseded by pivot_longer(); kept here to preserve the
# original behavior of this extraneous analysis.
PropPubTypeByYear.long <-
  gather(PropPubTypeByYear2, year, frequency, "1900":"2019", factor_key = TRUE)
ggplot(PropPubTypeByYear.long,
       aes(x = as.numeric(as.character(year)), y = as.numeric(frequency))) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~names)

# Very little happens at the start of the distribution — few conclusions to
# be drawn from the earliest years.
table(ReferenceList$`Publication Year`)
x <- ReferenceList[ReferenceList$`Publication Year` == 1968, ]

# Try again using only years with 10+ observations.
PubTypeByYear2 <- PubTypeByYear[, colSums(PubTypeByYear) >= 10]
PropPubTypeByYear3 <- prop.table(PubTypeByYear2, 2)
names <- rownames(PropPubTypeByYear3)
rownames(PropPubTypeByYear3) <- NULL
PropPubTypeByYear4 <- as.data.frame(
  cbind(names, PropPubTypeByYear3)
)[rowSums(PubTypeByYear) >= 50, ]

PropPubTypeByYear.long2 <-
  gather(PropPubTypeByYear4, year, frequency, "1968":"2018", factor_key = TRUE)
ggplot(PropPubTypeByYear.long2,
       aes(x = as.numeric(as.character(year)), y = as.numeric(frequency))) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~names)

# Try again over roughly the last 40 years.
# BUG FIX: the original gathered PubTypeByYear2 — a contingency table, not a
# data frame — which errors in gather(). PropPubTypeByYear4 (the data frame
# built just above from that table) appears to be the intended input; the
# same fix is applied to the outlier-trimmed plot below — TODO confirm
# against the original analysis.
PropPubTypeByYear.long3 <-
  gather(PropPubTypeByYear4, year, frequency, "1980":"2018", factor_key = TRUE)
ggplot(PropPubTypeByYear.long3,
       aes(x = as.numeric(as.character(year)), y = as.numeric(frequency))) +
  geom_point() +
  geom_smooth(se = FALSE) +
  facet_wrap(~names)

# Trim boxplot outliers from the year range.
sort(boxplot(ReferenceList$`Publication Year`)$out)
PropPubTypeByYear.long4 <-
  gather(PropPubTypeByYear4, year, frequency, "1980":"2017", factor_key = TRUE)
ggplot(PropPubTypeByYear.long4,
       aes(x = as.numeric(as.character(year)), y = as.numeric(frequency))) +
  geom_point() +
  geom_smooth(se = FALSE) +
  facet_wrap(~names)