# Load packages & data ----
library(tidyverse)
library(tm)

ReferenceList <- read_csv("BibliometricAnalysis.csv")

# Summary of 'Publication Year' ----
# How recent the citations are: frequency table and numeric summary.
sort(table(ReferenceList$`Publication Year`), decreasing = TRUE)
summary(ReferenceList$`Publication Year`)

# The oldest cited article (published 1900).
ReferenceList[ReferenceList$`Publication Year` == 1900, ]

# Graphical summary of publication years.
# Map the column inside aes() instead of reaching into the data frame with
# `$`, so ggplot evaluates it in the context of the `data` argument.
ggplot(ReferenceList, aes(x = as.numeric(`Publication Year`))) +
  geom_histogram()

# Most frequently cited authors ----
# Standardize author strings so they can be compared: keep only the first
# author (drop everything after the first ',' or ';'), transliterate to
# ASCII, strip punctuation, and lowercase.
FirstAuthor <- ReferenceList$Author %>%
  sub("\\,.*", "", .) %>%
  sub("\\;.*", "", .) %>%
  iconv(to = "ASCII//TRANSLIT") %>%
  removePunctuation() %>%
  tolower()

# Twenty most cited first authors / author organizations.
FirstAuthor %>%
  table() %>%
  sort(decreasing = TRUE) %>%
  head(n = 20)

# Number of authors cited more than 10 times.
# Subtract 5 to manually drop allen, lee, schafer, smith, and wang: each of
# these surnames covers several distinct authors, none of whom individually
# had more than 10 cited publications (checked manually). Webb is kept
# because one of the two Webbs in the data set does exceed 10 publications.
sum(table(FirstAuthor) > 10) - 5

# The most common publications for the most popular authors (the 10 most
# cited authors plus the most cited organization) are tabulated in the next
# section. Titles were sifted through manually so that near-duplicate
# strings were collapsed (e.g. Kane's 'Validating the Interpretations and
# Uses of Test Scores' appears both in title case and ALL CAPS; R treats
# these as different strings even though the difference is only in how the
# metadata was reported).
# Match standardized first authors back to their publications ----
MostCommonTitle <- cbind(ReferenceList, FirstAuthor)

# Most common titles for the 10 most cited authors and the most cited
# organization. One frequency table per author, via a single lapply()
# instead of eleven copy-pasted table() calls.
top_authors <- c(
  "kane", "abedi", "hambleton", "mattern", "cizek", "brennan", "linn",
  "sackett", "cohen", "raymond",
  "american educational research association" # organization as author
)
lapply(
  setNames(top_authors, top_authors),
  function(a) table(MostCommonTitle$Title[FirstAuthor == a])
)

# Journals cited ----
sort(table(ReferenceList$`Item Type`), decreasing = TRUE)
JournalArticles <- ReferenceList[ReferenceList$`Item Type` == "journalArticle", ]

# Ten most cited journals.
head(sort(table(JournalArticles$`Publication Title`), decreasing = TRUE), n = 10)

# Number of distinct journals cited.
# (The original printed the unique() values themselves even though the
# comment asked for a count; length() gives the actual number.)
length(unique(JournalArticles$`Publication Title`))

# Number of journals cited more than 10 times.
sum(table(JournalArticles$`Publication Title`) > 10)

# Journal articles cited ----
# Standardize article titles (as with author names) so near-duplicate
# metadata renderings of the same title compare equal.
compareJournalTitle <- ReferenceList$Title %>%
  subset(ReferenceList$`Item Type` == "journalArticle") %>%
  iconv(to = "ASCII//TRANSLIT") %>%
  removePunctuation() %>%
  tolower()

# The thirty most cited journal articles.
compareJournalTitle %>%
  table() %>%
  sort(decreasing = TRUE) %>%
  head(n = 30)

# Number of distinct cited journal articles.
compareJournalTitleTable <- data.frame(table(compareJournalTitle))
nrow(compareJournalTitleTable)

# Number of articles cited exactly once.
sum(compareJournalTitleTable[, 2] == 1)

### The most cited Book.
# Many iterations of the title were tested; count occurrences of
# 'Educational Measurement' in both the Publication Title and Title fields.
# (This prose was previously an uncommented line — a syntax error.)
sum(ReferenceList$`Publication Title` == "Educational Measurement" |
      ReferenceList$`Publication Title` == "Educational measurement",
    na.rm = TRUE) +
  sum(ReferenceList$`Title` == "Educational Measurement" |
        ReferenceList$`Title` == "Educational measurement",
      na.rm = TRUE)

# Recent developments: webpages and computer programs ----
webPages <- ReferenceList[ReferenceList$`Item Type` == "webpage", ]
table(webPages$`Publication Year`)

# BUG FIX: computerProgram was tabulated before it was created; define the
# subset first, then tabulate its publication years.
computerProgram <- ReferenceList[ReferenceList$`Item Type` == "computerProgram", ]
table(computerProgram$`Publication Year`)

# Reports cited ----
Reports <- ReferenceList[ReferenceList$`Item Type` == "report", ]

# Who wrote the most reports that were cited.
head(sort(table(Reports$Author), decreasing = TRUE), n = 10)

# Standardize report titles for comparison, as with journal articles.
compareReport <- ReferenceList$Title %>%
  subset(ReferenceList$`Item Type` == "report") %>%
  iconv(to = "ASCII//TRANSLIT") %>%
  removePunctuation() %>%
  tolower()

# The thirty most cited reports.
compareReport %>%
  table() %>%
  sort(decreasing = TRUE) %>%
  head(n = 30)

# Number of unique reports cited.
compareReportTable <- data.frame(table(compareReport))
nrow(compareReportTable)

# Number of reports cited just once.
sum(compareReportTable[, 2] == 1)

########
#### Extraneous code (this analysis is not included in the final paper) ####
########

# Summary of publication type by year ----
# Proportion of each year's citations that are of each item type.
# BUG FIX: rowSums(PubTypeByYear) was called before PubTypeByYear was
# defined; the contingency table is now built first.
PubTypeByYear <- table(ReferenceList$`Item Type`, ReferenceList$`Publication Year`)
rowSums(PubTypeByYear)

PropPubTypeByYear <- prop.table(PubTypeByYear, 2)
names <- rownames(PropPubTypeByYear)
rownames(PropPubTypeByYear) <- NULL

# Limited to the largest 6 item-type groups (50+ citations overall).
PropPubTypeByYear2 <- as.data.frame(
  cbind(names, PropPubTypeByYear)
)[rowSums(PubTypeByYear) >= 50, ]

# NOTE: gather() is superseded by pivot_longer(); kept here to preserve the
# original behavior of this extraneous analysis.
PropPubTypeByYear.long <-
  gather(PropPubTypeByYear2, year, frequency, "1900":"2019", factor_key = TRUE)
ggplot(PropPubTypeByYear.long,
       aes(x = as.numeric(as.character(year)), y = as.numeric(frequency))) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~names)

# Very little happens at the start of the distribution — few conclusions to
# be drawn from the earliest years.
table(ReferenceList$`Publication Year`)
x <- ReferenceList[ReferenceList$`Publication Year` == 1968, ]

# Try again using only years with 10+ observations.
PubTypeByYear2 <- PubTypeByYear[, colSums(PubTypeByYear) >= 10]
PropPubTypeByYear3 <- prop.table(PubTypeByYear2, 2)
names <- rownames(PropPubTypeByYear3)
rownames(PropPubTypeByYear3) <- NULL
PropPubTypeByYear4 <- as.data.frame(
  cbind(names, PropPubTypeByYear3)
)[rowSums(PubTypeByYear) >= 50, ]

PropPubTypeByYear.long2 <-
  gather(PropPubTypeByYear4, year, frequency, "1968":"2018", factor_key = TRUE)
ggplot(PropPubTypeByYear.long2,
       aes(x = as.numeric(as.character(year)), y = as.numeric(frequency))) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~names)

# Try again over roughly the last 40 years.
# BUG FIX: the original gathered PubTypeByYear2 — a contingency table, not a
# data frame — which errors in gather(). PropPubTypeByYear4 (the data frame
# built just above from that table) appears to be the intended input; the
# same fix is applied to the outlier-trimmed plot below — TODO confirm
# against the original analysis.
PropPubTypeByYear.long3 <-
  gather(PropPubTypeByYear4, year, frequency, "1980":"2018", factor_key = TRUE)
ggplot(PropPubTypeByYear.long3,
       aes(x = as.numeric(as.character(year)), y = as.numeric(frequency))) +
  geom_point() +
  geom_smooth(se = FALSE) +
  facet_wrap(~names)

# Trim boxplot outliers from the year range.
sort(boxplot(ReferenceList$`Publication Year`)$out)
PropPubTypeByYear.long4 <-
  gather(PropPubTypeByYear4, year, frequency, "1980":"2017", factor_key = TRUE)
ggplot(PropPubTypeByYear.long4,
       aes(x = as.numeric(as.character(year)), y = as.numeric(frequency))) +
  geom_point() +
  geom_smooth(se = FALSE) +
  facet_wrap(~names)