###Loading in required packages and data
library(tidyverse)
#Keyword-by-study table: the first three columns identify each row, every remaining column is one study
KeywordList <- read_csv("KeywordAnalysis.csv")
KeywordList[is.na(KeywordList)] <- 0 #replacing NAs with 0

###Variable set up
#IDs has to exist before nStudies, because nStudies drops the ID columns to count the studies
IDs <- c(1,2,3) #The first three columns are identification columns
nStudies <- ncol(KeywordList[,-IDs]) #total number of studies in sample (one column per study)
#Row positions of the two extra, non-keyword variables (the last two rows), located by their Keyword label:
#funding is whether or not the study was funded
#primary author is whether the primary author was affiliated with an academic institution
fundingAndAuthor <- c(which(KeywordList$Keyword == 'Funding'), which(KeywordList$Keyword == 'Primary Author'))


###
###Overall
###

###Overall summaries
#Keyword count per study, leaving out the funding / primary-author rows
KeywordList[-fundingAndAuthor, -IDs] %>% colSums()
#Each row label next to the number of times it occurs across all study columns
cbind(KeywordList[["Keyword"]], rowSums(KeywordList[, -IDs]))

###Our keyword analysis was divided into three categories: sample characteristics, assessment characteristics, and analytic characteristics 
##Analysis of Sample characteristic
#Rows whose first identification column is 'Sample'.
#[[1]] extracts that column as a plain vector, so the row filter is an ordinary logical index.
Sample <- KeywordList[KeywordList[[1]] == 'Sample', ]
#tibble() replaces the deprecated data_frame(); the resulting table is the same
SampleRowSums <- tibble(keyword = Sample$Keyword, rowsums = rowSums(Sample[, -IDs])) #the number of studies with each sample keyword
sum(SampleRowSums$rowsums) #the total number of sample keywords across all studies
sampleColSums <- colSums(Sample[, -IDs]) #the number of sample keywords in each study
totalColWithSampleKeyword <- sum(sampleColSums > 0) #the number of studies with at least one sample keyword
propTotalColWithSampleKeyword <- totalColWithSampleKeyword / nStudies #the proportion of studies with a sample keyword



#Analysis of Assessment
#Rows whose first identification column is 'Assessment' ([[1]] gives a plain vector to filter on)
Assessment <- KeywordList[KeywordList[[1]] == 'Assessment', ]
#tibble() replaces the deprecated data_frame(); the resulting table is the same
assessmentRowSums <- tibble(keyword = Assessment$Keyword, rowsums = rowSums(Assessment[, -IDs])) #the number of studies with each assessment keyword
AssessmentColSums <- colSums(Assessment[, -IDs]) #the number of assessment keywords in each study
totalColWithAssessmentKeyword <- sum(AssessmentColSums > 0) #the number of studies with at least one assessment keyword
propTotalColWithAssessmentKeyword <- totalColWithAssessmentKeyword / nStudies #the proportion of studies with an assessment keyword

##The assessment category was also broken into 3 subcategories: purpose, type, and test subject
#which() drops any NA comparisons, so rows without a matching Subtheme are simply skipped
sum(assessmentRowSums$rowsums[which(Assessment$Subtheme == 'Purpose')]) #the total number of 'purpose' keywords cited across all studies
sum(assessmentRowSums$rowsums[which(Assessment$Subtheme == 'Type')]) #the total number of 'type' keywords cited across all studies
sum(assessmentRowSums$rowsums[which(Assessment$Subtheme == 'Test Subject')]) #the total number of 'test subject' keywords cited across all studies



#Analysis of Analytic
#Rows whose first identification column is 'Analytic' ([[1]] gives a plain vector to filter on)
Analytic <- KeywordList[KeywordList[[1]] == 'Analytic', ]
#tibble() replaces the deprecated data_frame(); the resulting table is the same
AnalyticRowSums <- tibble(keyword = Analytic$Keyword, rowsums = rowSums(Analytic[, -IDs])) #the number of studies with each analytic keyword
AnalyticColSums <- colSums(Analytic[, -IDs]) #the number of analytic keywords in each study
totalColWithAnalyticKeyword <- sum(AnalyticColSums > 0) #the number of studies with at least one analytic keyword
propTotalColWithAnalyticKeyword <- totalColWithAnalyticKeyword / nStudies #the proportion of studies with an analytic keyword



###
#Analysis of Funded
###
#The 'Funding' row is compared against 1 across every column, ID columns included.
#NOTE(review): this relies on no identification column holding the value 1 in that row - confirm against the data.
nFunded <- sum(KeywordList[KeywordList$Keyword == 'Funding', ] == 1) #total number of studies that were funded
as.numeric(KeywordList[KeywordList$Keyword == 'Funding', ] == 1) #0/1 funded flag for each column, printed for inspection
#creating a data frame with just the funded studies (ID columns plus every column flagged as funded)
funded <- KeywordList[, c(IDs, which(KeywordList[KeywordList$Keyword == 'Funding', ] == 1))]

colSums(funded[-fundingAndAuthor, -IDs]) #the number of keywords in each funded study
#tibble() replaces the deprecated data_frame(); the resulting table is the same
rowSumsFunded <- tibble(keyword = funded$Keyword, rowsums = rowSums(funded[, -IDs])) #the number of each keyword in funded studies
rowSumsFunded[rowSumsFunded$rowsums != 0, ] #every keyword that was in a funded study at least once (numeric comparison, not against the string '0')

#Funded vs non-funded keyword counts. tibble() evaluates its columns in order, so later columns reuse earlier ones.
#propDif is a ratio: (share of funded studies with the keyword) / (share of non-funded studies with the keyword).
#It is Inf or NaN for keywords that never occur in a non-funded study.
comparison <- tibble(
  keyword = funded$Keyword,
  rowsumsF = rowSums(funded[, -IDs]),
  rowsumsNF = rowSums(KeywordList[, -IDs]) - rowsumsF,
  propDif = (rowsumsF / nFunded) / (rowsumsNF / (nStudies - nFunded))
)


###Analysis of Demographics
#This section is exactly the same as the previous demographic section, except with funded studies only
#Sample-keyword rows restricted to the funded studies
sample <- funded[funded[[1]] == 'Sample', ]
#Built from the funded subset `sample`; the previous code used the all-studies table `Sample` here by mistake.
#tibble() replaces the deprecated data_frame().
SampleRowSums <- tibble(keyword = sample$Keyword, rowsums = rowSums(sample[, -IDs])) #the number of funded studies with each demographic keyword
sampleColSums <- colSums(sample[, -IDs]) #the number of demographic keywords in each funded study
totalColWithSamplefunded <- sum(sampleColSums > 0) #the number of funded studies with at least one demographic keyword
propTotalColWithSamplefunded <- totalColWithSamplefunded / nFunded #the proportion of funded studies with a demographic keyword

#test: ho: pi_dem_nonfunded = pi_dem_funded
#Is there a difference in the proportion of studies with a demographic keyword in funded vs non funded studies?
#No there is not
#First group = non-funded studies (overall count minus funded count), second group = funded studies
prop.test(c(totalColWithSampleKeyword - totalColWithSamplefunded, totalColWithSamplefunded),
          c(nStudies - nFunded, nFunded))

#Analysis of Assessment (funded studies only)
#Assessment-keyword rows restricted to the funded studies.
#This overwrites the all-studies Assessment, assessmentRowSums and AssessmentColSums objects.
Assessment <- funded[funded[[1]] == 'Assessment', ]
#tibble() replaces the deprecated data_frame(); the resulting table is the same
assessmentRowSums <- tibble(keyword = Assessment$Keyword, rowsums = rowSums(Assessment[, -IDs])) #the number of funded studies with each assessment keyword
AssessmentColSums <- colSums(Assessment[, -IDs]) #the number of assessment keywords in each funded study
totalColWithAssessmentfunded <- sum(AssessmentColSums > 0) #the number of funded studies with at least one assessment keyword
propTotalColWithAssessmentfunded <- totalColWithAssessmentfunded / nFunded #the proportion of funded studies with an assessment keyword

#test: ho: pi_assess_nonfunded = pi_assess_funded
#Is there a difference in the proportion of studies with an assessment keyword in funded vs non funded studies?
#No there is not
#First group = non-funded studies (overall count minus funded count), second group = funded studies
prop.test(c(totalColWithAssessmentKeyword - totalColWithAssessmentfunded, totalColWithAssessmentfunded),
          c(nStudies - nFunded, nFunded),
          alternative = "two.sided")


#Analysis of Analytic
#Analytic-keyword rows restricted to the funded studies.
#This overwrites the all-studies AnalyticRowSums and AnalyticColSums objects.
AnalyticF <- funded[funded[[1]] == 'Analytic', ]
#tibble() replaces the deprecated data_frame(); the resulting table is the same
AnalyticRowSums <- tibble(keyword = AnalyticF$Keyword, rowsums = rowSums(AnalyticF[, -IDs])) #the number of funded studies with each analytic keyword
AnalyticColSums <- colSums(AnalyticF[, -IDs]) #the number of analytic keywords in each funded study
totalColWithAnalyticfunded <- sum(AnalyticColSums > 0) #the number of funded studies with at least one analytic keyword
propTotalColWithAnalyticfunded <- totalColWithAnalyticfunded / nFunded #the proportion of funded studies with an analytic keyword


#test: ho: pi_anal_nonfunded = pi_anal_funded
#Is there a difference in the proportion of studies with an analytic keyword in funded vs non funded studies?
#No there is not
#First group = non-funded studies (overall count minus funded count), second group = funded studies
prop.test(c(totalColWithAnalyticKeyword - totalColWithAnalyticfunded, totalColWithAnalyticfunded),
          c(nStudies - nFunded, nFunded),
          alternative = "two.sided")


###This code is extraneous and did not make it into the final analysis. It is exactly the same as the previous section, except
###only includes studies where the primary author was affiliated with an academic institution
###
#Analysis of College Affiliated
###

#The 'Primary Author' row is tested on the study columns only (-IDs).
#Testing the whole row with != 0 also counted the text identification columns as affiliated studies,
#which inflated nCollegeAffiliated.
nCollegeAffiliated <- sum(KeywordList[KeywordList$Keyword == 'Primary Author', -IDs] != 0) #number of studies whose primary author was college affiliated
as.numeric(KeywordList[KeywordList$Keyword == 'Primary Author', -IDs] != 0) #0/1 affiliation flag for each study column, printed for inspection
#creating a data frame with just the college-affiliated studies.
#The ID columns are kept explicitly, as in the funded section. which() indexes within the study columns,
#so setdiff(...) maps those positions back to column numbers of KeywordList.
collegeAffiliated <- KeywordList[, c(
  IDs,
  setdiff(seq_len(ncol(KeywordList)), IDs)[
    which(KeywordList[KeywordList$Keyword == 'Primary Author', -IDs] != 0)
  ]
)]

colSums(collegeAffiliated[-fundingAndAuthor, -IDs]) #the number of keywords in each college-affiliated study
#tibble() replaces the deprecated data_frame()
rowSumscollegeAffiliated <- tibble(keyword = collegeAffiliated$Keyword, rowsums = rowSums(collegeAffiliated[, -IDs])) #the number of each keyword in college-affiliated studies
rowSumscollegeAffiliated[rowSumscollegeAffiliated$rowsums != 0, ] #every keyword that was in a college-affiliated study at least once

#Affiliated vs non-affiliated keyword counts. This overwrites the funded `comparison` table.
#tibble() evaluates its columns in order, so later columns reuse earlier ones.
#propDif is a ratio of the two groups' keyword shares; it is Inf or NaN when a keyword never occurs in a non-affiliated study.
comparison <- tibble(
  keyword = collegeAffiliated$Keyword,
  rowsumsCA = rowSums(collegeAffiliated[, -IDs]),
  rowsumsNCA = rowSums(KeywordList[, -IDs]) - rowsumsCA,
  propDif = (rowsumsCA / nCollegeAffiliated) / (rowsumsNCA / (nStudies - nCollegeAffiliated))
)

###Analysis of Demographics
#Sample-keyword rows restricted to the college-affiliated studies
sample <- collegeAffiliated[collegeAffiliated[[1]] == 'Sample', ]
#Built from the affiliated subset `sample`; the previous code used the all-studies table `Sample` here by mistake.
#tibble() replaces the deprecated data_frame().
SampleRowSums <- tibble(keyword = sample$Keyword, rowsums = rowSums(sample[, -IDs])) #the number of affiliated studies with each demographic keyword
sampleColSums <- colSums(sample[, -IDs]) #the number of demographic keywords in each affiliated study
totalColWithSamplecollegeAffiliated <- sum(sampleColSums > 0) #the number of affiliated studies with at least one demographic keyword
propTotalColWithSamplecollegeAffiliated <- totalColWithSamplecollegeAffiliated / nCollegeAffiliated #the proportion of affiliated studies with a demographic keyword


#test: ho: pi_dem_nonColAf = pi_dem_colAf
#First group = non-affiliated studies (overall count minus affiliated count), second group = affiliated studies
prop.test(c(totalColWithSampleKeyword - totalColWithSamplecollegeAffiliated, totalColWithSamplecollegeAffiliated),
          c(nStudies - nCollegeAffiliated, nCollegeAffiliated))

#Analysis of Assessment (college-affiliated studies only)
#Assessment-keyword rows restricted to the college-affiliated studies.
#This overwrites the earlier Assessment, assessmentRowSums and AssessmentColSums objects.
Assessment <- collegeAffiliated[collegeAffiliated[[1]] == 'Assessment', ]
#tibble() replaces the deprecated data_frame(); the resulting table is the same
assessmentRowSums <- tibble(keyword = Assessment$Keyword, rowsums = rowSums(Assessment[, -IDs])) #the number of affiliated studies with each assessment keyword
AssessmentColSums <- colSums(Assessment[, -IDs]) #the number of assessment keywords in each affiliated study
totalColWithAssessmentcollegeAffiliated <- sum(AssessmentColSums > 0) #the number of affiliated studies with at least one assessment keyword
propTotalColWithAssessmentcollegeAffiliated <- totalColWithAssessmentcollegeAffiliated / nCollegeAffiliated #the proportion of affiliated studies with an assessment keyword

#test: ho: pi_assess_nonColAf = pi_assess_colAf
#First group = non-affiliated studies (overall count minus affiliated count), second group = affiliated studies
prop.test(c(totalColWithAssessmentKeyword - totalColWithAssessmentcollegeAffiliated, totalColWithAssessmentcollegeAffiliated),
          c(nStudies - nCollegeAffiliated, nCollegeAffiliated))


#Analysis of Analytic
#Analytic-keyword rows restricted to the college-affiliated studies.
#This overwrites the all-studies Analytic table and the earlier AnalyticRowSums / AnalyticColSums objects.
Analytic <- collegeAffiliated[collegeAffiliated[[1]] == 'Analytic', ]
#tibble() replaces the deprecated data_frame(); the resulting table is the same
AnalyticRowSums <- tibble(keyword = Analytic$Keyword, rowsums = rowSums(Analytic[, -IDs])) #the number of affiliated studies with each analytic keyword
AnalyticColSums <- colSums(Analytic[, -IDs]) #the number of analytic keywords in each affiliated study
totalColWithAnalyticcollegeAffiliated <- sum(AnalyticColSums > 0) #the number of affiliated studies with at least one analytic keyword
propTotalColWithAnalyticcollegeAffiliated <- totalColWithAnalyticcollegeAffiliated / nCollegeAffiliated #the proportion of affiliated studies with an analytic keyword

#test: ho: pi_anal_nonColAf = pi_anal_colAf
#First group = non-affiliated studies (overall count minus affiliated count), second group = affiliated studies
prop.test(c(totalColWithAnalyticKeyword - totalColWithAnalyticcollegeAffiliated, totalColWithAnalyticcollegeAffiliated),
          c(nStudies - nCollegeAffiliated, nCollegeAffiliated),
          alternative = "two.sided")