###DATA MANIPULATION CODE###
###Project: AS Code and data###
###Working directory: Box###

###Code to examine potential participants
#Counting number of counties represented 

data<-read.csv("My Documents/Data_with_identifiers_alltypes CSV.csv")

#How many represented from each state?
states<-data$CUST_STATE
unique.state<-unique(states, incomparables = FALSE)
length(unique.state)
counts_per_state<-aggregate(data.frame(count = states), list(value = states), length) #counts the number of entries per unique value in the dataset for state
counts_per_state

#How many represented from each county?

counties<-data$CNTYNAME
unique.county<-unique(counties, incomparables = FALSE)
length(unique.county)
counts_per_county<-aggregate(data.frame(count = counties), list(value = counties), length) #counts the number of entries per unique value in the dataset for counties


address<-unique(data$CUST_STREET, incomparables = FALSE)
length(address)

dups<-duplicated(address)
length(dups)

#The following code was used to generate the "linking document" found in the Box Drive. 
#library(ids)
#data$IDs<-random_id(n=4002, bytes=3)
#library(openxlsx)
#write.xlsx(data, "C:/Users/thom4412/Box/My Documents/Linking document.xlsx") #writes the current dataframe into XLSX file, saved on Box as "Linking Document"

###### Tracking survey response rate #####################################
library(readxl)
library(tidyverse)
mailing<-read_excel("C:/Users/thom4412/Desktop/MAISRC RFP 2018/Activity 2 Angler Survey/AS Code and data/01282021_Mailing Tracking Document.xlsx")
mailing %>% group_by(`Completed Survey`)%>%
  summarize(n=n())

###Examining response patterns####
linking<-read.csv("My Documents/Linking document2 CSV.csv")
merged<-merge(linking,mailing, by="IDs")

#create admin region variable
merged$CNTYNAME<-trimws(merged$CNTYNAME) #remove extra white space from cnty names

ref <- data.frame(CNTYNAME = c("Koochiching","Lake","Itasca", "Saint Louis", "Cook", "Carlton", "Pine", "Aitkin", "Crow Wing",
                               "Mille Lacs", "Kanabec", "Morrison","Isanti","Anoka","Chisago","Todd","Benton","Sherburne","Stearns","Wright","Hennepin","Ramsey","Washington","Carver","Scott","Dakota","Goodhue","Wabasha","Olmsted","Winona","Fillmore","Houston",
                               "Kittson","Roseau","Lake of the Woods","Pennington","Marshall","Red Lake", "Polk","Norman","Mahnomen","Clearwater","Hubbard","Beltrami","Wadena","Cass","Becker","Clay","Wilkin","Otter Tail","Traverse","Grant","Douglas","Stevens","Pope",
                               "Big Stone","Swift","Chippewa","Kandiyohi","Meeker", "McCleod","Sibley","Renville","Lac Qui Parle","Yellow Medicine","Lincoln","Lyon","Brown","Redwood","Nicollet","Le Sueur","Rice","Dodge","Steele","Waseca","Blue Earth", "Watonwan","Cottonwood","Murray","Pipestone","Rock","Nobles","Jackson","Martin","Faribault","Freeborn","Mower"),
                  admin=c(rep("Northeast",9),rep("Central",23), rep("Northwest",23),rep("South",32)), stringsAsFactors = FALSE)

merged<-merge(merged,ref,by="CNTYNAME",all.x = TRUE)
merged$admin[is.na(merged$admin)] <- "out of state"
#filter out zeros aka people whose survey did not reach them
merged$completed.survey<-ifelse(merged$`Completed Survey`==1|merged$`Completed Survey`==2,"yes",ifelse(merged$`Completed Survey`==0,"bb","no"))
merged$completed.survey[is.na(merged$completed.survey)] <- "no"
merged<-filter(merged,completed.survey!="bb")



#Remove the responses that were from duplicated IDs 
merged<-merged[!duplicated(merged$IDs), ]

merged %>% group_by(admin, completed.survey) %>%
  summarize(n=n())

#chisq test to check for independence
library(MASS)
#library(RVAideMemoire)
merged.table<-table(merged$admin,merged$completed.survey)
chisq.test(merged.table) #seeing significant differences let's look closer

#chisq.multcomp(merged.table)
#Global chisq shows significance...post hoc test below
chisq.test(merged.table)$stdres #call standardized residuals
sig<-0.05 #set significance level
sigAdj<-sig/(nrow(merged.table)*ncol(merged.table))
qnorm(sigAdj/2)

merged.frame<-data.frame(merged.table[,"no"],merged.table[,"yes"]) #convert to dataframe
colnames(merged.frame)<-c("no","yes") #rename columns
merged.frame<-mutate(merged.frame, prop.resp=yes/(yes+no),total=yes+no)

#global proportion test
prop.test(x=merged.table[,c("yes","no")]) #have to switch column order to match syntax of prop.test func

#pairwise with bonferroni correction
pairwise.prop.test(merged.table[,c("yes","no")],p.adjust.method = "bonferroni") #switch column order, apply bonferroni correction


#Age difference of respondents vs gen pop? 
survey.responses<-read.csv("Meg's Datasets/Identified Responses20210122.csv", strip.white = TRUE) #update with latest filename/date; see "Linking responses with identities" code for how the identified responses document was generated.
#trim white spaces from strings:
library(data.table)
survey.responses<-fread("Meg's Datasets/Identified Responses20210122.csv", strip.white = TRUE, data.table = FALSE)#File will be named with 1824444_Thompson_CombinedData_date of download.

#Remove the responses that were from duplicated IDs 
survey.responses<-survey.responses[!duplicated(survey.responses$PrimaryID), ]

mean(survey.responses$CUST_AGE)
mean(merged$CUST_AGE)

#create age category variable in the merged dataset
n<-length(merged$IDs)
merged$age_cat<-NA

for (i in 1:n){
  merged$age_cat[i]<-ifelse(merged$CUST_AGE[i]>=18 & merged$CUST_AGE[i]<=39,"18-39",
                      ifelse(merged$CUST_AGE[i]>=40 & merged$CUST_AGE[i]<=49,"40-49",
                        ifelse(merged$CUST_AGE[i]>=50 & merged$CUST_AGE[i]<=59,"50-59",
                       ifelse(merged$CUST_AGE[i]>=60 & merged$CUST_AGE[i]<=69,"60-69",
                        ifelse(merged$CUST_AGE[i]>=70,"70+",NA)))))
}

mageTab<-merged %>% group_by (completed.survey,age_cat) %>% summarize (n=n())

#create age category variable in the survey responses dataset
m<-length(survey.responses$PrimaryID)
survey.responses$age_cat<-NA
for (i in 1:m){
  survey.responses$age_cat[i]<-ifelse(survey.responses$CUST_AGE[i]>=18 & survey.responses$CUST_AGE[i]<=39,"18-39",
                            ifelse(survey.responses$CUST_AGE[i]>=40 & survey.responses$CUST_AGE[i]<=49,"40-49",
                            ifelse(survey.responses$CUST_AGE[i]>=50 & survey.responses$CUST_AGE[i]<=59,"50-59",
                            ifelse(survey.responses$CUST_AGE[i]>=60 & survey.responses$CUST_AGE[i]<=69,"60-69",
                            ifelse(survey.responses$CUST_AGE[i]>=70,"70+",NA)))))
}

sageTab<-survey.responses %>% group_by (age_cat) %>% summarize (n=n())

#tabulate response rates by age between the two data frames
ageTab<-table(merged$age_cat,merged$completed.survey)

#global proportion test
prop.test(x=ageTab[,c("yes","no")])

#pairwise
pairwise.prop.test(x=ageTab[,c("yes","no")], p.adjust.method = "bonferroni")

#these below are not corrected for multiple comparisons:
prop.test(x=c(306,2517), n=c(671,3788)) #18-39s significantly underrepresented in our sample
prop.test(x=c(63,326), n=c(671,3788)) #40-49 accurately represented in our sample
prop.test(x=c(97,365), n=c(671,3788)) #50-59 significantly overrepresented in our sample
prop.test(x=c(129,388), n=c(671,3788)) #60-69 significantly overrepresented 
prop.test(x=c(76,192), n=c(671,3788)) #70+ significantly overrepresented in our sample

#Apply weights to survey response calculation:
#weight=1/(sampleprop/populationprop)
weight1839<-1/(0.45/0.66)
weight4049<-1/(0.094/0.086)
weight5059<-1/(0.14/0.09)
weight6069<-1/(0.19/0.10)
weight70plus<-1/(0.11/0.05)

#Store age weights in weights array
weightsarray<-c(weight1839,weight4049,weight5059,weight6069,weight70plus)

#### Survey responses manipulation ##############################################
library(tidyverse)
library(fitdistrplus)

## 1/22/2021 Changes made to file in excel prior to reading in:
#4/6/20 made some adjustments in excel file, including changing "doz" to 12 and creating columns for presence/absence
#4/8/20 for fish numbers estimated using "scoop" I estimated a number of fish using the following:
#FHM runif(1,min=30,max=36) rounded to the nearest whole
#other shiners runif(1,30,36)
#redbelly dace aka "rainbows" runif(1,25,30)
#multiplied by the number of scoops to get an estimate
#Jan 22 2021 note: created columns Q16_1_release through Q16_5_noneleft to extract information from multiple select Q16.

MyTheme = theme(text = element_text(size=24), axis.text.x = element_text(size=14), axis.text.y = element_text(size=24)) 
survey.responses<-read.csv("Meg's Datasets/Identified Responses20210122.csv", strip.white = TRUE) #update with latest filename/date; see "Linking responses with identities" code for how the identified responses document was generated.
#trim white spaces from strings:
library(data.table)
survey.responses<-fread("Meg's Datasets/Identified Responses20210122.csv", strip.white = TRUE, data.table = FALSE)#File will be named with 1824444_Thompson_CombinedData_date of download.
#responses.fin<-filter(survey.responses, Finished=="True") #don't need this anymore since the Identified Responses  file already is filtered for completion
variables<-read.csv("C:/Users/thom4412/Box/1824444_Thompson/1824444_Thompson_CombinedData_VariableLabels.csv")

#Remove the responses that were from duplicated IDs 
survey.responses<-survey.responses[!duplicated(survey.responses$PrimaryID), ]

#create age category variable
m<-length(survey.responses$PrimaryID)
survey.responses$age_cat<-NA
for (i in 1:m){
  survey.responses$age_cat[i]<-ifelse(survey.responses$CUST_AGE[i]>=18 & survey.responses$CUST_AGE[i]<=39,"18-39",
                                      ifelse(survey.responses$CUST_AGE[i]>=40 & survey.responses$CUST_AGE[i]<=49,"40-49",
                                             ifelse(survey.responses$CUST_AGE[i]>=50 & survey.responses$CUST_AGE[i]<=59,"50-59",
                                                    ifelse(survey.responses$CUST_AGE[i]>=60 & survey.responses$CUST_AGE[i]<=69,"60-69",
                                                           ifelse(survey.responses$CUST_AGE[i]>=70,"70+",NA)))))
}

#Calculate weights for each age 
#weight=1/(sampleprop/populationprop)
weight1839<-1/(0.45/0.66)
weight4049<-1/(0.094/0.086)
weight5059<-1/(0.14/0.09)
weight6069<-1/(0.19/0.10)
weight70plus<-1/(0.11/0.05)
weightsarray<-c(weight1839,weight4049,weight5059,weight6069,weight70plus)


# Rename columns for number of days of fishing per month. (Survey Q1)
survey.responses<- rename(survey.responses, Days_Mar18="Q1_1_1") %>%
  rename(Days_Apr18="Q1_1_2") %>%
  rename(Days_May18="Q1_1_3") %>%
  rename(Days_Jun18="Q1_1_4") %>%
  rename(Days_Jul18="Q1_1_5") %>%
  rename(Days_Aug18="Q1_1_6") %>%
  rename(Days_Sep18="Q1_1_7") %>%
  rename(Days_Oct18="Q1_1_8") %>%
  rename(Days_Nov18="Q1_1_9") %>%
  rename(Days_Dec18="Q1_1_10") %>%
  rename(Days_Jan19="Q1_1_11") %>%
  rename(Days_Feb19="Q1_1_12") 

# Rename columns for number of days of baitfish use per month. (Survey Q2)
survey.responses <- rename(survey.responses, Days_bait_Mar18="Q2_1_1") %>%
  rename(Days_bait_Apr18="Q2_1_2") %>%
  rename(Days_bait_May18="Q2_1_3") %>%
  rename(Days_bait_Jun18="Q2_1_4") %>%
  rename(Days_bait_Jul18="Q2_1_5") %>%
  rename(Days_bait_Aug18="Q2_1_6") %>%
  rename(Days_bait_Sep18="Q2_1_7") %>%
  rename(Days_bait_Oct18="Q2_1_8") %>%
  rename(Days_bait_Nov18="Q2_1_9") %>%
  rename(Days_bait_Dec18="Q2_1_10") %>%
  rename(Days_bait_Jan19="Q2_1_11") %>%
  rename(Days_bait_Feb19="Q2_1_12") 

# Rename columns for harvest frequency and location (Paper Survey Section 1)
survey.responses <- rename(survey.responses, harvest_frequency="Q3") %>% #Never, Rarely, Often
  rename(harvest_zonea="Q5_1") %>%
  rename(harvest_zoneb="Q5_2") %>%
  rename(harvest_zonec="Q5_3") %>%
  #rename(harvest_outside="Q5a") %>%
  #Zones where they fished with the live baitfish they harvested (paper survey Q5)
  rename(harvest_fisha="Q7_1") %>%
  rename(harvest_fishb="Q7_2") %>%
  rename(harvest_fishc="Q7_3") %>%
  rename(harvest_fishoutside="Q5a")

# Rename columns for purchase frequency and location (Paper Survey Section 2)
survey.responses <-  rename(survey.responses, purchase_frequency="Q10") %>% #Never, Rarely, Often 
  rename(purchase_zonea="Q12_1") %>%
  rename(purchase_zoneb="Q12_2") %>%
  rename(purchase_zonec="Q12_3") %>%
  rename(purchase_outside="Q12a") %>%
  #Zones where they fished with the live baitfish they purchased (paper survey Q12)
  rename(purchase_fisha="Q14_1") %>%
  rename(purchase_fishb="Q14_2") %>%
  rename(purchase_fishc="Q14_3") %>%
  rename(purchase_fishoutside="Q14a") 
  
  

####create variable for number of fishing days per year
  #first make NAs for number of days fishing 0s
survey.responses$Days_Mar18 <- replace_na(survey.responses$Days_Mar18, 0)
survey.responses$Days_Apr18 <- replace_na(survey.responses$Days_Apr18, 0)
survey.responses$Days_May18 <- replace_na(survey.responses$Days_May18, 0)
survey.responses$Days_Jun18 <- replace_na(survey.responses$Days_Jun18, 0)
survey.responses$Days_Jul18 <- replace_na(survey.responses$Days_Jul18, 0)
survey.responses$Days_Aug18 <- replace_na(survey.responses$Days_Aug18, 0)
survey.responses$Days_Sep18 <- replace_na(survey.responses$Days_Sep18, 0)
survey.responses$Days_Oct18 <- replace_na(survey.responses$Days_Oct18, 0)
survey.responses$Days_Nov18 <- replace_na(survey.responses$Days_Nov18, 0)
survey.responses$Days_Dec18 <- replace_na(survey.responses$Days_Dec18, 0)
survey.responses$Days_Jan19 <- replace_na(survey.responses$Days_Jan19, 0)
survey.responses$Days_Feb19 <- replace_na(survey.responses$Days_Feb19, 0)

#make NAs for number of days fishing with bait 0s
survey.responses$Days_bait_Mar18 <- replace_na(survey.responses$Days_bait_Mar18, 0)
survey.responses$Days_bait_Apr18 <- replace_na(survey.responses$Days_bait_Apr18, 0)
survey.responses$Days_bait_May18 <- replace_na(survey.responses$Days_bait_May18, 0)
survey.responses$Days_bait_Jun18 <- replace_na(survey.responses$Days_bait_Jun18, 0)
survey.responses$Days_bait_Jul18 <- replace_na(survey.responses$Days_bait_Jul18, 0)
survey.responses$Days_bait_Aug18 <- replace_na(survey.responses$Days_bait_Aug18, 0)
survey.responses$Days_bait_Sep18 <- replace_na(survey.responses$Days_bait_Sep18, 0)
survey.responses$Days_bait_Oct18 <- replace_na(survey.responses$Days_bait_Oct18, 0)
survey.responses$Days_bait_Nov18 <- replace_na(survey.responses$Days_bait_Nov18, 0)
survey.responses$Days_bait_Dec18 <- replace_na(survey.responses$Days_bait_Dec18, 0)
survey.responses$Days_bait_Jan19 <- replace_na(survey.responses$Days_bait_Jan19, 0)
survey.responses$Days_bait_Feb19 <- replace_na(survey.responses$Days_bait_Feb19, 0)

  #add to get total trips per year
n<-length(survey.responses$PrimaryID)
for (i in 1:n) {
survey.responses$trips.per.year[i]<-sum(survey.responses$Days_Mar18[i], 
                                     survey.responses$Days_Apr18[i], 
                                     survey.responses$Days_May18[i],
                                     survey.responses$Days_Jun18[i],
                                     survey.responses$Days_Jul18[i],
                                     survey.responses$Days_Aug18[i],
                                     survey.responses$Days_Sep18[i],
                                     survey.responses$Days_Oct18[i],
                                     survey.responses$Days_Nov18[i],
                                     survey.responses$Days_Dec18[i],
                                     survey.responses$Days_Jan19[i],
                                     survey.responses$Days_Feb19[i])}



#Store current dataframe as object to go back to if issues arise
#save(survey.responses, file="survey_responses.RData")
#load("survey_responses.RData")

#Remove problematic rows that have no response for either purchase or harvest of live baitfish and did not fill anything out
survey.responses$bad<- ifelse(survey.responses$harvest_frequency=="" & survey.responses$purchase_frequency=="","bad","good")  #add a column on which to filter responses
survey.responses<-filter(survey.responses, bad=="good") #selects only "good" rows where people actually filled it out and went fishing


#Create columns for behavior during the month in question
#live.bait is a column with three levels: Harvest, Purchase, or did not use livebait 
survey.responses$live.bait<- ifelse(survey.responses$harvest_frequency=="Rarely", "Harvest",
                                    ifelse(survey.responses$harvest_frequency=="Often", "Harvest",
                                           ifelse(survey.responses$purchase_frequency=="Rarely", "Purchase",
                                                  ifelse(survey.responses$purchase_frequency=="Often","Purchase", 
                                                         ifelse(survey.responses$purchase_frequency=="Never" & survey.responses$harvest_frequency=="Never", "Did not use live bait",
                                                                ifelse(survey.responses$purchase_frequency=="" &survey.responses$harvest_frequency=="Never", "Did not use live bait",
                                                                       ifelse(survey.responses$purchase_frequency=="Never" &survey.responses$harvest_frequency=="", "Did not use live bait","Did not use live bait")))))))
#Summarize bait use and plot
survey.responses %>% group_by(live.bait) %>%
  summarize(n=n())
baituse.plot<-ggplot(survey.responses)+aes(live.bait)+geom_bar(color="darkturquoise", fill="darkturquoise",width = 0.5)+ggtitle("Bait source among surveyed Minnesota anglers")+MyTheme+xlab("Source of live baitfish")
baituse.plot

#Create variable for administrative regions based on county
survey.responses$CNTYNAME<-trimws(survey.responses$CNTYNAME) #remove extra white space from cnty names

ref <- data.frame(CNTYNAME = c("Koochiching","Lake","Itasca", "Saint Louis", "Cook", "Carlton", "Pine", "Aitkin", "Crow Wing",
                                "Mille Lacs", "Kanabec", "Morrison","Isanti","Anoka","Chisago","Todd","Benton","Sherburne","Stearns","Wright","Hennepin","Ramsey","Washington","Carver","Scott","Dakota","Goodhue","Wabasha","Olmsted","Winona","Fillmore","Houston",
                               "Kittson","Roseau","Lake of the Woods","Pennington","Marshall","Red Lake", "Polk","Norman","Mahnomen","Clearwater","Hubbard","Beltrami","Wadena","Cass","Becker","Clay","Wilkin","Otter Tail","Traverse","Grant","Douglas","Stevens","Pope",
                               "Big Stone","Swift","Chippewa","Kandiyohi","Meeker", "McCleod","Sibley","Renville","Lac Qui Parle","Yellow Medicine","Lincoln","Lyon","Brown","Redwood","Nicollet","Le Sueur","Rice","Dodge","Steele","Waseca","Blue Earth", "Watonwan","Cottonwood","Murray","Pipestone","Rock","Nobles","Jackson","Martin","Faribault","Freeborn","Mower"),
                            admin=c(rep("Northeast",9),rep("Central",23), rep("Northwest",23),rep("South",32)), stringsAsFactors = FALSE)

survey.responses<-merge(survey.responses,ref,by="CNTYNAME",all.x = TRUE)
survey.responses$admin[is.na(survey.responses$admin)] <- "out of state"
survey.responses %>% group_by(CUST_STATE,admin) %>%
  summarize(n=n())

#plot bait use by administrative area
baituse.plot2<-ggplot(survey.responses)+aes(live.bait,fill=admin)+geom_bar(width = 0.5)+ggtitle("Bait source among surveyed Minnesota anglers")+MyTheme+xlab("Source of live baitfish")
baituse.plot2
survey.responses %>% group_by(admin,live.bait) %>%
  summarize(n=n())
chisq.test(survey.responses$live.bait,survey.responses$admin)
#plot bait use by administrative area, proportion
baituse.plot3<-ggplot(survey.responses)+aes(admin, fill=live.bait)+geom_bar(width = 0.5,position="fill")+ggtitle("Bait source among surveyed Minnesota anglers")+MyTheme+xlab("Region")
baituse.plot3

#Segment based on bait use
survey.responses.harvesters<-filter(survey.responses,live.bait=="Harvest")
survey.responses.purchasers<-filter(survey.responses,live.bait=="Purchase")
survey.responses.baitusers<-rbind(filter(survey.responses,live.bait=="Harvest"),filter(survey.responses,live.bait=="Purchase"))
survey.responses.nonbait<-filter(survey.responses,live.bait=="Did not use live bait")

#Weighted release rate
sbdt <- data.table(survey.responses.baitusers) 

sbdt[, .N, by = .(age_cat)] 
sbdt[ , .N, by = .(age_cat, Q16_1_release)] 
sbdt2 <- merge(sbdt[ , .N, by = .(age_cat, Q16_1_release)],sbdt[, .N, by = .(age_cat)], by = "age_cat") #Creates one dataframe with number of respondents by disposal method per location as well as total number of respondents per location
sbdt2[,prop := N.x/N.y , ]

release.by.age<-sbdt2[Q16_1_release==TRUE,prop] #array of proportions of anglers who released per age category

#weighted average release rate: 
sum(release.by.age*weightsarray)/sum(weightsarray)

#weighted use of live bait
survey.responses %>% group_by(age_cat,live.bait) %>% summarize (n=n())
srdt<-data.table(survey.responses)
srdt2<- merge(srdt[ , .N, by = .(age_cat, live.bait)],srdt[, .N, by = .(age_cat)], by = "age_cat") #Creates one dataframe with number of respondents by disposal method per location as well as total number of respondents per location
srdt2[,prop := N.x/N.y , ]
srdt2

use.by.age<-srdt2[live.bait=="Did not use live bait",prop] #array of proportions of anglers who did not use live bait by age
sum(use.by.age*weightsarray)/sum(weightsarray)


survey.responses.baitusers$disposal_method<- ifelse(survey.responses.baitusers$Q16_1_release==TRUE,"Release in water",
                                   ifelse(survey.responses.baitusers$Q16_2_dispose==TRUE,"Properly disposed",
                                   ifelse(survey.responses.baitusers$Q16_3_giveaway==TRUE, "Gave away",
                                   ifelse(survey.responses.baitusers$Q16_4_other==TRUE,"Other",
                                   ifelse(survey.responses.baitusers$Q16_5_noneleft==TRUE,"None leftover",
                                  #code for harvest disposal methods:
                                  ifelse(survey.responses.baitusers$Q9=="Released into different water from which it was caught","Release in water",
                                  ifelse(survey.responses.baitusers$Q9=="Released into the same water from which it was caught", "Back to source",
                                  ifelse(survey.responses.baitusers$Q9=="Disposed on land/in garbage","Properly disposed",
                                  ifelse(survey.responses.baitusers$Q9=="Gave to a friend", "Gave away",
                                  ifelse(survey.responses.baitusers$Q9=="Other (please describe)","Other",
                                  ifelse(survey.responses.baitusers$Q9=="I did not have any leftover baitfish","None leftover",
                                  #code for no response
                                  ifelse(survey.responses.baitusers$Q9==""&
                                  survey.responses.baitusers$Q16=="",
                                  "Did not respond", ""))))))))))))

#save current frame 
#save(survey.responses.baitusers, file="survey_responses_baitusers.RData")

#plot 
survey.responses.baitusers %>% group_by(disposal_method) %>%
  summarize(n=n())
#plot for papers:
library(forcats)
survey.responses.baitusers %>%
  count(disposal_method) %>%
  mutate(disposal_method = fct_reorder(disposal_method, n, .desc = TRUE)) %>%
  ggplot(aes(x = disposal_method, y = n)) + geom_bar(stat="identity")+
  coord_flip()+
  theme_bw()

#plot for presentations:
disposal.plot<-ggplot(survey.responses.baitusers)+
  aes(x=disposal_method, fill=live.bait) + 
  geom_bar()+ 
  coord_flip() + 
  ggtitle("Method of disposal by bait source among surveyed Minnesota anglers") + 
  xlab("Method of disposal") + labs(fill = "Source") +
  #theme(MyTheme)
  theme_bw()+
  theme(text = element_text(size=24), axis.text.x = element_text(size=18), axis.text.y = element_text(size=24)) 
disposal.plot

#plot disposal by admin area
disposal.plot2<-ggplot(survey.responses.baitusers)+
  aes(disposal_method, fill=admin) + geom_bar()+ 
  coord_flip() + 
  ggtitle("Method of disposal by bait source among surveyed Minnesota anglers") + 
  xlab("Method of disposal") + 
  labs(fill = "Region") + 
  theme(text = element_text(size=24), axis.text.x = element_text(size=18), axis.text.y = element_text(size=24)) +
  theme_bw() +
  scale_fill_grey()
disposal.plot2
#disposal plot by admin region of residence

#make a column for binary yes/no release or not--REMOVED 1/22/21 because colum q16_1_release answers this question
#survey.responses.baitusers$releaseyn<-ifelse(survey.responses.baitusers$disposal_method=="Release in water","yes","no")

#plot release percentage by admin area GRAYSCALE for publication: disposal_by_admin_grey.jpeg
library(dplyr)
p <- survey.responses.baitusers %>%
  mutate(admin = fct_relevel(admin, 
                            "Central", "South","Northeast","Northwest", "out of state")) %>%
  ggplot( aes(x=admin, fill=disposal_method)) +
  geom_bar(position = "fill") + MyTheme + xlab("Region" ) + ylab("proportion") +
  theme(axis.text.x = element_text(angle=45,vjust = .5)) +
  theme_bw()+
  scale_fill_grey()+
  labs(fill="Disposal method")
p

#plot release percentage by admin area COLOR for publication: disposal_by_admin_grey.jpeg
#only 6 categories, release back to source was grouped with "properly disposed", colorblind friendly, okay for grayscale printing
pcolor <- survey.responses.baitusers %>%
  mutate(admin = fct_relevel(admin, 
                             "Central", "South","Northeast","Northwest", "out of state"))%>%
  mutate(disposal_method = ifelse(as.character(disposal_method)=="Back to source","Properly disposed",
                                  as.character(disposal_method))) %>%
  mutate(disposal_method=fct_relevel(disposal_method,
                                      "Did not respond", "None leftover", "Other","Gave away",
                                     "Properly disposed", "Release in water")) %>%
  ggplot( aes(x=admin, fill=disposal_method)) +
  geom_bar(position = "fill") + MyTheme + xlab("Region" ) + ylab("proportion") +
  theme(axis.text.x = element_text(angle=45,vjust = .5)) +
  theme_bw()+
  scale_fill_manual(values=c("#FFDBD4", "#FFC107", "#EF620D","#1E88E5","#D81B60", "#004D40"))+
  labs(fill="Disposal method")
pcolor

#plot release percentage by admin area PATTERN
library(ggpattern)
ppattern <- survey.responses.baitusers %>%
  mutate(admin = fct_relevel(admin, 
                             "Central", "South","Northeast","Northwest", "out of state")) %>%
  ggplot(aes(fill=disposal_method, x=admin)) +
  geom_bar_pattern(position = "fill", 
    aes(pattern=disposal_method, pattern_angle=disposal_method, pattern_spacing=disposal_method),
    fill = 'white',
    colour  = 'black',
    pattern_density = 0.1,
    pattern_fill    = 'black',
    pattern_colour  = 'black') +
  MyTheme + 
  xlab("Region" ) + ylab("proportion") +
  theme(axis.text.x = element_text(angle=45,vjust = .5)) +
  theme_bw()+
  labs(fill="Disposal method")
ppattern

#test for significant differences in release behavior across different regions of residence
library(MASS)
region.table<-table(survey.responses.baitusers$admin,survey.responses.baitusers$Q16_1_release)
chisq.test(region.table)
#no significant difference, however small cell values (ie low statistical power)

####Create columns for destination fishing zones####
survey.responses.baitusers <- mutate_at(survey.responses.baitusers,c("harvest_fisha", "harvest_fishb", "harvest_fishc", "purchase_fisha", "purchase_fishb", "purchase_fishc"), as.character)

###Change zone NAs to 0s so that the ifelse commands below don't return NAs#####
survey.responses.baitusers$harvest_fisha <- replace_na(survey.responses.baitusers$harvest_fisha, 0)
survey.responses.baitusers$harvest_fishb <- replace_na(survey.responses.baitusers$harvest_fishb, 0)
survey.responses.baitusers$harvest_fishc <- replace_na(survey.responses.baitusers$harvest_fishc, 0)
survey.responses.baitusers$purchase_fisha <- replace_na(survey.responses.baitusers$purchase_fisha, 0)
survey.responses.baitusers$purchase_fishb <- replace_na(survey.responses.baitusers$purchase_fishb, 0)
survey.responses.baitusers$purchase_fishc <- replace_na(survey.responses.baitusers$purchase_fishc, 0)


#Create a column for each zone to state whether that angler fished there at all during this month

survey.responses.baitusers$fish_zone4<-ifelse(survey.responses.baitusers$purchase_fisha=="4","yes",
                                              ifelse(survey.responses.baitusers$purchase_fishb=="4","yes",
                                                     ifelse(survey.responses.baitusers$purchase_fishc=="4","yes",
                                                            ifelse(survey.responses.baitusers$harvest_fisha=="4", "yes",
                                                                   ifelse(survey.responses.baitusers$harvest_fishb=="4", "yes",
                                                                          ifelse(survey.responses.baitusers$harvest_fishc=="4","yes","no"))))))
                   
survey.responses.baitusers$fish_zone4<-ifelse(survey.responses.baitusers$purchase_fisha=="4","yes",
                                              ifelse(survey.responses.baitusers$purchase_fishb=="4","yes",
                                                     ifelse(survey.responses.baitusers$purchase_fishc=="4","yes",
                                                            ifelse(survey.responses.baitusers$harvest_fisha=="4", "yes",
                                                                   ifelse(survey.responses.baitusers$harvest_fishb=="4", "yes",
                                                                          ifelse(survey.responses.baitusers$harvest_fishc=="4","yes","no"))))))

survey.responses.baitusers$fish_zone1<-ifelse(survey.responses.baitusers$purchase_fisha=="1","yes",
                                              ifelse(survey.responses.baitusers$purchase_fishb=="1","yes",
                                                     ifelse(survey.responses.baitusers$purchase_fishc=="1","yes",
                                                            ifelse(survey.responses.baitusers$harvest_fisha=="1", "yes",
                                                                   ifelse(survey.responses.baitusers$harvest_fishb=="1", "yes",
                                                                          ifelse(survey.responses.baitusers$harvest_fishc=="1","yes","no"))))))
survey.responses.baitusers$fish_zone2<-ifelse(survey.responses.baitusers$purchase_fisha=="2","yes",
                                              ifelse(survey.responses.baitusers$purchase_fishb=="2","yes",
                                                     ifelse(survey.responses.baitusers$purchase_fishc=="2","yes",
                                                            ifelse(survey.responses.baitusers$harvest_fisha=="2", "yes",
                                                                   ifelse(survey.responses.baitusers$harvest_fishb=="2", "yes",
                                                                          ifelse(survey.responses.baitusers$harvest_fishc=="2","yes","no"))))))
survey.responses.baitusers$fish_zone3<-ifelse(survey.responses.baitusers$purchase_fisha=="3","yes",
                                              ifelse(survey.responses.baitusers$purchase_fishb=="3","yes",
                                                     ifelse(survey.responses.baitusers$purchase_fishc=="3","yes",
                                                            ifelse(survey.responses.baitusers$harvest_fisha=="3", "yes",
                                                                   ifelse(survey.responses.baitusers$harvest_fishb=="3", "yes",
                                                                          ifelse(survey.responses.baitusers$harvest_fishc=="3","yes","no"))))))

survey.responses.baitusers$fish_zone5<-ifelse(survey.responses.baitusers$purchase_fisha=="5","yes",
                                              ifelse(survey.responses.baitusers$purchase_fishb=="5","yes",
                                                     ifelse(survey.responses.baitusers$purchase_fishc=="5","yes",
                                                            ifelse(survey.responses.baitusers$harvest_fisha=="5", "yes",
                                                                   ifelse(survey.responses.baitusers$harvest_fishb=="5", "yes",
                                                                          ifelse(survey.responses.baitusers$harvest_fishc=="5","yes","no"))))))
survey.responses.baitusers$fish_zone6<-ifelse(survey.responses.baitusers$purchase_fisha=="6","yes",
                                              ifelse(survey.responses.baitusers$purchase_fishb=="6","yes",
                                                     ifelse(survey.responses.baitusers$purchase_fishc=="6","yes",
                                                            ifelse(survey.responses.baitusers$harvest_fisha=="6", "yes",
                                                                   ifelse(survey.responses.baitusers$harvest_fishb=="6", "yes",
                                                                          ifelse(survey.responses.baitusers$harvest_fishc=="6","yes","no"))))))
survey.responses.baitusers$fish_zone7<-ifelse(survey.responses.baitusers$purchase_fisha=="7","yes",
                                              ifelse(survey.responses.baitusers$purchase_fishb=="7","yes",
                                                     ifelse(survey.responses.baitusers$purchase_fishc=="7","yes",
                                                            ifelse(survey.responses.baitusers$harvest_fisha=="7", "yes",
                                                                   ifelse(survey.responses.baitusers$harvest_fishb=="7", "yes",
                                                                          ifelse(survey.responses.baitusers$harvest_fishc=="7","yes","no"))))))
survey.responses.baitusers$fish_zone8<-ifelse(survey.responses.baitusers$purchase_fisha=="8","yes",
                                              ifelse(survey.responses.baitusers$purchase_fishb=="8","yes",
                                                     ifelse(survey.responses.baitusers$purchase_fishc=="8","yes",
                                                            ifelse(survey.responses.baitusers$harvest_fisha=="8", "yes",
                                                                   ifelse(survey.responses.baitusers$harvest_fishb=="8", "yes",
                                                                          ifelse(survey.responses.baitusers$harvest_fishc=="8","yes","no"))))))
survey.responses.baitusers$fish_zone9<-ifelse(survey.responses.baitusers$purchase_fisha=="9","yes",
                                              ifelse(survey.responses.baitusers$purchase_fishb=="9","yes",
                                                     ifelse(survey.responses.baitusers$purchase_fishc=="9","yes",
                                                            ifelse(survey.responses.baitusers$harvest_fisha=="9", "yes",
                                                                   ifelse(survey.responses.baitusers$harvest_fishb=="9", "yes",
                                                                          ifelse(survey.responses.baitusers$harvest_fishc=="9","yes","no"))))))
survey.responses.baitusers$fish_zone10<-ifelse(survey.responses.baitusers$purchase_fisha=="10","yes",
                                              ifelse(survey.responses.baitusers$purchase_fishb=="10","yes",
                                                     ifelse(survey.responses.baitusers$purchase_fishc=="10","yes",
                                                            ifelse(survey.responses.baitusers$harvest_fisha=="10", "yes",
                                                                   ifelse(survey.responses.baitusers$harvest_fishb=="10", "yes",
                                                                          ifelse(survey.responses.baitusers$harvest_fishc=="10","yes","no"))))))
survey.responses.baitusers$fish_zone11<-ifelse(survey.responses.baitusers$purchase_fisha=="11","yes",
                                              ifelse(survey.responses.baitusers$purchase_fishb=="11","yes",
                                                     ifelse(survey.responses.baitusers$purchase_fishc=="11","yes",
                                                            ifelse(survey.responses.baitusers$harvest_fisha=="11", "yes",
                                                                   ifelse(survey.responses.baitusers$harvest_fishb=="11", "yes",
                                                                          ifelse(survey.responses.baitusers$harvest_fishc=="11","yes","no"))))))
survey.responses.baitusers$fish_zone12<-ifelse(survey.responses.baitusers$purchase_fisha=="12","yes",
                                              ifelse(survey.responses.baitusers$purchase_fishb=="12","yes",
                                                     ifelse(survey.responses.baitusers$purchase_fishc=="12","yes",
                                                            ifelse(survey.responses.baitusers$harvest_fisha=="12", "yes",
                                                                   ifelse(survey.responses.baitusers$harvest_fishb=="12", "yes",
                                                                          ifelse(survey.responses.baitusers$harvest_fishc=="12","yes","no"))))))
#make dataframe long
baitusers.long<-gather(survey.responses.baitusers,"zone", "choice", fish_zone1,fish_zone2,fish_zone3,fish_zone4,
                       fish_zone5,fish_zone6,fish_zone7,fish_zone8,fish_zone9,fish_zone10,fish_zone11,fish_zone12)
baitusers.long<-filter(baitusers.long,choice=="yes")
ggplot(baitusers.long)+aes(zone,fill=Q16_1_release)+geom_bar()
ggplot(baitusers.long)+aes(zone,fill=CUST_STATE)+geom_bar(position="fill")

#Trips per zone: 
fish_zone1<-nrow(filter(survey.responses.baitusers,fish_zone1=="yes"))
fish_zone2<-nrow(filter(survey.responses.baitusers,fish_zone2=="yes"))
fish_zone3<-nrow(filter(survey.responses.baitusers,fish_zone3=="yes"))
fish_zone4<-nrow(filter(survey.responses.baitusers,fish_zone4=="yes"))
fish_zone5<-nrow(filter(survey.responses.baitusers,fish_zone5=="yes"))
fish_zone6<-nrow(filter(survey.responses.baitusers,fish_zone6=="yes"))
fish_zone7<-nrow(filter(survey.responses.baitusers,fish_zone7=="yes"))
fish_zone8<-nrow(filter(survey.responses.baitusers,fish_zone8=="yes"))
fish_zone9<-nrow(filter(survey.responses.baitusers,fish_zone9=="yes"))
fish_zone10<-nrow(filter(survey.responses.baitusers,fish_zone10=="yes"))
fish_zone11<-nrow(filter(survey.responses.baitusers,fish_zone11=="yes"))
fish_zone12<-nrow(filter(survey.responses.baitusers,fish_zone12=="yes"))
trips.per.zone<-data.frame(fish_zone1,fish_zone2,fish_zone3,fish_zone4,fish_zone5,fish_zone6,
                           fish_zone7,fish_zone8,fish_zone9,fish_zone10,fish_zone11,fish_zone12)
tpz<-gather(trips.per.zone,key = "zone", "Number_trips", fish_zone1,fish_zone2,fish_zone3,fish_zone4,fish_zone5,fish_zone6,
            fish_zone7,fish_zone8,fish_zone9,fish_zone10,fish_zone11,fish_zone12)

#create data tables and merge 
library(data.table)
df<-data.table(tpz) 
df2<-data.table(baitusers.long)

df2[,.N,by=.(zone,disposal_method)] #gives number of each disposal method by zone
dfm<-merge(df2[,.N,by=.(zone,disposal_method)],df) #merges on zone
dfm[,.prop:=N/Number_trips,] #create new column for the proportions within each zone
dfx<-merge(df2[,.N,by=.(zone,Q16_1_release)],df) #merges on zone
dfx[,.prop:=N/Number_trips,] #create new column for the proportions within each zone
dfx

#test for independence between release behavior and zone of fishing
release_by_zone<-data.table(read.csv("release_by_zone.csv",row.names=1))
chisq.test(release_by_zone)

#test for significant difference in number of trips per year between bait users and nonbait users
t.test(survey.responses.baitusers$trips.per.year,survey.responses.nonbait$trips.per.year)

###Calculating average number of fish brought on fishing trip

#rename columns
survey.responses.baitusers<-rename(survey.responses.baitusers,whitesuckers="Q15_1_1") %>%
  rename(fatheadminnows="Q15_2_1") %>%
  rename(goldenshiners="Q15_3_1") %>%
  rename(othershiners= "Q15_4_1") %>%
  rename(redbellydace= "Q15_5_1") %>%
  rename(redtailchubs= "Q15_6_1") %>%
  rename(otherchubs="Q15_7_1") %>%
  rename(otherfish="Q15_8_1") 



#change into numbers; anything that includes text (eg "some" or "x" becomes NA)
survey.responses.baitusers$whitesuckers<-as.numeric(as.character(survey.responses.baitusers$whitesuckers),stringsAsFactors=FALSE)
survey.responses.baitusers$fatheadminnows<-as.numeric(as.character(survey.responses.baitusers$fatheadminnows),stringsAsFactors=FALSE)
survey.responses.baitusers$goldenshiners<-as.numeric(as.character(survey.responses.baitusers$goldenshiners),stringsAsFactors=FALSE)
survey.responses.baitusers$othershiners<-as.numeric(as.character(survey.responses.baitusers$othershiners),stringsAsFactors=FALSE)
survey.responses.baitusers$redbellydace<-as.numeric(as.character(survey.responses.baitusers$redbellydace),stringsAsFactors=FALSE)
survey.responses.baitusers$redtailchubs<-as.numeric(as.character(survey.responses.baitusers$redtailchubs),stringsAsFactors=FALSE)
survey.responses.baitusers$otherchubs<-as.numeric(as.character(survey.responses.baitusers$otherchubs),stringsAsFactors=FALSE)
survey.responses.baitusers$otherfish<-as.numeric(as.character(survey.responses.baitusers$otherfish),stringsAsFactors=FALSE)

#summarize average number of fish purchased and number of anglers who purchased that type of fish
library(reshape2)
library(data.table)
typefish<-survey.responses.baitusers[97:112] #select just columns of interest
#melt data to make long
SRBlong<-reshape2::melt(typefish,measure.vars=c("whitesuckers","WTS.yn","fatheadminnows","FHM.yn","goldenshiners",
                                 "GSH.yn","othershiners","OSH.yn","redbellydace","RBD.yn","redtailchubs",
                                 "RTC.yn","otherchubs","OTC.yn","otherfish","OFS.yn"),
              variable.name = "fish",
              value.name = "number")

#remove any fish number that is 0 to get an average number of fish purchased among anglers who purchased that particular species (don't zero-inflate mean and SD)
SRBlong<-subset(SRBlong,number>0)

#convert to data table  
SRBlong.dt<-data.table(SRBlong)

#use data.table
setkey(SRBlong.dt,fish)
typefish.summary<-SRBlong.dt[,list(totalusers=sum(number,na.rm = TRUE),
                meanfish=mean(number,na.rm = TRUE),
                sdfish=sd(number,na.rm = TRUE),
                maxfish=max(number,na.rm = TRUE)),
                by=fish]

#Count total number of fish (of any species) brought on a trip
survey.responses.baitusers$fish.per.trip<-NA

#loop to calculate total number of fish for each bait user
for(i in 1:length(survey.responses.baitusers$PrimaryID)){
  survey.responses.baitusers[i,"fish.per.trip"]<-sum(survey.responses.baitusers[i,"whitesuckers"],
                                                     survey.responses.baitusers[i,"fatheadminnows"],
                                                     survey.responses.baitusers[i,"goldenshiners"],
                                                     survey.responses.baitusers[i,"othershiners"],
                                                     survey.responses.baitusers[i,"redbellydace"],
                                                     survey.responses.baitusers[i,"redtailchubs"],
                                                     survey.responses.baitusers[i,"otherchubs"],
                                                     survey.responses.baitusers[i,"otherfish"],na.rm = TRUE)
}

#calculate mean and SD and look at distribution
mean(survey.responses.baitusers$fish.per.trip,na.rm = TRUE)
sd(survey.responses.baitusers$fish.per.trip,na.rm = TRUE)
ggplot(survey.responses.baitusers)+geom_histogram(aes(fish.per.trip),binwidth = 10)
 

#large spike...what's the most commonly reported number of fish?
getmode <- function(v) { #create mode function because there isn't a built in one in R
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
getmode(survey.responses.baitusers$fish.per.trip)
#24 is the most commonly reported number of fish; likely artifact because of use of "dozen" in reporting



##How do species purchases vary by month? 
SRBlong2<-reshape2::melt(survey.responses.baitusers,measure.vars=c("whitesuckers","fatheadminnows","goldenshiners",
                                      "othershiners","redbellydace","redtailchubs",
                                      "otherchubs","otherfish"),
              variable.name = "fish",
              value.name = "number")
SRBlong2<-subset(SRBlong2,number>0)
SRBlong2<-subset(SRBlong2,Q3a != "")

SRBlong2%>% group_by(Q3a) %>% summarize(n=n()) #remove missing months (n-7)

SRBlong2$Q3a<-factor(SRBlong2$Q3a,levels=c("January","February","March","April","May","June",
                                           "July","August","September",
                                          "October", "November", "December"))

##Count total number of AFT-susceptible fish brought on a trip
survey.responses.baitusers$AFT.susc<-NA

#loop to calculate total number of AFT-susc fish for each bait user
for(i in 1:length(survey.responses.baitusers$PrimaryID)){
  survey.responses.baitusers[i,"AFT.susc"]<-sum(survey.responses.baitusers[i,"fatheadminnows"],
                                                survey.responses.baitusers[i,"goldenshiners"],
                                                survey.responses.baitusers[i,"othershiners"],
                                                survey.responses.baitusers[i,"otherchubs"],
                                                na.rm = TRUE)
}

#filter out anglers who did not use AFT:
AFTusers<-filter(survey.responses.baitusers,AFT.susc>0)

#fit distribution to total number of AFT-susc fish used by anglers who used AFT-susc fish
max(AFTusers$AFT.susc)
min(AFTusers$AFT.susc)
mean(AFTusers$AFT.susc)
hist(AFTusers$AFT.susc)
fitdistrplus::fitdist(AFTusers$AFT.susc,distr=dexp,method="mle")
#rate=0.0248, Beta=40.3


#create season variable: 
SRBlong2$season<-ifelse(SRBlong2$Q3a=="May"|
                          SRBlong2$Q3a=="June"|
                          SRBlong2$Q3a=="July"|
                          SRBlong2$Q3a=="August"|
                          SRBlong2$Q3a=="September"|
                          SRBlong2$Q3a=="October","open",
                          "ice")


#plot number of each fish species purchased by month:
ggplot(SRBlong2, aes(Q3a, number)) + 
  geom_bar(aes(fill = fish), stat = "summary", position = "dodge",fun.y="mean")

#plot number of fish per species by season: 
SRBlong2$season<-ifelse(SRBlong2$Q3a=="May"|
                          SRBlong2$Q3a=="June"|
                          SRBlong2$Q3a=="July"|
                          SRBlong2$Q3a=="August"|
                          SRBlong2$Q3a=="September"|
                          SRBlong2$Q3a=="October","open",
                        "ice")
ggplot(SRBlong2, aes(season, number)) + 
  geom_bar(aes(fill = fish), stat = "summary", position = "dodge",fun.y="mean")+ylab('mean number purchased')


summary(lm(data=SRBlong2,number~fish+1))
library(car)
Anova(lm(data=SRBlong2,number~fish))
mod<-aov(data=SRBlong2,number~fish*season)
summary(mod)
TukeyHSD(mod)

##Is there a seasonal difference in purchasing of OVIO-susc fish (ie golden shiners?)
survey.responses.baitusers$season<-ifelse(survey.responses.baitusers$Q3a=="May"|
                                            survey.responses.baitusers$Q3a=="June"|
                                            survey.responses.baitusers$Q3a=="July"|
                                            survey.responses.baitusers$Q3a=="August"|
                                            survey.responses.baitusers$Q3a=="September"|
                                            survey.responses.baitusers$Q3a=="October","open",
                                          "ice")

##Is there a seasonal difference in purchasing of golden shiners? 
summary(lm(data=survey.responses.baitusers,goldenshiners~season))
#no

##Is there a seasonal difference in number of trips taken?
n<-length(survey.responses.baitusers$PrimaryID)
for (i in 1:n) {
  survey.responses.baitusers$trips.in.summer[i]<-sum(survey.responses$Days_May18[i],
                                          survey.responses$Days_Jun18[i],
                                          survey.responses$Days_Jul18[i],
                                          survey.responses$Days_Aug18[i],
                                          survey.responses$Days_Sep18[i],
                                          survey.responses$Days_Oct18[i],na.rm = TRUE)
  
  survey.responses.baitusers$trips.in.winter[i]<-sum(survey.responses$Days_Jan18[i],
                                                     survey.responses$Days_Feb18[i],
                                                     survey.responses$Days_Mar18[i],
                                                     survey.responses$Days_Apr18[i],
                                                     survey.responses$Days_Nov18[i],
                                                     survey.responses$Days_Dec18[i],na.rm = TRUE)
                }

##Is there a seasonal difference in number of trips per season?
mean(survey.responses.baitusers$trips.in.summer,na.rm = TRUE)
mean(survey.responses.baitusers$trips.in.winter,na.rm = TRUE)
t.test(survey.responses.baitusers$trips.in.summer,survey.responses.baitusers$trips.in.winter)

##Significant difference in the number of trips. Fit distributions to each individually
fitdist(survey.responses.baitusers$trips.in.summer,distr = "exp", method = "mle")
fitdist(survey.responses.baitusers$trips.in.winter,distr = "exp", method = "mle")
mean(rexp(10000,rate = 0.0553))
mean(rexp(10000,rate=0.356))

#What is the maximum number of trips taken in each season (for the purposes of truncating exponential distributions in realistic place)
max(survey.responses.baitusers$trips.in.summer,na.rm = TRUE)
max(survey.responses.baitusers$trips.in.winter,na.rm = TRUE)

survey.responses$Q3a<-factor(survey.responses$Q3a,levels=c("January","February","March","April","May","June",
                                           "July","August","September",
                                           "October", "November", "December"))
survey.responses$season<-ifelse(survey.responses$Q3a=="May"|
                                            survey.responses$Q3a=="June"|
                                            survey.responses$Q3a=="July"|
                                            survey.responses$Q3a=="August"|
                                            survey.responses$Q3a=="September"|
                                            survey.responses$Q3a=="October","open",
                                          "ice")


## 5/26/20 below code was used for fitting dist. to total fish purchased. commenting now for later use
#fit poisson curve to data using optim
# minus.logL.s<-function(lambda, dat){
#   -sum(dpois(dat,lambda, log=TRUE))}
# 
# mle<-optim(par=45,fn=minus.logL.s, method="BFGS",hessian=T,dat=survey.responses.baitusers$fish.per.trip)
# mle
# 
# #simulate data coming from a poisson distribution of mean 38
# simspois<-as.data.frame(rpois(5000, 38)) 
# colnames(simspois)<-("simulated_values")
# 
# #fit negative binomial curve
# minus.logL.nb<-function(pars, dat){
#   mu<-pars[1]
#   size<-pars[2]
#   -sum(dnbinom(dat, mu=mu, size=size,log=TRUE))}
# mlenb<-optim(par=c(mu=38,size=1),fn=minus.logL.nb, method="BFGS",hessian=T,dat=survey.responses.baitusers$fish.per.trip)
# mlenb
# #simulate data coming from a negative binomial curve
# simsnegbin<-as.data.frame(rnbinom(3000,size=4, mu=38))
# colnames(simsnegbin)<-("simulated_valuesnb")
# 
# #fit negative binomial curve with alternative parameterization
# minus.logL.nb.p<-function(pars, dat){
#   p<-pars[1]
#   size<-pars[2]
#   -sum(dnbinom(dat, prob=p, size=size,log=TRUE))}
# 
# mlenbp<-optim(par=c(prob=0.3,size=5),fn=minus.logL.nb.p, method="BFGS",hessian=T,dat=survey.responses.baitusers$fish.per.trip)
# mlenbp
# simsnegbinp<-as.data.frame(rnbinom(700, size=4, p=0.1))
# colnames(simsnegbinp)<-("simulated_valuesnbp")
# 
# #fit weibull distribution
# minus.logL.weib<-function(pars, dat){
#   shape<-pars[1]
#   scale<-pars[2]
#   -sum(dweibull(dat, shape=shape, scale=scale,log=TRUE))}
# 
# mleweib<-optim(par=c(shape=30,scale=1),fn=minus.logL.weib, method="BFGS",hessian=T,dat=survey.responses.baitusers$fish.per.trip)
# mleweib
# 
# simsweibs<-as.data.frame(rweibull(10000,shape=2,scale=43))
# colnames(simsweibs)<-("simulated_valuesweib")
# 
# ggplot(simsweibs)+aes(simulated_valuesweib)+geom_histogram(aes(y=..density..))+geom_density(data=survey.responses.baitusers,aes(fish.per.trip))
# ggplot(survey.responses.baitusers)+geom_density
# 
# ggplot(simsweibs)+aes(simulated_valuesweib)+geom_density(alpha=0.2, fill="blue")+geom_histogram(data=survey.responses.baitusers,aes(x=fish.per.trip,y=..density..,alpha=0.7,fill="blue"))
# 
# #Note: spike at 34 is probably an artifact of people estimating since it's 2 dozen + 10.
# #graph both
# graph<-ggplot(survey.responses.baitusers)+aes(fish.per.trip)+geom_histogram()+geom_smooth(data=simspois, aes(simulated_values), stat = "count",color="red")+geom_smooth(data=simsnegbin, aes(simulated_valuesnb), stat="count", color="blue")
# graph


#Plot number of trips per year and whether or not a person released 
#make a column for binary yes/no release or not
#survey.responses.baitusers$Q16_1_release<-ifelse(survey.responses.baitusers$disposal_method=="Release in water","yes","no")
#trips.plot<-ggplot(survey.responses.baitusers)+
#   aes(x=trips.per.year, fill=Q16_1_release) + 
#   geom_histogram(bins = 100) + stat_bin(bins = 50) + 
#   ggtitle("Number of trips per angler per year") +
#   xlab("Number of trips")+
#   theme_bw()+
#   line(x=27,)
#   theme(text = element_text(size=24), axis.text.x = element_text(size=24), axis.text.y = element_text(size=24))+
#   labs(fill="Release")
# trips.plot
mean(survey.responses.baitusers$trips.per.year)
  
##what is the relationship between trips per year and release probability?
#create new column of ones and zeros for release
survey.responses.baitusers$releasebinom<-ifelse(survey.responses.baitusers$Q16_1_release=="yes",1,0)

summary(glm(data=survey.responses.baitusers,Q16_1_release~trips.per.year, family="binomial")) #relationship between trips per year and release probability for bait users
summary(lm(data=survey.responses,trips.per.year~live.bait)) #relationship between trips per year and use of live bait 
summary(lm(data=survey.responses,trips.per.year~Q16_1_release))
#Within bait users it seems as though there is no significant effect of trips per year 
# (a measure of avidity) on release probability, however bait users take more trips per year on 
# average than non bait users.

#fit exponential distribution to the number of trips for all anglers
minus.logL.exp<-function(rate, dat){
  -sum(dexp(dat,rate=rate, log=TRUE))}

mlexp<-optim(par=1,fn=minus.logL.exp, method="BFGS",hessian=T,dat=survey.responses$trips.per.year)
mlexp
#estimated rate parameter using the above is 0.041

#using fitdistrplus instead of optim
library(fitdistrplus)
fitdist(survey.responses$trips.per.year,"exp",method="mle")
#This is also estimating a "rate" of 0.041

#Simulating exponentially distributed data
simsexp<-as.data.frame(rexp(10000,rate=mlexp$par))
colnames(simsexp)<-("simulated_valuesexp")

ggplot(simsexp)+aes(simulated_valuesexp)+geom_density(alpha=0.2, fill="blue")+geom_histogram(data=survey.responses,aes(x=trips.per.year,y=..density..,alpha=0.7,fill="blue"))

#k-s test for differences between simulated and actual data:
ks.test(simsexp,survey.responses.baitusers$trips.per.year)
#KS test finds p=0.01 indicating significant differences between empirical and theoretical distribution.

##Exponential distribution with rate=0.041 is indicated by both optim and fit distr functions but it's not great.
#large spike at number of trips=5. Assuming this is an artifact of angler recall bias and ignoring; proceeding with exponential fit for this distrubiont

#However, there are significant differences in number of trips per year between bait users and non-bait users as well: 

#fit exponential distribution to the number of trips for JUST BAIT USERS

mlexpb<-optim(par=1,fn=minus.logL.exp, method="BFGS",hessian=T,dat=survey.responses.baitusers$trips.per.year)
mlexpb
#rate estimate=0.035

#using fitdistrplus instead of optim
fitdist(survey.responses.baitusers$trips.per.year,"exp",method="mle")
#This is also estimating a "rate" of 0.035

#Simulating exponentially distributed data
simsexpb<-as.data.frame(rexp(10000,rate=mlexpb$par))
colnames(simsexpb)<-("simulated_valuesexpb")

ggplot(simsexpb)+aes(simulated_valuesexpb)+geom_density(alpha=0.2, fill="blue")+geom_histogram(data=survey.responses.baitusers,aes(x=trips.per.year,y=..density..,alpha=0.7,fill="blue"))

#Poisson?
fitpois<-fitdist(survey.responses.baitusers$trips.per.year,"pois",method = "mle")

#Simulating poisson distributed data
simspoisb<-as.data.frame(rpois(10000,lambda = 27.95))
colnames(simspoisb)<-("simulated_valuespoisb")

ggplot(simspoisb)+aes(simulated_valuespoisb)+geom_density(alpha=0.2, fill="blue")+geom_histogram(data=survey.responses.baitusers,aes(x=trips.per.year,y=..density..,alpha=0.7,fill="blue"))

ks.test(simspoisb,survey.responses.baitusers$trips.per.year) #horrible fit
ks.test(simsexpb,survey.responses.baitusers$trips.per.year) #good fit

#KS test indicates good fit for the exponential distribution with rate 0.035, and very pooer fit for poisson
# to empirical distribution of trips per year among bait users

###Map percentage of live bait release in different zones/regions
survey.responses %>% group_by(purchase_fisha,released) %>%
summarize(n=n())

survey.responses <-  rename(survey.responses, Method_Release="Q16_1")
survey.responses <- rename(survey.responses,Method_Land="Q16_2") %>%
  rename(Method_Friend="Q16_3") %>%
  rename(Method_Other="Q16_4") %>%
  rename(Method_NoLeftover="Q16_5") 

#make long
survey.responses.long<-gather(survey.responses,"Disposal_Method", "Choice", Method_Release, Method_Land, Method_Friend, Method_Other, Method_NoLeftover) 
survey.responses.long$Choice <- replace_na(survey.responses.long$Choice, "no")

survey.responses.long <-select(survey.responses,Choice=="Released into the water")  #remove "no" choices to include in graph

  # Creating a data table of disposal method by fishing
library(data.table)
#mike <- data.table(postcard.data.long) 
#mikeb <- data.table(postcard.data)
responses.table<-data.table(survey.responses)
responses.table[ , .N, by=.(purchase_zonea,released)]
mikeb[, .N, by = .(Location)] #gives number of respondents by location
mike[ , .N, by = .(Location, Disposal_Method)] #gives number of respondents by disposal method and by location

mikec <- merge(mike[ , .N, by = .(Location, Disposal_Method)],mikeb[, .N, by = .(Location)], by = "Location") #Creates one dataframe with number of respondents by disposal method per location as well as total number of respondents per location
mikec[,prop := N.x/N.y , ]


#See "C:\Users\thom4412\Box\1824444_Thompson\merging data.R" for code for creating the merged file

###Scenario analysis: statewide outbreak of VHSV in FHM and em and spt shiners

#Create variable to randomly assign whether an angler bought susc shiners
x<-c(0,1)
susc<-c()
for (i in 1:length(survey.responses.baitusers$OSH.yn)){
  if(survey.responses.baitusers$OSH.yn[i]==1){
    susc[i] = sample(x,size=1,prob=c(0.16,0.84),replace=TRUE)
  }
  else if(survey.responses.baitusers$OSH.yn[i]==0){susc[i]=0}
}

#Add new variable column to dataframe
survey.responses.baitusers$use.OSH.susc<-susc

#Create variables to identify whether an angler used FHM or susc shiners and how many of those species they bought
for (i in 1:length(survey.responses.baitusers$OSH.yn)){
  #use susc?
  survey.responses.baitusers$use.VHSV.susc[i]<-ifelse(survey.responses.baitusers$use.OSH.susc[i]==1|survey.responses.baitusers$FHM.yn[i]==1,1,0) #create variable for using susceptible if either FHM or susc OSH or both are used
  
  #number of susc?
  survey.responses.baitusers$num.VHSV.susc[i]<-survey.responses.baitusers$fatheadminnows[i] + #number of fathead minnows +
  ifelse(survey.responses.baitusers$use.OSH.susc[i]==1,survey.responses.baitusers$othershiners[i],0) #number of other shiners, if they're susceptible
}

survey.responses.baitusers %>% group_by(OSH.yn) %>% summarize(n=n())
survey.responses.baitusers %>% group_by(FHM.yn) %>% summarize(n=n())
survey.responses.baitusers %>% group_by(use.OSH.susc) %>% summarize(n=n(),mean=mean(othershiners,na.rm = TRUE))
survey.responses.baitusers %>% group_by(use.VHSV.susc) %>% summarize(n=n()) #82% of bait users use either FHM or susc OSH

#Fit distribution to number of vhsv susc fishes (including fatheads) purchased
VHSVusers<-filter(survey.responses.baitusers,num.VHSV.susc>0)
fitdistrplus::fitdist(VHSVusers$num.VHSV.susc,dexp,"mle")
mean(VHSVusers$num.VHSV.susc)
max(VHSVusers$num.VHSV.susc)
mean(rexp(1000,rate=0.0275))

#Fit distribution to number of other shiners purchased
OSHusers<-filter(survey.responses.baitusers,use.OSH.susc==1)
OSHusers<-filter(OSHusers,othershiners>0) #remove rows where number of fish was coerced from "x" to NA to estimate numbers 

fitdistrplus::fitdist(OSHusers$othershiners,dexp,"mle")
max(OSHusers$othershiners)
mean(OSHusers$othershiners)

###Scenario analysis: outbreak of VHSV in Lake Superior Watershed (zone 3)
#create dataframe that only includes anglers who purchased or harvested bait in zone 3
localized.VHSV.Outbreak<-subset(survey.responses.baitusers, purchase_zonea==3|purchase_zoneb==3|purchase_zonec==3|harvest_zonea==3|harvest_zonea==3|harvest_zoneb==3|harvest_zonec==3)

#How many bait users get bait from this area? 42/480 bait users=8.75% of bait users get bait from this watershed
length(localized.VHSV.Outbreak$PrimaryID)

#What's the probability of an angler in this area using "other shiners"/VHSV susc. fish?
localized.VHSV.Outbreak %>% group_by(OSH.yn,live.bait) %>% summarize(n=n(), mean=mean(othershiners,na.rm = TRUE))
#10/42=23.8% anglers used other shiners from this area; 1 harvested 9 purchased

#Compared to other regions, seems like there's some that are more and some that are less: 
survey.responses.baitusers %>% group_by(purchase_zonea,OSH.yn) %>% summarize(n=n(), mean=mean(othershiners,na.rm = TRUE))

#What is the probability of an angler in this area using n other shiners,given that they use other shiners? 
OSH.users.zone3<-subset(localized.VHSV.Outbreak,OSH.yn==1) #people who use shiners in zone 3
hist(OSHusers$othershiners)
hist(OSH.users.zone3$othershiners)

mean(OSH.users.zone3$othershiners, na.rm = TRUE)
mean(OSH.users$othershiners,na.rm = TRUE)
t.test(OSH.users$othershiners,OSH.users.zone3$othershiners)
#t.test shows no sig. difference in average number of fish purchased 

#Use OSH-buying info from all anglers (parameterized in @risk by jan) b/c no sig differences apparently and there's not enough data w/in this subset to fit a distribution

#How many trips do anglers in this zone take?
mean(OSH.users.zone3$trips.per.year,na.rm = TRUE)
t.test(survey.responses.baitusers$trips.per.year,localized.VHSV.Outbreak$trips.per.year)
ks.test(survey.responses.baitusers$trips.per.year,localized.VHSV.Outbreak$trips.per.year)
#no sig difference for OSH users or anglers in this zone, so going to use same distribution as general pop.

#What's the probability of an angler in this zone releasing leftover live baitfish?
localized.VHSV.Outbreak %>% group_by(Q16_1_release) %>% summarize(n=n())
#approx. 16%...is this signicantly lower than for general state anglers? NO

#where do anglers who buy fish in this zone go?
localized.VHSV.Outbreak %>% group_by(fish_zone1) %>% summarize(n=n())
localized.VHSV.Outbreak %>% group_by(fish_zone2) %>% summarize(n=n())
localized.VHSV.Outbreak %>% group_by(fish_zone3) %>% summarize(n=n())
localized.VHSV.Outbreak %>% group_by(fish_zone4) %>% summarize(n=n())
localized.VHSV.Outbreak %>% group_by(fish_zone5) %>% summarize(n=n())
localized.VHSV.Outbreak %>% group_by(fish_zone6) %>% summarize(n=n())
localized.VHSV.Outbreak %>% group_by(fish_zone7) %>% summarize(n=n())
localized.VHSV.Outbreak %>% group_by(fish_zone8) %>% summarize(n=n())
localized.VHSV.Outbreak %>% group_by(fish_zone9) %>% summarize(n=n())
localized.VHSV.Outbreak %>% group_by(fish_zone10) %>% summarize(n=n())
localized.VHSV.Outbreak %>% group_by(fish_zone11) %>% summarize(n=n())
localized.VHSV.Outbreak %>% group_by(fish_zone12) %>% summarize(n=n())

#Wrote data into spreadsheetusing the following code on 7/21/20: 
write.csv(survey.responses.baitusers$num.VHSV.susc, file="numberVHSSuscFish_state.csv", )
write.csv(localized.VHSV.Outbreak$num.VHSV.susc, file="numberVHSSuscFish_local.csv", )


#Check for crossing watershed boundary: 
survey.responses.baitusers$cross_zonea<-NA
survey.responses.baitusers$cross_zoneb<-NA
survey.responses.baitusers$cross_zonec<-NA


for (i in 1:length(survey.responses.baitusers$PrimaryID)){
  survey.responses.baitusers$cross_zonea[i]<-ifelse(survey.responses.baitusers$purchase_zonea[i]==survey.responses.baitusers$purchase_fisha[i],0,1)
  survey.responses.baitusers$cross_zoneb[i]<-ifelse(survey.responses.baitusers$purchase_zoneb[i]==survey.responses.baitusers$purchase_fishb[i],0,1)
  survey.responses.baitusers$cross_zonec[i]<-ifelse(survey.responses.baitusers$purchase_zonec[i]==survey.responses.baitusers$purchase_fishc[i],0,1)
  
}

#sum to find total: 
sum(survey.responses.baitusers$cross_zonea,na.rm = TRUE)

#How often did bait come in from out of state? 
occurrences<-table(unlist(survey.responses.baitusers))
occurrences["Outside MN"]

#code for writing csvs
write.csv(survey.responses, file="allresponses.csv")
write.csv(survey.responses.baitusers,file="baitusers.csv")