# MODEL: Family richness of Araneae with Poisson distribution

# INPUT:  - Arthropod ID and measuring data: combined_id_meas_manually_cleaned.csv
#         - Vegetation data: all_veg_cc.csv

# OUTPUT: - Table of model step 1 results with covariate estimates, 95% confidence 
#           intervals, p-values, marginal R2 values, and conditional R2 values
#         - Table of model step 2 results with covariate estimates, 95% confidence 
#           intervals, p-values, marginal R2 values, and conditional R2 values


# Load libraries
library(dplyr)
library(tidyr)
library(forcats)
library(ggplot2)
library(tibble)
library(MASS)
library(r2glmm)
library(nlme)
library(MuMIn)

# Read in arthropod and vegetation data
dat = read.csv("data/20230303_combined_id_meas_manually_cleaned.csv")  # arthropods
veg = read.csv("data/20190610_all_veg_cc.csv")  # vegetation

################# Calculate arthropod biomass ##################################

##### Functions: biomass by order - Straus & Aviles 2018)

# Coleoptera
bio.col <- function(x) {
  mg.col = 10^(-1.960 + 2.966*(log10(x)))
  return(mg.col)
}

# Orthoptera
bio.ort <- function(x) {
  mg.ort = 10^(-1.928 + 2.680*(log10(x)))
  return(mg.ort)
}

# Hymenoptera
bio.hym <- function(x) {
  mg.hym = 10^(-0.611 + 1.141*(log10(x)))
  return(mg.hym)
}

# Lepidoptera
bio.lep <- function(x) {
  mg.lep = 10^(-3.169 + 3.624*(log10(x)))
  return(mg.lep)
}

# Hemiptera
bio.hem <- function(x) {
  mg.hem = 10^(-1.749 + 2.529*(log10(x)))
  return(mg.hem)
}

# Diptera
bio.dip <- function(x) {
  mg.dip = 10^(-1.361 + 2.026*(log10(x)))
  return(mg.dip)
}

# Araneae
bio.ara <- function(x) {
  mg.ara = 10^(-2.643 + 4.011*(log10(x)))
  return(mg.ara)
}

# All groups
bio.all <- function(x) {
  mg.all = 10^(-1.412 + 2.085*(log10(x)))
  return(mg.all)
}

# Test functions
y = 20
bio.ort(y)
bio.col(y)
bio.hym(y)
bio.lep(y)
bio.hem(y)
bio.dip(y)
bio.ara(y)
bio.all(y)

# Make all columns character
dat[] = lapply(dat, as.character)

# Adjust data types
dat = dat %>%
  mutate_at(c("key",
              "code",
              "year",
              "site", 
              "timing",
              "transect",
              "distance",
              "ID",
              "ID_family",
              "ID_order"), 
            as.factor) %>% # columns to be factors
  mutate_at(c("mm_1",
              "mm_2",
              "mm_3",
              "mm_4",
              "mm_5",
              "mm_6",
              "mm_7",
              "mm_8",
              "mm_9",
              "mm_10",
              "ct_1",
              "ct_2",
              "ct_3",
              "ct_4",
              "ct_5",
              "ct_6",
              "ct_7",
              "ct_8",
              "ct_9",
              "ct_10"), 
            as.numeric) # Columns to be numeric. Warnings: "." changed to NA.

# Make family & order "." = NA
dat$ID_family[dat$ID_family == "."] = NA
dat$ID_order[dat$ID_order == "."] = NA

# Biomass Coleoptera
dat$bio_1= ifelse(dat$ID_order == "Coleoptera", bio.col(dat$mm_1), NA)
dat$bio_2= ifelse(dat$ID_order == "Coleoptera", bio.col(dat$mm_2), NA)
dat$bio_3= ifelse(dat$ID_order == "Coleoptera", bio.col(dat$mm_3), NA)
dat$bio_4= ifelse(dat$ID_order == "Coleoptera", bio.col(dat$mm_4), NA)
dat$bio_5= ifelse(dat$ID_order == "Coleoptera", bio.col(dat$mm_5), NA)
dat$bio_6= ifelse(dat$ID_order == "Coleoptera", bio.col(dat$mm_6), NA)
dat$bio_7= ifelse(dat$ID_order == "Coleoptera", bio.col(dat$mm_7), NA)
dat$bio_8= ifelse(dat$ID_order == "Coleoptera", bio.col(dat$mm_8), NA)
dat$bio_9= ifelse(dat$ID_order == "Coleoptera", bio.col(dat$mm_9), NA)
dat$bio_10= ifelse(dat$ID_order == "Coleoptera", bio.col(dat$mm_10), NA)

# Biomass Orthoptera
dat$bio_1= ifelse(dat$ID_order == "Orthoptera", bio.ort(dat$mm_1), dat$bio_1)
dat$bio_2= ifelse(dat$ID_order == "Orthoptera", bio.ort(dat$mm_2), dat$bio_2)
dat$bio_3= ifelse(dat$ID_order == "Orthoptera", bio.ort(dat$mm_3), dat$bio_3)
dat$bio_4= ifelse(dat$ID_order == "Orthoptera", bio.ort(dat$mm_4), dat$bio_4)
dat$bio_5= ifelse(dat$ID_order == "Orthoptera", bio.ort(dat$mm_5), dat$bio_5)
dat$bio_6= ifelse(dat$ID_order == "Orthoptera", bio.ort(dat$mm_6), dat$bio_6)
dat$bio_7= ifelse(dat$ID_order == "Orthoptera", bio.ort(dat$mm_7), dat$bio_7)
dat$bio_8= ifelse(dat$ID_order == "Orthoptera", bio.ort(dat$mm_8), dat$bio_8)
dat$bio_9= ifelse(dat$ID_order == "Orthoptera", bio.ort(dat$mm_9), dat$bio_9)
dat$bio_10= ifelse(dat$ID_order == "Orthoptera", bio.ort(dat$mm_10), dat$bio_10)

# Biomass Hymenoptera
dat$bio_1= ifelse(dat$ID_order == "Hymenoptera", bio.hym(dat$mm_1), dat$bio_1)
dat$bio_2= ifelse(dat$ID_order == "Hymenoptera", bio.hym(dat$mm_2), dat$bio_2)
dat$bio_3= ifelse(dat$ID_order == "Hymenoptera", bio.hym(dat$mm_3), dat$bio_3)
dat$bio_4= ifelse(dat$ID_order == "Hymenoptera", bio.hym(dat$mm_4), dat$bio_4)
dat$bio_5= ifelse(dat$ID_order == "Hymenoptera", bio.hym(dat$mm_5), dat$bio_5)
dat$bio_6= ifelse(dat$ID_order == "Hymenoptera", bio.hym(dat$mm_6), dat$bio_6)
dat$bio_7= ifelse(dat$ID_order == "Hymenoptera", bio.hym(dat$mm_7), dat$bio_7)
dat$bio_8= ifelse(dat$ID_order == "Hymenoptera", bio.hym(dat$mm_8), dat$bio_8)
dat$bio_9= ifelse(dat$ID_order == "Hymenoptera", bio.hym(dat$mm_9), dat$bio_9)
dat$bio_10= ifelse(dat$ID_order == "Hymenoptera", bio.hym(dat$mm_10), dat$bio_10)

# Biomass Lepidoptera
dat$bio_1= ifelse(dat$ID_order == "Lepidoptera", bio.lep(dat$mm_1), dat$bio_1)
dat$bio_2= ifelse(dat$ID_order == "Lepidoptera", bio.lep(dat$mm_2), dat$bio_2)
dat$bio_3= ifelse(dat$ID_order == "Lepidoptera", bio.lep(dat$mm_3), dat$bio_3)
dat$bio_4= ifelse(dat$ID_order == "Lepidoptera", bio.lep(dat$mm_4), dat$bio_4)
dat$bio_5= ifelse(dat$ID_order == "Lepidoptera", bio.lep(dat$mm_5), dat$bio_5)
dat$bio_6= ifelse(dat$ID_order == "Lepidoptera", bio.lep(dat$mm_6), dat$bio_6)
dat$bio_7= ifelse(dat$ID_order == "Lepidoptera", bio.lep(dat$mm_7), dat$bio_7)
dat$bio_8= ifelse(dat$ID_order == "Lepidoptera", bio.lep(dat$mm_8), dat$bio_8)
dat$bio_9= ifelse(dat$ID_order == "Lepidoptera", bio.lep(dat$mm_9), dat$bio_9)
dat$bio_10= ifelse(dat$ID_order == "Lepidoptera", bio.lep(dat$mm_10), dat$bio_10)

# Biomass Hemiptera
dat$bio_1= ifelse(dat$ID_order == "Hemiptera", bio.hem(dat$mm_1), dat$bio_1)
dat$bio_2= ifelse(dat$ID_order == "Hemiptera", bio.hem(dat$mm_2), dat$bio_2)
dat$bio_3= ifelse(dat$ID_order == "Hemiptera", bio.hem(dat$mm_3), dat$bio_3)
dat$bio_4= ifelse(dat$ID_order == "Hemiptera", bio.hem(dat$mm_4), dat$bio_4)
dat$bio_5= ifelse(dat$ID_order == "Hemiptera", bio.hem(dat$mm_5), dat$bio_5)
dat$bio_6= ifelse(dat$ID_order == "Hemiptera", bio.hem(dat$mm_6), dat$bio_6)
dat$bio_7= ifelse(dat$ID_order == "Hemiptera", bio.hem(dat$mm_7), dat$bio_7)
dat$bio_8= ifelse(dat$ID_order == "Hemiptera", bio.hem(dat$mm_8), dat$bio_8)
dat$bio_9= ifelse(dat$ID_order == "Hemiptera", bio.hem(dat$mm_9), dat$bio_9)
dat$bio_10= ifelse(dat$ID_order == "Hemiptera", bio.hem(dat$mm_10), dat$bio_10)

# Biomass Diptera
dat$bio_1= ifelse(dat$ID_order == "Diptera", bio.dip(dat$mm_1), dat$bio_1)
dat$bio_2= ifelse(dat$ID_order == "Diptera", bio.dip(dat$mm_2), dat$bio_2)
dat$bio_3= ifelse(dat$ID_order == "Diptera", bio.dip(dat$mm_3), dat$bio_3)
dat$bio_4= ifelse(dat$ID_order == "Diptera", bio.dip(dat$mm_4), dat$bio_4)
dat$bio_5= ifelse(dat$ID_order == "Diptera", bio.dip(dat$mm_5), dat$bio_5)
dat$bio_6= ifelse(dat$ID_order == "Diptera", bio.dip(dat$mm_6), dat$bio_6)
dat$bio_7= ifelse(dat$ID_order == "Diptera", bio.dip(dat$mm_7), dat$bio_7)
dat$bio_8= ifelse(dat$ID_order == "Diptera", bio.dip(dat$mm_8), dat$bio_8)
dat$bio_9= ifelse(dat$ID_order == "Diptera", bio.dip(dat$mm_9), dat$bio_9)
dat$bio_10= ifelse(dat$ID_order == "Diptera", bio.dip(dat$mm_10), dat$bio_10)

# Biomass Araneae
dat$bio_1= ifelse(dat$ID_order == "Araneae", bio.ara(dat$mm_1), dat$bio_1)
dat$bio_2= ifelse(dat$ID_order == "Araneae", bio.ara(dat$mm_2), dat$bio_2)
dat$bio_3= ifelse(dat$ID_order == "Araneae", bio.ara(dat$mm_3), dat$bio_3)
dat$bio_4= ifelse(dat$ID_order == "Araneae", bio.ara(dat$mm_4), dat$bio_4)
dat$bio_5= ifelse(dat$ID_order == "Araneae", bio.ara(dat$mm_5), dat$bio_5)
dat$bio_6= ifelse(dat$ID_order == "Araneae", bio.ara(dat$mm_6), dat$bio_6)
dat$bio_7= ifelse(dat$ID_order == "Araneae", bio.ara(dat$mm_7), dat$bio_7)
dat$bio_8= ifelse(dat$ID_order == "Araneae", bio.ara(dat$mm_8), dat$bio_8)
dat$bio_9= ifelse(dat$ID_order == "Araneae", bio.ara(dat$mm_9), dat$bio_9)
dat$bio_10= ifelse(dat$ID_order == "Araneae", bio.ara(dat$mm_10), dat$bio_10)

# Biomass other: for orders other than Coleoptera, Orthoptera, Hymenoptera,
# Lepidoptera, Hemiptera, Diptera, or Araneae
dat$bio_1= ifelse(dat$ID_order != "Coleoptera" & dat$ID_order != "Orthoptera" & dat$ID_order != "Hymenoptera" & dat$ID_order != "Lepidoptera" & dat$ID_order != "Hemiptera" & dat$ID_order != "Diptera" & dat$ID_order != "Araneae", bio.all(dat$mm_1), dat$bio_1)
dat$bio_2= ifelse(dat$ID_order != "Coleoptera" & dat$ID_order != "Orthoptera" & dat$ID_order != "Hymenoptera" & dat$ID_order != "Lepidoptera" & dat$ID_order != "Hemiptera" & dat$ID_order != "Diptera" & dat$ID_order != "Araneae", bio.all(dat$mm_2), dat$bio_2)
dat$bio_3= ifelse(dat$ID_order != "Coleoptera" & dat$ID_order != "Orthoptera" & dat$ID_order != "Hymenoptera" & dat$ID_order != "Lepidoptera" & dat$ID_order != "Hemiptera" & dat$ID_order != "Diptera" & dat$ID_order != "Araneae", bio.all(dat$mm_3), dat$bio_3)
dat$bio_4= ifelse(dat$ID_order != "Coleoptera" & dat$ID_order != "Orthoptera" & dat$ID_order != "Hymenoptera" & dat$ID_order != "Lepidoptera" & dat$ID_order != "Hemiptera" & dat$ID_order != "Diptera" & dat$ID_order != "Araneae", bio.all(dat$mm_4), dat$bio_4)
dat$bio_5= ifelse(dat$ID_order != "Coleoptera" & dat$ID_order != "Orthoptera" & dat$ID_order != "Hymenoptera" & dat$ID_order != "Lepidoptera" & dat$ID_order != "Hemiptera" & dat$ID_order != "Diptera" & dat$ID_order != "Araneae", bio.all(dat$mm_5), dat$bio_5)
dat$bio_6= ifelse(dat$ID_order != "Coleoptera" & dat$ID_order != "Orthoptera" & dat$ID_order != "Hymenoptera" & dat$ID_order != "Lepidoptera" & dat$ID_order != "Hemiptera" & dat$ID_order != "Diptera" & dat$ID_order != "Araneae", bio.all(dat$mm_6), dat$bio_6)
dat$bio_7= ifelse(dat$ID_order != "Coleoptera" & dat$ID_order != "Orthoptera" & dat$ID_order != "Hymenoptera" & dat$ID_order != "Lepidoptera" & dat$ID_order != "Hemiptera" & dat$ID_order != "Diptera" & dat$ID_order != "Araneae", bio.all(dat$mm_7), dat$bio_7)
dat$bio_8= ifelse(dat$ID_order != "Coleoptera" & dat$ID_order != "Orthoptera" & dat$ID_order != "Hymenoptera" & dat$ID_order != "Lepidoptera" & dat$ID_order != "Hemiptera" & dat$ID_order != "Diptera" & dat$ID_order != "Araneae", bio.all(dat$mm_8), dat$bio_8)
dat$bio_9= ifelse(dat$ID_order != "Coleoptera" & dat$ID_order != "Orthoptera" & dat$ID_order != "Hymenoptera" & dat$ID_order != "Lepidoptera" & dat$ID_order != "Hemiptera" & dat$ID_order != "Diptera" & dat$ID_order != "Araneae", bio.all(dat$mm_9), dat$bio_9)
dat$bio_10= ifelse(dat$ID_order != "Coleoptera" & dat$ID_order != "Orthoptera" & dat$ID_order != "Hymenoptera" & dat$ID_order != "Lepidoptera" & dat$ID_order != "Hemiptera" & dat$ID_order != "Diptera" & dat$ID_order != "Araneae", bio.all(dat$mm_10), dat$bio_10)

# Biomass other: for order = Unknown
dat$bio_1= ifelse(dat$ID_order == "Unknown", bio.all(dat$mm_1), dat$bio_1)
dat$bio_2= ifelse(dat$ID_order == "Unknown", bio.all(dat$mm_2), dat$bio_2)
dat$bio_3= ifelse(dat$ID_order == "Unknown", bio.all(dat$mm_3), dat$bio_3)
dat$bio_4= ifelse(dat$ID_order == "Unknown", bio.all(dat$mm_4), dat$bio_4)
dat$bio_5= ifelse(dat$ID_order == "Unknown", bio.all(dat$mm_5), dat$bio_5)
dat$bio_6= ifelse(dat$ID_order == "Unknown", bio.all(dat$mm_6), dat$bio_6)
dat$bio_7= ifelse(dat$ID_order == "Unknown", bio.all(dat$mm_7), dat$bio_7)
dat$bio_8= ifelse(dat$ID_order == "Unknown", bio.all(dat$mm_8), dat$bio_8)
dat$bio_9= ifelse(dat$ID_order == "Unknown", bio.all(dat$mm_9), dat$bio_9)
dat$bio_10= ifelse(dat$ID_order == "Unknown", bio.all(dat$mm_10), dat$bio_10)

# Biomass other: for order = NA
dat$bio_1= ifelse(is.na(dat$ID_order), bio.all(dat$mm_1), dat$bio_1)
dat$bio_2= ifelse(is.na(dat$ID_order), bio.all(dat$mm_2), dat$bio_2)
dat$bio_3= ifelse(is.na(dat$ID_order), bio.all(dat$mm_3), dat$bio_3)
dat$bio_4= ifelse(is.na(dat$ID_order), bio.all(dat$mm_4), dat$bio_4)
dat$bio_5= ifelse(is.na(dat$ID_order), bio.all(dat$mm_5), dat$bio_5)
dat$bio_6= ifelse(is.na(dat$ID_order), bio.all(dat$mm_6), dat$bio_6)
dat$bio_7= ifelse(is.na(dat$ID_order), bio.all(dat$mm_7), dat$bio_7)
dat$bio_8= ifelse(is.na(dat$ID_order), bio.all(dat$mm_8), dat$bio_8)
dat$bio_9= ifelse(is.na(dat$ID_order), bio.all(dat$mm_9), dat$bio_9)
dat$bio_10= ifelse(is.na(dat$ID_order), bio.all(dat$mm_10), dat$bio_10)

# Make columns numeric
dat = dat %>%
  mutate_at(c("bio_1",
              "bio_2",
              "bio_3",
              "bio_4",
              "bio_5",
              "bio_6",
              "bio_7",
              "bio_8",
              "bio_9",
              "bio_10"), 
            as.numeric)

# Multiply each biomass by count into new column mbio_#
dat = dat %>% 
  mutate(mbio_1 = bio_1*ct_1,
         mbio_2 = bio_2*ct_2,
         mbio_3 = bio_3*ct_3,
         mbio_4 = bio_4*ct_4,
         mbio_5 = bio_5*ct_5,
         mbio_6 = bio_6*ct_6,
         mbio_7 = bio_7*ct_7,
         mbio_8 = bio_8*ct_8,
         mbio_9 = bio_9*ct_9,
         mbio_10 = bio_10*ct_10)

# New column mbio_sum: sum of mbio_1-10
dat = dat %>% 
  mutate(mbio_sum = rowSums(dat[,c("mbio_1", "mbio_2", "mbio_3", "mbio_4", "mbio_5", "mbio_6", "mbio_7", "mbio_8", "mbio_9", "mbio_10")], 
                            na.rm = TRUE)) # exclude NAs

# New column count_sum: sum of ct_1-10
dat = dat %>% 
  mutate(count_sum = rowSums(dat[,c("ct_1", "ct_2", "ct_3", "ct_4", "ct_5", "ct_6", "ct_7", "ct_8", "ct_9", "ct_10")], 
                            na.rm = TRUE)) # exclude NAs
# Ideally, column "count_sum" should = column "count"

######################## Count number of vials in dataset #################################

dat %>% 
  distinct(site,timing,transect,distance,ID) %>% 
  tally() # 7640 vials

######################### Count vials with no calcs #########################################

# Count vials with no mbio_sum measurement
dat %>% 
  filter(mbio_sum == 0) %>% 
  tally() # 3 vials

# Count vials with no count_sum measurement
dat %>% 
  filter(count_sum == 0) %>% 
  tally () # 3 vials

##################### Summarize data by vials #############################################

# Each vial own row
vialsum = dat %>% 
  group_by(site, timing, transect, distance, ID, ID_family, ID_order, ID_notes) %>% 
  summarize(totalbio = sum(mbio_sum, na.rm = TRUE),
            totalct = sum(count_sum, na.rm = TRUE))

# Add column of site type (control/treatment)
vialsum$type = ifelse(vialsum$site == "HL", "Treatment", "Control")
vialsum$type = ifelse(vialsum$site == "LM", "Treatment", vialsum$type)
vialsum$type = ifelse(vialsum$site == "MS", "Treatment", vialsum$type)
vialsum$type = ifelse(vialsum$site == "SM", "Treatment", vialsum$type)
vialsum$type = ifelse(vialsum$site == "WW", "Treatment", vialsum$type)


############ Add Larvae/Adult to Lepidoptera ###############

# New column larva: "Y" if ID_notes contain words larva/larvae
vialsum = vialsum %>% 
  ungroup() %>% 
  mutate(larva = ifelse(grepl("larva|larvae", vialsum$ID_notes, ignore.case = TRUE), "Y", "N"))

# New column ID_order.new with new "Lepidoptera Larva" value
vialsum = vialsum %>% 
  mutate(ID_order.new = ifelse(ID_order == "Lepidoptera" & larva == "Y", "Lepidoptera Larva", as.character(ID_order)))

# Have vialsum only include new ID_order column
vialsum = vialsum %>% 
  dplyr::select(-ID_order, -ID_notes)

# Rename ID_order.new to ID_order
names(vialsum)[names(vialsum) == "ID_order.new"] <- "ID_order"

# Rename "Lepidoptera" to "Lepidoptera Adult"
vialsum$ID_order[vialsum$ID_order == 'Lepidoptera'] <- 'Lepidoptera Adult'

# Group data by site/timing/transect/distance/order/family/type - some arthropods with same family & order are split into separate vials
ins = vialsum %>% 
  group_by(site, timing, transect, distance, ID_order, ID_family, type) %>% 
  summarize(bio = sum(totalbio, na.rm = TRUE),
            ct = sum(totalct, na.rm = TRUE))


############ Correct for HL pre-spraying transects being longer than all other transects ######################

# HL pre-spraying transects were 60 m long while all other transects were 40 m long

# Multiply bio & ct by (2/3) for all HL pre-spraying samples
ins = ins %>% 
  mutate(bio = ifelse(site == "HL" & timing == "P", bio*(2/3), bio),
         ct = ifelse(site == "HL" & timing == "P", ct*(2/3), ct))


############################### Veg data ######################################

# Explore data types
sapply(veg, class)

# Make columns numeric. Need to change to character first, since these columns currently
# contain factors. Columns 40-50 (canopy cover) are already numeric.
veg[, c(19:26, 32:35)] = lapply(veg[, c(19:26, 32:35)], as.character)
veg[, c(19:26, 32:35)] = lapply(veg[, c(19:26, 32:35)], as.numeric)


# Create column vor_mean (average n/s/e/w readings)
for (i in 1:nrow(veg)){
  veg$vor_mean[i] = ((veg$vor_n[i] + veg$vor_e[i] + veg$vor_s[i] + veg$vor_w[i])/4)
}

# Create column gc.lit.p: percentage of ground cover that is litter
for (i in 1:nrow(veg)){
  veg$gc.lit.p[i] = (((veg$gc_lit[i])/30)*100)
}

# Create column gc.bg.p: percentage of ground cover that is bare ground
for (i in 1:nrow(veg)){
  veg$gc.bg.p[i] = (((veg$gc_bg[i])/30)*100)
}

# Create column gc.oth.p - percentage of ground cover that is other
for (i in 1:nrow(veg)){
  veg$gc.oth.p[i] = (((veg$gc_oth[i])/30)*100)
}

# Create column cc.live.p: percentage of canopy cover consisting of grass, forb, or woody
for (i in 1:nrow(veg)){
  veg$cc.live.p[i] = (veg$grass.p.adj[i] + veg$forb.p.adj[i] + veg$woody.p.adj[i])
}

# Create column sp_total: sum of sp_grass + sp_forb
for (i in 1:nrow(veg)){
  veg$sp_total[i] = (veg$sp_grass[i] + veg$sp_forb[i])
}

# Create new column vor_field with vor reading from direction of field
veg$vor_field = ifelse(veg$site=="FC", veg$vor_w, ".") # populate other cells with "."
veg$vor_field = ifelse(veg$site=="WF", veg$vor_s, veg$vor_field)
veg$vor_field = ifelse(veg$site=="HL", veg$vor_w, veg$vor_field)
veg$vor_field = ifelse(veg$site=="MS", veg$vor_e, veg$vor_field)
veg$vor_field = ifelse(veg$site=="SM", veg$vor_e, veg$vor_field)
veg$vor_field = ifelse(veg$site=="WW", veg$vor_w, veg$vor_field)
veg$vor_field = ifelse(veg$site=="DH", veg$vor_w, veg$vor_field)
veg$vor_field = ifelse(veg$site=="DH" & veg$transect == "T" & veg$timing == "20", veg$vor_n, veg$vor_field) # T2 transect only will use vor_n
veg$vor_field = ifelse(veg$site=="LM", veg$vor_w, veg$vor_field)
veg$vor_field = ifelse(veg$site=="RH", veg$vor_w, veg$vor_field)

# Make this column numeric
veg$vor_field = as.numeric(veg$vor_field)

# Rename columns 
names(veg)[names(veg) == 'grass.p.adj'] <- 'cc.grass.p'
names(veg)[names(veg) == 'forb.p.adj'] <- 'cc.forb.p'
names(veg)[names(veg) == 'dead.p.adj'] <- 'cc.dead.p'
names(veg)[names(veg) == 'woody.p.adj'] <- 'cc.woody.p'
names(veg)[names(veg) == 'other.p.adj'] <- 'cc.other.p'

# Remove white space in column cc_comment
veg$cc_comment = as.character(veg$cc_comment)
veg$cc_comment = trimws(veg$cc_comment, which = c("both"))

# Make cc_comment = NA and blank to "."
veg$cc_comment[is.na(veg$cc_comment)] = "."
veg$cc_comment[veg$cc_comment == ""] = "."

# Subset "veg" to include select variables only
veg.sub = veg %>% 
  filter(!(timing == "P" & plot == "R")) %>% # exclude pre-spray R plots
  # keep only C and L plots from pre-spraying(start/end of insect collection in ethanol)
  dplyr::select(site,
         timing,
         transect,
         distance,
         plot,
         lit,
         mhl,
         mhd,
         sp_grass,
         sp_forb,
         sp_total,
         gc.lit.p,
         gc.bg.p,
         gc.oth.p,
         cc.grass.p,
         cc.forb.p,
         cc.dead.p,
         cc.woody.p,
         cc.other.p,
         vor_mean,
         vor_field)

# Calculate means
veg.mean = veg.sub %>% 
  group_by(site, timing, transect, distance) %>% 
  summarize(lit = mean(lit),
            mhl = mean(mhl),
            mhd = mean(mhd),
            sp_grass = mean(sp_grass),
            sp_forb = mean(sp_forb),
            sp_total = mean(sp_total),
            gc.lit.p = mean(gc.lit.p),
            gc.bg.p = mean(gc.bg.p),
            gc.oth.p = mean(gc.oth.p),
            cc.grass.p = mean(cc.grass.p),
            cc.forb.p = mean(cc.forb.p),
            cc.dead.p = mean(cc.dead.p),
            cc.woody.p = mean(cc.woody.p),
            cc.other.p = mean(cc.other.p),
            vor_mean = mean(vor_mean),
            vor_field = mean(vor_field))

############################# Join veg and insect data ######################

# All to character
veg.mean[] = lapply(veg.mean, as.character)
ins[] = lapply(ins, as.character)

# Combine
ins.veg = ins %>% 
  left_join(veg.mean, by = c("site", "timing", "transect", "distance"))

# Create new column "year"
ins.veg$year = ifelse((ins.veg$site == "DH" | ins.veg$site == "HL" |
                         ins.veg$site == "RH" | ins.veg$site == "LM"),
                      "2017", "2018")

# Adjust data types
ins.veg = ins.veg %>%
  mutate_at(c("site", 
              "timing",
              "transect",
              "ID_order",
              "ID_family",
              "type",
              "year"), 
            as.factor) %>% # columns to be factors
  mutate_at(c("distance",
              "bio",
              "ct",
              "lit",
              "mhl",
              "mhd",
              "sp_grass",
              "sp_forb",
              "sp_total",
              "gc.lit.p",
              "gc.bg.p",
              "gc.oth.p",
              "cc.grass.p",
              "cc.forb.p",
              "cc.dead.p",
              "cc.woody.p",
              "cc.other.p",
              "vor_mean",
              "vor_field"), 
            as.numeric) # columns to be numeric

# Exclude 1 observation with timing = "."
ins.veg = ins.veg %>% 
  filter(timing != ".")

# Exclude 1 observation from HL P T 25 - did not collect on transect T pre-spraying
ins.veg = ins.veg %>% 
  filter(!(timing == "P" & transect == "T"))


###################################################################################
###################################################################################

##### Calculate total and bird food abundance/biomass and family richness #####

### Calculate total abundance and total biomass

dat.total = ins.veg %>% 
  group_by(site, timing, transect, distance) %>% 
  mutate(ct.sum.total = sum(ct),
         bio.sum.total = sum(bio)) %>% 
  distinct(site, timing, transect, distance, .keep_all = TRUE) %>% 
  dplyr::select(-ID_order, -ID_family, -bio, -ct)

### Calculate bird food abundance and biomass

dat.bf = ins.veg %>% 
  filter(ID_order == "Coleoptera" | ID_order == "Lepidoptera Larva" |  ID_order == "Orthoptera" | ID_order == "Araneae") %>% 
  group_by(site, timing, transect, distance) %>% 
  mutate(ct.sum.bf = sum(ct),
         bio.sum.bf = sum(bio)) %>% 
  distinct(site, timing, transect, distance, .keep_all = TRUE) %>% 
  dplyr::select(site, timing, transect, distance, ct.sum.bf, bio.sum.bf)
# There is 1 fewer observation in bf (296) than total (297): 1 vial didn't have any of the bf groups.
# When joined to dat.total, ct.sum.bf and bio.sum.bf will be NA for this vial (DH 20 Z 25).


##### Calculate family richness #####

### Araneae

# Calculate family richness of Araneae by vial, exclude missing family ID
dat.ara = ins.veg %>% 
  filter((ID_order == "Araneae") & ID_family != "(Missing)") %>% 
  group_by(site, timing, transect, distance) %>% 
  mutate(fam.ara = length(unique(ID_family))) %>% 
  distinct(site, timing, transect, distance, .keep_all = TRUE) %>% 
  dplyr::select(site, transect, timing, distance, fam.ara)

### Coleoptera

# Calculate family richness of Coleoptera by vial, exclude missing family ID
dat.col = ins.veg %>% 
  filter((ID_order == "Coleoptera") & ID_family != "(Missing)") %>% 
  group_by(site, timing, transect, distance) %>% 
  mutate(fam.col = length(unique(ID_family))) %>% 
  distinct(site, timing, transect, distance, .keep_all = TRUE) %>% 
  dplyr::select(site, transect, timing, distance, fam.col)

### Hemiptera

# Calculate family richness of Hemiptera by vial, exclude missing family ID
dat.hem = ins.veg %>% 
  filter((ID_order == "Hemiptera") & ID_family != "(Missing)") %>% 
  group_by(site, timing, transect, distance) %>% 
  mutate(fam.hem = length(unique(ID_family))) %>% 
  distinct(site, timing, transect, distance, .keep_all = TRUE) %>% 
  dplyr::select(site, transect, timing, distance, fam.hem)

### Orthoptera

# Calculate family richness of Orthoptera by vial, exclude missing family ID
dat.ort = ins.veg %>% 
  filter((ID_order == "Orthoptera") & ID_family != "(Missing)") %>% 
  group_by(site, timing, transect, distance) %>% 
  mutate(fam.ort = length(unique(ID_family))) %>% 
  distinct(site, timing, transect, distance, .keep_all = TRUE) %>% 
  dplyr::select(site, transect, timing, distance, fam.ort)

##### Join newly calculated columns to dat.total

dat.join = dat.total %>% 
  full_join(dat.bf, by = c("site", "timing", "transect", "distance")) %>% 
  full_join(dat.ara, by = c("site", "timing", "transect", "distance")) %>% 
  full_join(dat.col, by = c("site", "timing", "transect", "distance")) %>% 
  full_join(dat.hem, by = c("site", "timing", "transect", "distance")) %>% 
  full_join(dat.ort, by = c("site", "timing", "transect", "distance"))

##### Replace NA in bf and family richness columns with 0
dat.join <- dat.join %>% 
  mutate(ct.sum.bf = ifelse(is.na(ct.sum.bf), 0, ct.sum.bf),
         bio.sum.bf = ifelse(is.na(bio.sum.bf), 0, bio.sum.bf),
         fam.ara = ifelse(is.na(fam.ara), 0, fam.ara),
         fam.col = ifelse(is.na(fam.col), 0, fam.col),
         fam.hem = ifelse(is.na(fam.hem), 0, fam.hem),
         fam.ort = ifelse(is.na(fam.ort), 0, fam.ort))
         

#################################################################################
#################################################################################

######################### Sample sizes ##################################

# New column transect.type: primary or supplementary transect
dat.join = dat.join %>% 
  mutate(transect.type = ifelse(transect == "T", "Supplementary", "Primary"))

# Number of samples on primary and supplementary transects by site type & timing
dat.summary = dat.join %>% 
  group_by(type, timing, transect.type) %>% 
  tally()


######################## Exploration histogram ##########################

# Subset to primary transects only
dat.sub.primary <- dat.join %>% 
  filter(transect.type == 'Primary')

# Family richness: Araneae
p.fam.ara = ggplot(dat.sub.primary, aes(x=fam.ara)) + geom_histogram() + 
  geom_vline(aes(xintercept=mean(fam.ara)), color="blue", linetype="dashed", 
             linewidth=1) + ggtitle("Family richness: Araneae")

p.fam.ara 


########################## Model Step 1 ######################################


# Subset data: primary transects, pre-spraying
dat.step1 <- dat.join %>% 
  filter(transect.type == "Primary" & timing == "P")

# Sample size
samplesize.step1 = dat.step1 %>% 
  group_by(type, timing) %>% 
  tally()

##### Model

mod.step1 = glmmPQL(fam.ara ~ cc.forb.p + mhl + sp_total + gc.lit.p + year, 
                    random = (~ 1 | site/transect), family = poisson, data = dat.step1)


##### Parameter confidence intervals #####

# Summary
step1.sum = summary(mod.step1)

# Table with global model covariate confidence intervals
step1.int = intervals(mod.step1, which = "fixed") # default level = 0.95
step1.inttab = data.frame(step1.int$fixed) %>% 
  rownames_to_column(var = "covariate") %>% 
  mutate(incl0 = 0 >= lower & 0 <= upper)

# Table with global model p-values
step1.pvals = data.frame(step1.sum$tTable)  %>% 
  rownames_to_column(var = "covariate") %>% 
  dplyr::select(covariate, p.value)

# Join p-values to inttab: new table "ci.table"
step1.ci.table = step1.inttab %>% 
  left_join(step1.pvals, by = "covariate") %>% 
  rename(est = est., ci.lower = lower, ci.upper = upper) %>% # rename columns (newname = oldname)
  dplyr::select(covariate, est, ci.lower, ci.upper, incl0, p.value) # reorder columns

### Add R2 values to ci.table

# Notes: From the help page for this package (MuMIn): 

#"The delta method can be used with for 
# all distributions and link functions, while lognormal approximation and trigamma 
# function are limited to distributions with logarithmic link. Trigamma-estimate 
# is recommended whenever available."

# "As of MuMIn version 1.41.0, 
# r.squaredGLMM returns a revised statistics based on Nakagawa et al. (2017) paper. 
# The returned value's format also has changed (it is a matrix rather than a
# numeric vector as before). Pre-1.41.0 version of the function calculated the 
# “theoretical” R2GLMM for binomial models." The version used in this analysis is 
# 1.47.5, and the output included in the manuscript is the revised statistic.

r.squaredGLMM(mod.step1)

step1.ci.table = step1.ci.table %>% 
  mutate(marg_r2_trigamma = r.squaredGLMM(mod.step1)[3],
         cond_r2_trigamma = r.squaredGLMM(mod.step1)[6])


############################### Model Step 2 #################################

# Subset data: primary transects, 0 & 25 m
dat.step2 = dat.join %>% 
  filter(transect.type == "Primary") %>%
  filter (distance == 0 | distance == 25) %>% 
  arrange(factor(timing, levels = c("P", "3", "20")))

# Re-level so pre-spray (timing = P) samples are first
dat.step2 = within(dat.step2, timing <- relevel(timing, ref = "P"))

# Sample size
samplesize.step2 = dat.step2 %>% 
  group_by(type, timing) %>% 
  tally()

##### Model step 2

mod.step2 = glmmPQL(fam.ara ~ mhl + type*timing, 
                    random = (~ 1 | site/transect), family = poisson, data = dat.step2)

##### Parameter confidence intervals #####

# Summary
step2.sum = summary(mod.step2)

# Table with global model covariate confidence intervals
step2.int = intervals(mod.step2, which = "fixed") # default level = 0.95
step2.inttab = data.frame(step2.int$fixed) %>% 
  rownames_to_column(var = "covariate") %>% 
  mutate(incl0 = 0 >= lower & 0 <= upper)

# Table with global model p-values
step2.pvals = data.frame(step2.sum$tTable)  %>% 
  rownames_to_column(var = "covariate") %>% 
  dplyr::select(covariate, p.value)

# Join p-values to inttab: new table "ci.table"
step2.ci.table = step2.inttab %>% 
  left_join(step2.pvals, by = "covariate") %>% 
  rename(est = est., ci.lower = lower, ci.upper = upper) %>% # rename columns (newname = oldname)
  dplyr::select(covariate, est, ci.lower, ci.upper, incl0, p.value) # reorder columns

# Add R2 values to ci.table
# Note: From the help page for this package (MuMIn): "As of MuMIn version 1.41.0, 
# r.squaredGLMM returns a revised statistics based on Nakagawa et al. (2017) paper. 
# The returned value's format also has changed (it is a matrix rather than a
# numeric vector as before). Pre-1.41.0 version of the function calculated the 
# “theoretical” R2GLMM for binomial models." The version used in this analysis is 
# 1.47.5, and the output included in the manuscript is the revised statistic.
r.squaredGLMM(mod.step2)

step2.ci.table = step2.ci.table %>% 
  mutate(marg_r2_trigamma = r.squaredGLMM(mod.step2)[3],
         cond_r2_trigamma = r.squaredGLMM(mod.step2)[6])

# Reorder rows
step2.ci.table = step2.ci.table %>% 
  arrange(factor(covariate, levels = c('(Intercept)', 'mhl', 'typeTreatment', 
                                       'timing3', 'timing20', 'typeTreatment:timing3', 
                                       'typeTreatment:timing20')))

####################### Diagnostic plots #####################################

##### Step 1 #####

qqnorm(mod.step1, main = 'Model step 1')
plot(mod.step1, main = 'Model step 1')


##### Step 2 #####

qqnorm(mod.step2, main = 'Model step 2')
plot(mod.step2, main = 'Model step 2')

########################## Export csv ###################################

# Datestamp
datestamp=as.Date(Sys.Date())
datestamp=format(datestamp, format="%Y%m%d")


# Step 1 table with covariate estimates, confidence intervals, p-values, and r2 values
write.csv(step1.ci.table, file = paste0("output/", datestamp, "_Ch2_models_fam_ara_poisson_confint_step1.csv"),row.names = FALSE)

# Step 2 table with covariate estimates, confidence intervals, p-values, and r2 values
write.csv(step2.ci.table, file = paste0("output/", datestamp, "_Ch2_models_fam_ara_poisson_confint_step2.csv"),row.names = FALSE)