#This file runs the spOccupancy base model found in our manuscript.

#Load packages needed. Not all may be strictly necessary.
library(spOccupancy)
library(coda)
library(stars)
library(ggplot2)
library(tidyverse)
library(rjags)
library(runjags)
library(sf)
library(Matrix)

#Load our final, processed SSW data.
ssw_data = readRDS("ssw_data_final")

coords = st_coordinates(ssw_data) #Pull the X and Y coordinate data from the geometry column.
ssw_data$X = coords[,1] #Store these data as separate columns--see below.
ssw_data$Y = coords[,2]

#To set appropriate informative priors, we need to do a little spatial exploring--how close are the closest points at any given lake? How far apart are the farthest-apart points at any given lake?
min.dist = vector() #Storage objects.
max.dist = vector()

for(l in 1:length(unique(ssw_data$lake))) { #For each lake...

  #Filter to this lake's data.
  tmp1 = filter(ssw_data, ssw_data$lake == unique(ssw_data$lake)[l]) %>%
    distinct(geometry)

  tmp.dists = dist(st_coordinates(tmp1)) #Get a matrix of the distances between every two points.

  #Calculate the min distance
  min.dist = c(min(tmp.dists, na.rm = T), min.dist)

  #Calculate the max distance
  max.dist = c(max(tmp.dists, na.rm = T), max.dist)
}

#The largest minimum grid spacing in a lake is...
max.mindist = max(min.dist)

#The largest distance between two points at any lake is...
max.maxdist = max(max.dist)
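#Optional context (not part of the original workflow): under the exponential correlation model used
#below, the effective spatial range is roughly 3/phi. The phi prior bounds set later,
#3/(max.maxdist/2) and 1/max.mindist, therefore confine that range to between 3x the largest
#minimum grid spacing and half the largest within-lake distance. Units are those of the projected CRS.
implied.range.bounds = c(lower = 3 * max.mindist, upper = max.maxdist / 2)
implied.range.bounds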
#Now, remove the geometry data and arrange the data in order by point id and survey timing.
ssw_data = ssw_data %>%
  st_drop_geometry() %>%
  dplyr::arrange(pt.id, survey_year, survey_dateStart)

#Convert the depth data from feet to meters.
ssw_data = ssw_data %>%
  mutate(depth_m = depth_ft / 3.281)

# Getting y and det.covs ---------------------------------------------------------

#The first step of organizing the data to be inserted into the spOccupancy functions is to rearrange our SSW presence/absence data into the structure the package needs, which is sites (pt.ids) x primary time periods (years) x replicates (the surveys at the same site in the same year, as applicable). Since the det.covs (detection covariates) need to be in the exact same 3D format, it makes sense to create those at the same time.

#Get the dimensions for our eventual arrays programmatically:
num_pts = max(ssw_data$pt.id)
num_years = max(ssw_data$survey_year) - min(ssw_data$survey_year) + 1
num_reps = ssw_data %>%
  group_by(pt.id, survey_year) %>%
  summarize(count = n())
num_reps = max(num_reps$count)

#Form empty arrays as storage objects.
y = array(NA, dim = c(num_pts, num_years, num_reps))
plant_density = array(NA, dim = c(num_pts, num_years, num_reps))
chara_density = array(NA, dim = c(num_pts, num_years, num_reps))
day_of_year = array(NA, dim = c(num_pts, num_years, num_reps))

#We need to go point id by point id and find which years each point was sampled and input those into the 2nd dimension. We then need to go into each individual year and find the replicate observations (if any) and put those in the third dimension.
for(p in 1:num_pts) {

  #Filter to just this point id's observations
  tmp.dat1 = ssw_data %>%
    filter(pt.id == p)

  #Next, figure out how many years of data we have and which years.
  years2for = unique(tmp.dat1$survey_year)

  #Next, loop over the number of years we have for this point.
  for(yr in 1:length(years2for)) {

    #Filter to just this current year's observations at this point
    tmp.dat2 = tmp.dat1 %>%
      filter(survey_year == years2for[yr])

    #Then, figure out how many surveys' worth of info we have at this point in this year.
    surveys2for = unique(tmp.dat2$survey_dateStart)

    #Next, loop over the number of surveys we have.
    for(s in 1:length(surveys2for)) {

      #Filter to just this survey date's data
      tmp.dat3 = tmp.dat2 %>%
        filter(survey_dateStart == surveys2for[s])

      #Figure out which year this one is for indexing purposes
      this_year = tmp.dat3$survey_year - min(ssw_data$survey_year) + 1

      #Now, fill in the arrays according to the current point id = p value as x, the current year = this_year value as y, and the current survey = s value as z.
      y[p, this_year, s] = tmp.dat3$SSW
      plant_density[p, this_year, s] = tmp.dat3$Plant.Density
      chara_density[p, this_year, s] = tmp.dat3$Characeae
      day_of_year[p, this_year, s] = tmp.dat3$days0
    }
  }
  if(p %% 100 == 0) { print(p) } #Progress tracker
}

#Now we form the det.covs object.
det.covs = list(
  plant_density = plant_density,
  chara_density = chara_density,
  day_of_year = day_of_year
)

#Write these to file for time savings--not run.
#saveRDS(det.covs, "det_covs")
#det.covs = readRDS("det_covs")
#saveRDS(y, "y_mod")
#y = readRDS("y_mod")
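#Optional sanity checks (not part of the original workflow): confirm the detection array has the
#expected dimensions and, assuming SSW is coded 0/1, look at the naive detection frequency among
#non-missing surveys and how many surveys each point contributed.
dim(y) #Should be num_pts x num_years x num_reps.
mean(y, na.rm = TRUE) #Naive SSW detection frequency across all non-NA surveys.
table(apply(y, 1, function(x) sum(!is.na(x)))) #Distribution of the number of surveys per point.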
# Getting occ.covs --------------------------------------------------------

#Next, we create the occ.covs (occupancy covariates) list object. This needs to be just 1D or 2D for each covariate.

#Form our empty storage matrices
depth = array(NA, dim = c(num_pts, 1))
access_numPUB = array(NA, dim = c(num_pts, 1))
access_numPRIV = array(NA, dim = c(num_pts, 1))
fetch = array(NA, dim = c(num_pts, 1))
dist2access = array(NA, dim = c(num_pts, 1))
inf_year = array(NA, dim = c(num_pts, num_years))
access_type = array(NA, dim = c(num_pts, 1))
lake = array(NA, dim = c(num_pts, 1))

#We need to go point by point and find which years each site was sampled in and input those values into the 2nd dimension.
for(p in 1:num_pts) {

  #Cut to just this point id's observations
  tmp.dat1 = ssw_data %>%
    filter(pt.id == p)

  #We're going to take a median of most variables across years to get something that varies by site but not by year. Only one variable will be allowed to additionally vary by year, which is inf.year (years since first known infestation), since it must vary by year by definition. Only depth varies much between repeated samples--all other covariates vary only modestly as a result of slight variations in sample location between different surveys.
  depth[p] = median(tmp.dat1$depth_m)
  access_numPUB[p] = median(tmp.dat1$access.numPUB)
  access_numPRIV[p] = median(tmp.dat1$access.numPRIV)
  fetch[p] = median(tmp.dat1$max.fetch)
  dist2access[p] = median(tmp.dat1$min.dist)
  access_type[p] = first(tmp.dat1$type)
  lake[p] = as.numeric(as.factor(first(tmp.dat1$lake)))

  #Next, figure out how many years of data we have and which years.
  years2for = unique(tmp.dat1$survey_year)

  #Next, loop over the number of years we have for this point id.
  for(yr in 1:length(years2for)) {

    #Filter to just this current year's observations at this point
    tmp.dat2 = tmp.dat1 %>%
      filter(survey_year == years2for[yr])

    #If there are multiple surveys from the same survey year, we need to collapse those data down to just one record here when making this covariate.
    if(nrow(tmp.dat2) > 1) {
      tmp.dat3 = tmp.dat2 %>%
        group_by(pt.id, survey_year) %>%
        summarize(inf_year = mean(inf.year), .groups = "drop")
    } else {
      tmp.dat3 = tmp.dat2 %>%
        rename(inf_year = inf.year)
    }

    #Figure out which year this one was for indexing purposes
    this_year = tmp.dat3$survey_year - min(ssw_data$survey_year) + 1

    #Now, fill in the matrices with the appropriate data according to the current p value as x and the current this_year value as y. Only relevant for inf_year, as it is the only variable varying by both site and year.
    inf_year[p, this_year] = tmp.dat3$inf_year
  }
  if(p %% 100 == 0) { print(p) } #Progress tracker
}

#We now need to create the depthxfetch covariate manually by multiplying our two arrays from above by each other. This is because they need to be individually scaled and then multiplied.
depthxfetch = scale(depth) * scale(fetch)

#spOccupancy does not allow covariate data to be NA/latent, so we need to fill in all the NAs in inf_year for years in which we didn't actually make any observations.
for(row in 1:nrow(inf_year)) { #Go row by row, column by column...
  for(col in 1:ncol(inf_year)) {

    #We only need to act if the current cell is NA:
    if(is.na(inf_year[row, col])) {

      #Find the first non-NA value in this row and its position.
      first.val.spot = min(which(!is.na(inf_year[row,])))
      first.val = inf_year[row, first.val.spot]

      #The current value needs to be first.val minus 1 for each intervening cell between them...
      current.val = first.val - (first.val.spot - col)

      #We fill in all the intervening values too using some carefully aligned sequences.
      inf_year[row, col:first.val.spot] = current.val:first.val
    }
  }
}

#We need to fix up access_type so that it's successfully numeric. Note the coding of Private as 1, which is backwards from what the default would otherwise be.
access_type[access_type == "PRIVATE"] = 1
access_type[access_type == "PUBLIC"] = 0
access_type = as.numeric(access_type)

#Pack up all the occupancy data into one list.
occ.covs = list(
  depth = depth,
  access_numPUB = access_numPUB,
  access_numPRIV = access_numPRIV,
  fetch = fetch,
  dist2access = dist2access,
  inf_year = inf_year,
  depthxfetch = depthxfetch,
  access_type = access_type,
  lake = as.numeric(lake)
)

#Saving objects for time savings--not run.
#saveRDS(occ.covs, "occ_covs")
#occ.covs = readRDS("occ_covs")
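#Optional check (not part of the original workflow): spOccupancy requires complete (non-NA) covariate
#data, so verify that no NAs remain in any occupancy covariate after the inf_year fill-in above.
sapply(occ.covs, function(x) sum(is.na(x))) #All values should be 0.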
#Now we need coordinates for our sites so we can model spatial autocorrelation within the spOccupancy functions.

#First, we need to collapse the data set down to a single set of coordinates per site, arranged numerically.
condensed_dat = ssw_data %>%
  group_by(pt.id) %>%
  summarize(X = mean(X), Y = mean(Y)) %>%
  arrange(pt.id)

#Then, create an empty matrix for the coordinates.
coords = matrix(NA, nrow = nrow(condensed_dat), ncol = 2)

#Fill it in, as appropriate.
coords[,1] = condensed_dat$X
coords[,2] = condensed_dat$Y
#We'll pass this coords object in as an input to the model functions.

#Next, we need to specify our model formulae. Note that scale() is needed here for all variables not already scaled and not binary.
ssw.occ.formula = ~ scale(depth) + scale(fetch) + scale(dist2access) +
  access_type + scale(access_numPUB) + scale(access_numPRIV) +
  depthxfetch + scale(inf_year) + (1|lake)

ssw.det.formula = ~ scale(plant_density) + scale(day_of_year) + scale(chara_density)

#Next, we need to pack all the raw data into a single list object with the appropriate tags.
ssw.mod.dat = list(
  y = y,
  occ.covs = occ.covs,
  det.covs = det.covs,
  coords = coords
)

#Next, establish initial values. Those provided below are the defaults suggested by the package documentation, but we set them explicitly.
ssw.inits = list(
  alpha = c(0, 0, 0, 0), #Detection coefficients, including the intercept.
  beta = c(0, 0, 0, 0, 0, 0, 0, 0, 0), #Occupancy coefficients, including the intercept.
  z = apply(y, c(1, 2), function(a) { as.numeric(sum(a, na.rm = TRUE) > 0) }), #Latent occupancies, ensuring that the initial value is always 1 for sites found to be occupied at least once within a primary time period; otherwise bad things would happen.
  sigma.sq = 2, #Set an initial variance of 2.
  phi = mean(c(3/(max.maxdist/2), 1/(max.mindist))), #Pick a value halfway in between the two bounds we gave as priors--see the manuscript text and below.
  w = rep(0, nrow(y)), #Set the initial spatial autocorrelation values all to 0s.
  sigma.sq.psi = 2 #Set the initial variance of the lake-level random effect to 2 as well.
)

#We'll specify priors next, using the defaults discussed in the package documentation but stated explicitly. We'll also use the "shorthand" form that allows us to specify them in groups rather than individually.
ssw.priors = list(
  alpha.normal = list( #Priors for coefficients in the detection model (including intercept)
    mean = 0,
    var = 2.72 #These could be vectors, but we're shorthanding here.
  ),
  beta.normal = list( #Priors for the coefficients in the occupancy model (including intercept)
    mean = 0,
    var = 2.72
  ),
  phi.unif = c(3/(max.maxdist/2), 1/(max.mindist)), #See the manuscript text--we're essentially trying to confine the spatial autocorrelation piece to operate only at distances that are plausible within a single lake.
  sigma.sq.unif = c(0.001, 10), #To avoid too much confounding between this term and the occupancy intercept, we constrain to an upper bound of 10 here.
  sigma.sq.psi.ig = list(a = 0.1, b = 0.1) #This is an option for the prior for this term discussed in the documentation.
)

#Now, we specify all other settings the model will need.
batch.length = 25 #We need to break our sampling up into batches of a certain size. This is the size the package documentation recommends.
n.batch = 10000 #The total number of samples will be batch.length * n.batch.
n.thin = 50 #Thinning rate--one out of every X consecutive samples will be kept, adhering to the batch length.
n.chains = 3 #These run sequentially, not in parallel.
n.neighbors = 15 #15 is the default.

#Now we run the model, saving the result to an object. Keep in mind that this will take ~2 days to run on a PC of similar specs to the one we used. It may take double that on a weaker PC. The output object is also ~20-40 GB in size, so computers with less available memory than that may crash trying to run this!
out.mod1 = stPGOcc(ssw.occ.formula,
                   ssw.det.formula,
                   ssw.mod.dat,
                   inits = ssw.inits,
                   n.batch = n.batch,
                   batch.length = batch.length,
                   priors = ssw.priors,
                   cov.model = "exponential", #The default, but there are three additional options.
                   n.neighbors = n.neighbors,
                   n.thin = n.thin,
                   n.chains = n.chains,
                   n.report = 100, #Progress reporting--optional.
                   n.omp.threads = 4, #This allows you to speed up computation for each individual chain, assuming you have the threads available.
                   ar1 = FALSE) #No temporal autocorrelation included.

## Post-model analytics:

summary(out.mod1) #As seen in the modsummary.txt files.

#Assess whether the effective sample sizes for the spatial random effects + occupancy intercept are sufficiently high, since these two terms are necessarily confounded to a degree.
beta.0.w.samples <- out.mod1$beta.samples[, 1] + out.mod1$w.samples
ess.beta.0.w <- coda::effectiveSize(beta.0.w.samples)
summary(ess.beta.0.w)
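#Additional convergence checks--a minimal sketch, not part of the original workflow. summary(out.mod1)
#already reports Rhat and effective sample sizes; the traceplots below (via coda, since the samples are
#stored as coda mcmc objects) give a visual check on the occupancy and detection coefficient chains.
plot(out.mod1$beta.samples, density = FALSE) #Occupancy coefficient traceplots.
plot(out.mod1$alpha.samples, density = FALSE) #Detection coefficient traceplots.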
#Output saving--not run.

#Save the occupancy parameter posterior samples to file.
# beta.samples <- out.mod1$beta.samples
# saveRDS(beta.samples, file = 'beta-samples.rds')
#
# #Save the detection parameter posterior samples to file.
# alpha.samples <- out.mod1$alpha.samples
# saveRDS(alpha.samples, file = 'alpha-samples.rds')
#
# #These are the presence-absence posteriors for all points/time periods.
# z.samples <- out.mod1$z.samples
# saveRDS(z.samples, file = 'z-samples.rds')
#
# #These are the lake-level random effects.
# beta.star.samples <- out.mod1$beta.star.samples
# saveRDS(beta.star.samples, file = 'beta.star-samples.rds')
#
# #These are the spatial covariance parameter samples (sigma.sq and phi).
# theta.samples <- out.mod1$theta.samples
# saveRDS(theta.samples, file = 'theta-samples.rds')
#
# #These are the variances of the lake-level random effects.
# sigma.sq.psi.samples <- out.mod1$sigma.sq.psi.samples
# saveRDS(sigma.sq.psi.samples, file = 'sigma.sq.psi.samples.rds')
#
# #These are the psis (occupancy probabilities) for every point, time period, and posterior sample (a big object!).
# psi.samples <- out.mod1$psi.samples
# saveRDS(psi.samples, file = 'psi.samples.rds')

#Posterior predictive checks--as recommended by the package documentation. These each take several hours to perform.
ppc.out1 <- ppcOcc(out.mod1, fit.stat = 'freeman-tukey', group = 1)
#saveRDS(ppc.out1, "PPC_tukey_grp1.rds")

ppc.out2 <- ppcOcc(out.mod1, fit.stat = 'freeman-tukey', group = 2)
#saveRDS(ppc.out2, "PPC_tukey_grp2.rds")

ppc.out3 <- ppcOcc(out.mod1, fit.stat = 'chi-squared', group = 1)
#saveRDS(ppc.out3, "PPC_chi_grp1.rds")

ppc.out4 <- ppcOcc(out.mod1, fit.stat = 'chi-squared', group = 2)
#saveRDS(ppc.out4, "PPC_chi_grp2.rds")

summary(ppc.out1); summary(ppc.out2); summary(ppc.out3); summary(ppc.out4)
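#A minimal sketch (not part of the original workflow) of computing an overall Bayesian p-value by hand
#from one of the posterior predictive check objects, assuming the standard ppcOcc output elements
#(fit.y and fit.y.rep). Values near 0.5 indicate adequate fit; values near 0 or 1 suggest lack of fit.
mean(ppc.out1$fit.y.rep > ppc.out1$fit.y)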