## Spring two-row GBS marker data processing
## 
## This script will take GBS data for the spring two-row (S2) barley population,
## perform filtration and imputation, and output processed files.
## 
## Author: Jeff Neyhart
## Last Modified: 24 July 2019
## 

# Libraries
library(tidyverse)


## Functions
# Function to measure marker/line missingness
measure.missing <- function(x, type = c("numeric", "nucleotide")) {
  
  if (type == "numeric") {
    return( sum(is.na(x)) / length(x) )
  }
  
  if (type == "nucleotide") {
    return( sum(x == "NN") / length(x) )
  }
}

# Function to measure marker/line heterozygosity
measure.het <- function(x, type = "numeric") {
  
  if (type == "numeric") {
    snp.recode <- na.omit(x)
    return( sum(snp.recode == 0) / length(snp.recode) )
  }
  
  if (type == "nucleotide") {
    snp.recode <- x[x != "NN"]
    n.het <- sum(sapply(X = strsplit(snp.recode, ""), FUN = function(site) length(unique(site)) == 2))
    return( n.het / length(snp.recode) )
  }
}




## Set directories
## 
## These need to be adjusted by the user prior to running the script
## 

# Location of hmp file that was converted from the VCF file
geno_dir <- "path/to/folder/containing/raw/hapmap"
# Location to store the processed hapmap files
proc_geno_dir <- "path/to/folder/storing/processed/hapmap"

# Location of entry data with pedigree information
entry_dir <- "path/to/folder/containing/pedigee"



# Read in the entry data
# This CSV should be placed in the `entry_dir` directory (see above).
entry_dat <- read_csv(file.path(entry_dir, "UMN_S2_pedigree.csv"))

# Create character vectors with the S2TP lines, S2C1 lines, and S2C1.5 lines
s2tp <- subset(entry_dat, population == "S2TP", line_name, drop = T)
s2c1 <- subset(entry_dat, population == "S2C1", line_name, drop = T)
s2c15 <- subset(entry_dat, population == "S2C1.5", line_name, drop = T)

## Read in the HapMap file with all samples and variants
# This file should be saved in the `geno_dir` directory (see above).
gbs_hmp <- read_tsv(file = file.path(geno_dir, "2row_GBS_final_snps_newpos_renamed_hmp.txt")) %>%
  mutate_if(is.logical, as.numeric)







## Prepare data for each specific population

### S2TP

# Subset the larger hapmap for S2TP lines (and duplicates)
s2tp_gbs_hmp <- gbs_hmp %>%
  select(1:4, which(str_detect(names(.), "^[0-9]{2}")))


# In this dataset, several entries in the training population were genotyped two
# or three times. Since these lines are highly inbred and homogenous, it is useful
# to combine the data across these replicates.
# 
# The procedure will be: 1. Find the duplicate with the lowest missing
# data and keep that one or 2. Combine the marker data in duplicates


# First find duplicates
dup_entries <- names(s2tp_gbs_hmp) %>% 
  str_replace_all(string = ., pattern = "\\.[0-9]*$", "") %>% 
  {table(.) > 1} %>% 
  which() %>% 
  names()

# Define a function to take a genotype matrix and perform site-wise tests of agreement
# Returns the proportion of agreement among non-missing sites
identical_sites <- function(genos) {
  site.compare <- apply(X = genos, MARGIN = 1, FUN = function(site) {
    if (any(is.na(site))) {
      return(NA)
    } else {
      if (length(unique(as.character(site))) == 1) {
        return(1)
      } else {
        return(0)
      }} })
  
  sum(site.compare, na.rm = T) / sum(!is.na(site.compare))
}


# Iterate over duplicates and find the number of non-missing site pairs that are the same
dup_comparison <- lapply(X = dup_entries, FUN = function(ent) {
  
  ent_genos <- s2tp_gbs_hmp %>% 
    select(matches(ent))
  
  # Perform an agreement test on all duplicates
  prop_sites_identical <- identical_sites(ent_genos)
  
  # If there are more than 2 duplicates, also perform pairwise concurrence tests
  if (ncol(ent_genos) > 2) {
    comb_mat <- combn(colnames(ent_genos), 2) %>%
      t() %>%
      as.data.frame() %>%
      rename_all(~c("entry1", "entry2"))
    
    # Iterate over the combinations
    pairwise_conc <- apply(X = comb_mat, MARGIN = 1, FUN = function(combo)
      identical_sites(genos = ent_genos[,combo]) )
    
    comb_mat <- comb_mat %>%
      mutate(pairwise_conc = pairwise_conc)
    
    
  } else {
    comb_mat <- NA
  }
  
  return(list(prop_sites_identical = prop_sites_identical, comb_mat = comb_mat))
})


names(dup_comparison) <- dup_entries



# The result is a list of comparisons only for those lines that are duplicated. If
# we look at the third duplicate, we see that more than 99% of sites are identical
# between the duplicates.

dup_comparison[[3]]


# There are others where two duplicates are present (i.e. the entry was genotyped
# three times). Sometimes, one of the three is an outlier, so the pairwise comparisons
# of site agreement can be thrown off. Here is an instance where the entry `08N2-77`
# was probably poorly genotyped

dup_comparison$`08N2-77`


# Other times there is high agreement among all three duplicates

dup_comparison$`06N2-15`


# If we look at all of the comparisons, we can see that most have very high agreement.


hist(sapply(X = dup_comparison, FUN = function(comp) comp[[1]]))



# Now we will go through and merge the duplicates that are in high agreement. The 
# procedure for doing this is as follows: 1. Determine among the duplicates which
# has the lowest heterozygosity. Entries with high heterozygosity are likely errors,
# since the lines were inbred to at least the F_6 generation, 2. Iterate over the
# duplicates and apply these steps: a. if the proportion of agreement is >= 0.95,
# iterate over sites, b. if sites among all duplicates are missing, output missing, 
# c. if one duplicate is missing, but the other is not, output the non-missing call, 
# d. if both duplicates are non-missing and agree, output that call, e. if both 
# duplicates are non-missing but do not agree, prioritize the duplicate with the 
# lowest heterozygosity, f. if the agreement proportion is not >= 0.95, output all
# of the genotype information from the duplicate with the lowest heterozygosity.


# Set a comparison cutoff for merging
merge_cutoff = 0.95

# Let's see how many of the comparisons meet that cutoff
sum(map_dbl(dup_comparison, "prop_sites_identical") >= merge_cutoff)


# Create merged genotype data from duplicates
merged_genos <- lapply(X = dup_entries, FUN = function(ent) {
  # Pull out the comparison data
  ent_comparison <- dup_comparison[[ent]]
  
  # Pull out the genotype data
  ent_genos <- s2tp_gbs_hmp %>% 
    select(matches(ent))
  
  # Determine which entry has the lowest heterozygosity
  # Measure missingness (as number of missing) and hets (as number of hets)
  ent_stats <- apply(X = ent_genos, MARGIN = 2, FUN = function(entry) {
    missing <- measure.missing(entry)
    het <- measure.het(entry)
    cbind(missing = missing, het = het) })
  
  # Measure heterozygosity
  # Proportion of hets per non-missing site
  least_het <- which.min(ent_stats[2,])
  
  # If the comb_mat is empty, look for the comparison across all duplicates
  if (all(is.na(ent_comparison$comb_mat))) {
    
    # If the level of concordance is >= 0.95, then merge
    if (ent_comparison$prop_sites_identical >= merge_cutoff) {
      
      apply(X = ent_genos, MARGIN = 1, FUN = function(site) {
        # If both are missing, output missing
        if (all(is.na(site))) {
          return(NA)
        } else {
          # If one site over the entries is non-missing, assign that site
          if (sum(!is.na(site)) == 1) {
            return(site[!is.na(site)])
          } else {
            
            # The only other option is that both sites are non-missing
            # If the sites are the same, output that call
            if ((length(unique(as.character(site))) == 1)) {
              return(site[1])
              # If not return the call from the entry with the least het
            } else {
              return(site[least_het])
            }}}})
      
      # If the concordance is less than 0.95, pick the entry will the least het
    } else {
      return(ent_genos[,least_het])
    }
    
    
    
    # The other scenario is that the entry was duplicated > 1 (i.e. 3 or more)
  } else {
    
    # If all comparisons are greater than 0.95, merge together
    if (all(ent_comparison$comb_mat[,"pairwise_conc"] >= merge_cutoff)) {
      
      apply(X = ent_genos, MARGIN = 1, FUN = function(site) {
        # If both are missing, output missing
        if (all(is.na(site))) {
          return(NA)
        } else {
          # If one site over the entries is non-missing, assign that site
          if (sum(!is.na(site)) == 1) {
            return(site[!is.na(site)])
          } else {
            
            # The only other option is that both sites are non-missing
            # If the sites are the same, output that call
            if ((length(unique(as.character(site))) == 1)) {
              return(site[1])
              # If not return the call from the entry with the least het
            } else {
              return(site[least_het])
            }}}})
      
      # If not all of the concordance comparisons are >= the merge cutoff, find 
      ## comparisons with the greatest concordance above the cutoff
      
    } else {
      good_comparison <- subset(ent_comparison$comb_mat, pairwise_conc >= merge_cutoff)
      
      # Find the good entries
      good_ent <- as.character(unlist(select(good_comparison, -pairwise_conc), use.names = FALSE))
      # Subset the ent_genos matrix
      good_ent_genos <- ent_genos[,good_ent]
      
      ent_stats <- apply(X = good_ent_genos, MARGIN = 2, FUN = function(entry) {
        missing <- measure.missing(entry)
        het <- measure.het(entry)
        cbind(missing, het) })
      
      # Measure heterozygosity
      # Proportion of hets per non-missing site
      least_het <- which.min(ent_stats[2,])
      
      # Perform the regular merging procedure
      apply(X = good_ent_genos, MARGIN = 1, FUN = function(site) {
        # If both are missing, output missing
        if (all(is.na(site))) {
          return(NA)
        } else {
          # If one site over the entries is non-missing, assign that site
          if (sum(!is.na(site)) == 1) {
            return(site[!is.na(site)])
          } else {
            
            # The only other option is that both sites are non-missing
            # If the sites are the same, output that call
            if ((length(unique(as.character(site))) == 1)) {
              return(site[1])
              # If not return the call from the entry with the least het
            } else {
              return(site[least_het])
            }}}})
    }}}); names(merged_genos) <- dup_entries


# Create a matrix
merged_dups <- do.call("cbind", merged_genos)
# Rename
colnames(merged_dups) <- str_remove(colnames(merged_dups), "\\.[0-9]*$")





# Now we can merge all the data together for the S2TP


# Combine the hapmap info, the genotypes from non-duplicated entries, and the 
## merged / unmerged genotypes from the duplicated entries
s2tp_gbs_hmp1 <- s2tp_gbs_hmp %>% 
  select(which(!str_detect(string = names(s2tp_gbs_hmp), pattern = str_c(dup_entries, collapse = "|")))) %>%
  bind_cols(merged_dups) %>%
  select(1:4, sort(names(.)))



# 
# The next step to prepare the data for imputation is to filter the marker dataset
# for extreme missingness and minor allele frequency.


# SNP missingness
snp_missingness <- s2tp_gbs_hmp1 %>% 
  select(-`rs#`:-pos) %>%
  is.na() %>%
  rowMeans()

hist(snp_missingness)





# Remove complete missing snps
s2tp_gbs_hmp2 <- s2tp_gbs_hmp1 %>% 
  filter(snp_missingness < 1)

# Remeasure missingness
snp_missingness1 <- s2tp_gbs_hmp2 %>% 
  select(-`rs#`:-pos) %>%
  is.na() %>%
  rowMeans()

hist(snp_missingness1)





# Measure maf
snp_maf <- s2tp_gbs_hmp2 %>% 
  select(-`rs#`:-pos) %>%
  {. + 1 } %>% # Add one to get the number of 1 alleles at each site
  rowMeans(na.rm = T) %>% # Divide by 2 to get the frequency
  {. / 2} %>%
  pmin(., 1 - .)

# SFS
hist(snp_maf)





# Measure SNP heterozygosity
snp_het <- s2tp_gbs_hmp2 %>% 
  select(-`rs#`:-pos) %>%
  apply(MARGIN = 1, FUN = function(snp) sum(snp == 0, na.rm = T) / length(snp))

hist(snp_het)

# This looks really good





# Establish cutoffs for missingness and MAF
# Filter SNPs on a heterzygostiy filter of 0.08, which is close to the expected heterozygosity
# of F_5 plants (0.0625)
to_keep <- which(snp_maf >= 0.05 & snp_missingness1 <= 0.80  & snp_het <= 0.08)

# Filter the dataset
s2tp_gbs_hmp3 <- s2tp_gbs_hmp2[to_keep,]






# Measure the entry missingness
entry_missing <- s2tp_gbs_hmp3 %>% 
  select(-`rs#`:-pos) %>% 
  is.na() %>% 
  colMeans()

hist(entry_missing)





# Measure entry heterzygosity
entry_het <- s2tp_gbs_hmp3 %>% 
  summarise_at(vars(-`rs#`:-pos), list(~sum(. == 0, na.rm = T) / length(.))) %>%
  as.numeric()

hist(entry_het)



# Filter entries for missingness
to_keep <- which(entry_missing <= 0.80)

# Get the entry list
line_names <- s2tp_gbs_hmp3 %>%
  select(-`rs#`:-pos) %>%
  names()

# Looks like we lost 8 entries
(lines_to_remove <- setdiff(line_names, line_names[to_keep]))

s2tp_gbs_hmp4 <- s2tp_gbs_hmp3 %>% 
  select(which(!names(.) %in% lines_to_remove))






# Remeasure SNP missingness, MAF
snp_maf1 <- s2tp_gbs_hmp4 %>% 
  select(-`rs#`:-pos) %>%
  {. + 1 } %>% # Add one to get the number of 1 alleles at each site
  rowMeans(na.rm = T) %>% # Divide by 2 to get the frequency
  {. / 2} %>%
  pmin(., 1 - .)

hist(snp_maf1)





snp_missingness2 <- s2tp_gbs_hmp4 %>% 
  select(-`rs#`:-pos) %>%
  is.na() %>%
  rowMeans()

hist(snp_missingness2)


# Save this file
# This file should be saved in the `proc_geno_dir` directory (see above).
write_tsv(x = s2tp_gbs_hmp4, path = file.path(proc_geno_dir, "s2tp_filtered_genos_hmp.txt"))



### S2C1

# The genotypes for the S2C1 are nicely structured: we have 40 families of 30
# individuals each. We need to perform some filtering before running imputation.


# Subset the larger hapmap for S2C1 lines
s2c1_gbs_hmp <- gbs_hmp %>%
  select(1:4, which(names(.) %in% s2c1))



# The next step to prepare the data for imputation is to filter the marker dataset
# for extreme missingness and minor allele frequency.


# SNP missingness
snp_missingness <- s2c1_gbs_hmp %>% 
  select(-`rs#`:-pos) %>%
  is.na() %>%
  rowMeans()

hist(snp_missingness)





# Remove complete missing snps
s2c1_gbs_hmp2 <- s2c1_gbs_hmp %>% 
  filter(snp_missingness < 1)

# Remeasure missingness
snp_missingness1 <- s2c1_gbs_hmp2 %>% 
  select(-`rs#`:-pos) %>%
  is.na() %>%
  rowMeans()

hist(snp_missingness1)





# Measure maf
snp_maf <- s2c1_gbs_hmp2 %>% 
  select(-`rs#`:-pos) %>%
  {. + 1 } %>% # Add one to get the number of 1 alleles at each site
  rowMeans(na.rm = T) %>% # Divide by 2 to get the frequency
  {. / 2} %>%
  pmin(., 1 - .)

# SFS
hist(snp_maf)





# Measure SNP heterozygosity
snp_het <- s2c1_gbs_hmp2 %>% 
  select(-`rs#`:-pos) %>%
  apply(MARGIN = 1, FUN = function(snp) sum(snp == 0, na.rm = T) / length(snp))

hist(snp_het)

# This looks really good





# Establish cutoffs for missingness and MAF
# Filter SNPs on a heterzygostiy filter of 0.35
to_keep <- which(snp_maf >= 0.02 & snp_missingness1 <= 0.80  & snp_het <= 0.35)

# Filter the dataset
s2c1_gbs_hmp3 <- s2c1_gbs_hmp2[to_keep,]






# Measure the entry missingness
entry_missing <- s2c1_gbs_hmp3 %>% 
  select(-`rs#`:-pos) %>% 
  is.na() %>% 
  colMeans()

hist(entry_missing)





# Measure entry heterzygosity
entry_het <- s2c1_gbs_hmp3 %>% 
  summarise_at(vars(-`rs#`:-pos), funs(sum(. == 0, na.rm = T) / length(.))) %>%
  as.numeric()

hist(entry_het)



# Filter entries for missingness and heterozygosity
to_keep <- which(entry_missing <= 0.80 & entry_het <= 0.35)

# Get the entry list
line_names <- s2c1_gbs_hmp3 %>%
  select(-`rs#`:-pos) %>%
  names()

# Looks like we lost 86 entries
(lines_to_remove <- setdiff(line_names, line_names[to_keep]))

s2c1_gbs_hmp4 <- s2c1_gbs_hmp3 %>% 
  select(which(!names(.) %in% lines_to_remove))






# Remeasure SNP missingness, MAF
snp_maf1 <- s2c1_gbs_hmp4 %>% 
  select(-`rs#`:-pos) %>%
  {. + 1 } %>% # Add one to get the number of 1 alleles at each site
  rowMeans(na.rm = T) %>% # Divide by 2 to get the frequency
  {. / 2} %>%
  pmin(., 1 - .)

hist(snp_maf1)






snp_missingness2 <- s2c1_gbs_hmp4 %>% 
  select(-`rs#`:-pos) %>%
  is.na() %>%
  rowMeans()

hist(snp_missingness2)





# Save this file
# This file should be saved in the `proc_geno_dir` directory (see above).
write_tsv(x = s2c1_gbs_hmp4, path = file.path(proc_geno_dir, "s2c1_filtered_genos_hmp.txt"))








### S2C1.5

# Subset the larger hapmap for S2C1.5 lines
s2c15_gbs_hmp <- gbs_hmp %>%
  select(1:4, which(names(.) %in% s2c15))



# Again we don't need to combine any data, so we can
# simply delve into quality measurements and filters



## SNP missingness

# First let's look at snp missingness
snp.missingness <- s2c15_gbs_hmp %>% 
  select(-`rs#`:-pos) %>%
  is.na() %>%
  rowMeans()

hist(snp.missingness)



## Remove completely missing SNPs and remeasure missingness

# Remove completely missing SNPs
s2c15_gbs_hmp1 <- s2c15_gbs_hmp %>%
  filter(snp.missingness < 1)

# Remeasure missingness, and also measure MAF
snp.missingness1 <- s2c15_gbs_hmp1 %>% 
  select(-`rs#`:-pos) %>%
  is.na() %>%
  rowMeans()

hist(snp.missingness1)



## Measure SNP MAF

# MAF
snp.maf <- s2c15_gbs_hmp1 %>% 
  select(-`rs#`:-pos) %>%
  + 1 %>% # Add one to get the number of 1 alleles at each site
  rowMeans(na.rm = T) %>% # Divide by 2 to get the frequency
  {. / 2} %>%
  sapply(FUN = function(freq) min(freq, 1 - freq))

hist(snp.maf)



## Measure SNP heterozygosity

snp.het <- s2c15_gbs_hmp1 %>%
  select(-`rs#`:-pos) %>%
  apply(MARGIN = 1, FUN = function(snp) sum(snp == 0, na.rm = T) / sum(!is.na(snp)))

hist(snp.het)



## Remove SNPs with high missigness, very low MAF, and very high het

snps.to.keep <- which(snp.maf >= 0.1 & snp.het <= 0.35 & snp.missingness1 <= 0.8)

# Keep the designated snps
s2c15_gbs_hmp2 <- s2c15_gbs_hmp1 %>%
  slice(snps.to.keep)

# Remeasure missingness
snp.missingness2 <- s2c15_gbs_hmp2 %>% 
  select(-`rs#`:-pos) %>%
  is.na() %>%
  rowMeans()

hist(snp.missingness2)


## Remeasure MAF
snp.maf1 <- s2c15_gbs_hmp2 %>% 
  select(-`rs#`:-pos) %>%
  + 1 %>% # Add one to get the number of 1 alleles at each site
  rowMeans(na.rm = T) %>% # Divide by 2 to get the frequency
  {. / 2} %>%
  sapply(FUN = function(freq) min(freq, 1 - freq))

hist(snp.maf1)


## Remeasure heterozygosity

snp.het1 <- s2c15_gbs_hmp2 %>%
  select(-`rs#`:-pos) %>%
  apply(MARGIN = 1, FUN = function(snp) sum(snp == 0, na.rm = T) / sum(!is.na(snp)))

hist(snp.het1)



## Measure entry missingness
entry.missing <- s2c15_gbs_hmp2 %>% 
  select(-`rs#`:-pos) %>% 
  summarize_each(funs = funs(mean(is.na(.))) ) %>%
  as.numeric()

hist(entry.missing)


## Measure entry heterozygosity
entry.het <- s2c15_gbs_hmp2 %>% 
  select(-`rs#`:-pos) %>%
  summarise_each(funs = funs(sum(. == 0, na.rm = T) / sum(!is.na(.)))) %>%
  as.numeric()

hist(entry.het)


## Filter entries for missingness and heterozygosity - then save
entry.to.remove <- which(entry.missing > 0.80 | entry.het > 0.50)

# Looks like we lost 10 entries
length(entry.to.remove)

# Pull out the lines
lines.to.remove <- s2c15_gbs_hmp2 %>%
  select(-`rs#`:-pos) %>%
  names() %>% 
  .[entry.to.remove]

# Filter
s2c15_gbs_hmp3 <- s2c15_gbs_hmp2 %>% 
  select(which(!names(.) %in% lines.to.remove))

# Save this file
# This file should be saved in the `proc_geno_dir` directory (see above).
write_tsv(x = s2c15_gbs_hmp3, path = file.path(proc_geno_dir, "s2c15_filtered_genos_hmp.txt"))







## Combine the filtered marker sets

# Read in the filtered hapmap datasets
# This file should be saved in the `proc_geno_dir` directory (see above).
s2tp_hmp <- read_tsv(file = file.path(proc_geno_dir, "s2tp_filtered_genos_hmp.txt"), progress = F)
s2c1_hmp <- read_tsv(file = file.path(proc_geno_dir, "s2c1_filtered_genos_hmp.txt"), progress = F)
s2c15_hmp <- read_tsv(file = file.path(proc_geno_dir, "s2c15_filtered_genos_hmp.txt"), progress = F)


# Combine on marker name
combined_hmp <- Reduce(full_join, list(s2tp_hmp, s2c1_hmp, s2c15_hmp))
# combined_hmp <- full_join(s2tp_hmp, s2c1_hmp)




## Sort 

# Sort on chromosome and position
combined_hmp1 <- combined_hmp %>% 
  arrange(chrom, pos) %>%
  mutate(chrom = replace(chrom, chrom == "UNK", 8))

# Save as one file
# This file should be saved in the `proc_geno_dir` directory (see above).
write_tsv(x = combined_hmp1, path = file.path(proc_geno_dir, "s2_filtered_genos_hmp.txt"))