## Genotype imputation
## 
## This script outlines the imputation steps for the spring two-row genotyping-by
## -sequencing experiments.
## 
## Author: Jeff Neyhart
## Last modified: 24 July 2019
## 


# Load some packages and set some directories
library(fsimpute) # Available from https://github.com/neyhartj/fsimpute
library(tidyverse)
library(qtl)
library(rrBLUP)

## Set directories
## 
## These need to be adjusted by the user prior to running the script
## 

# Location of the processed hapmap files from the `s2_marker_data_processing_use.R` script
proc_geno_dir <- "path/to/folder/storing/processed/hapmap"

# Location of physical/genetic interpolation data
map_dir <- "path/to/folder/containing/map/interpolation"
# Location of entry data with pedigree information
entry_dir <- "path/to/folder/containing/pedigee"
# Location of imputation data
imp_dir <- "path/to/folder/storing/imputed/genotype/data"

## Final data directory
final_dir <- "path/to/final/data/directory"



# Read in the entry data
# This CSV should be placed in the `entry_dir` directory (see above).
entry_dat <- read_csv(file.path(entry_dir, "UMN_S2_pedigree.csv"))

# Create character vectors with the S2TP lines, S2C1 lines, and S2C1.5 lines
s2tp <- subset(entry_dat, population == "S2TP", line_name, drop = T)
s2c1 <- subset(entry_dat, population == "S2C1", line_name, drop = T)
s2c15 <- subset(entry_dat, population == "S2C1.5", line_name, drop = T)


## Read in the filtered genotype data for the S2TP and S2C1
## These data files should be read in (the below command looks for both):
## s2tp_filtered_genos_hmp.txt
## s2c1_filtered_genos_hmp.txt
## 
## Note: these files are created by the `s2_marker_data_processing_use.R` script
## 
## 
geno_hmp <- list.files(proc_geno_dir, full.names = TRUE, pattern = "s2tp|s2c1_") %>%
  map(read_tsv) %>%
  Reduce(f = full_join, x = .) %>%
  arrange(chrom, pos)

# Read in the interpolated genetic map data
# This CSV should be the `map_dir` directory (see above)
snp_info <- read_csv(file = file.path(map_dir, "GBS_marker_info.csv"))
  


###  fsimpute

# We first start with `fsimpute` to impute individuals using the known pedigree structure

# Preparation


# Merge the new SNP data with the genotype data, removing pesky chromosome 8
geno_hmp2 <- inner_join(geno_hmp, snp_info) %>%
  select(1:4, cM_pos, names(.))

# Make a map
map <- geno_hmp2 %>% 
  select(1:5) %>%
  split(.$chrom) %>% 
  map(function(chr) structure(chr$cM_pos, names = chr$`rs#`, class = "A")) %>% 
  structure(class = "map")

# Jitter the map
map <- jittermap(map)

## C1 parents
c1_parents <- entry_dat %>%
  filter(population == "S2TP", parent) %>%
  pull(line_name)

all(c1_parents %in% names(geno_hmp2))

## All of them! Good


## Create a pedigree file for the S2C1
s2c1_pedigree <- entry_dat %>%
  filter(population == "S2C1") %>%
  separate(pedigree, c("parent1", "parent2"), sep = "/") %>%
  select(line_name, parent1, parent2) %>%
  mutate(family = str_extract(line_name, "3[0-9]{3}"))

## Execute imputation

fam_list <- list()

# Iterate over each family number
for (fam in unique(s2c1_pedigree$family)) {
  
  # Get the parents and the progeny
  parents <- s2c1_pedigree %>% 
    filter(family == fam) %>% 
    select(contains("parent")) %>% 
    distinct() %>% 
    unlist() %>%
    sort()
  
  progeny <- s2c1_pedigree %>% 
    filter(family == fam) %>% 
    distinct(line_name) %>%
    unlist() %>%
    sort()
  
  # Subset the genotype matrix
  fam_genos <- geno_hmp2 %>% 
    select(1:5, which(names(.) %in% c(parents, progeny)))
  
  # Separate the parent matrix and the progeny matrix and reformat
  founders_recode <- fam_genos %>% 
    # Split by chrosome
    split(.$chrom) %>%
    # Apply a function
    map(function(chr) {
      chr.recode <- chr %>%
        select(which(names(.) %in% parents)) %>% 
        t() %>%
        {. + 1}
      colnames(chr.recode) <- chr$`rs#`
      return(chr.recode) })
  
  finals_recode <- fam_genos %>% 
    # Split by chrosome
    split(.$chrom) %>%
    # Apply a function
    map(function(chr) {
      chr.recode <- chr %>%
        select(which(names(.) %in% progeny)) %>% 
        t() %>%
        {. + 1}
      colnames(chr.recode) <- chr$`rs#`
      return(chr.recode) })
  
  # Prep the cross
  fam_cross <- prep_cross(map = map, founders = founders_recode, 
                          finals = finals_recode, selfing.gen = 2)
  
  # Impute
  imputed_cross <- fsimpute(cross = fam_cross, prob.threshold = 0.7)
  
  # Reassemble the matrix
  fam_list[[fam]] <- list(
    founders_genos = lapply(X = imputed_cross$geno, FUN = function(chr) chr$founders_imputed) %>%
      do.call("cbind", .),
    finals_genos = lapply(X = imputed_cross$geno, FUN = function(chr) chr$finals_imputed) %>%
      do.call("cbind", .),
    missingness = imputed_cross$missingness_stats
  )
}



# Save the list
# This file should be saved in the `imp_dir` directory (see above)
save("fam_list", file = file.path(imp_dir, "fsimpute", "fsimpute_family_imputed.RData"))



#### Follow-Up

# Load the data
# These files should be located in the `imp_dir` directory (see above)
load(file.path(imp_dir, "fsimpute", "fsimpute_family_imputed.RData") )

# Check the stats
fam_missing <- lapply(X = names(fam_list), FUN = function(fam) {
  
  # Extract the data for that family
  fam_data <- fam_list[[fam]]$missingness
  
  # Add the family number as a variable
  fam_data %>% 
    mutate(family = fam) %>%
    select(family, group:post_imputation) }) %>%
  
  # Row bind
  bind_rows()

# Perform some analysis
(g <- fam_missing %>%
    group_by(family, group) %>%
    summarize(Before = mean(pre_imputation),
              After = mean(post_imputation)) %>%
    # Gather
    gather(treatment, value, -family, -group) %>%
    
    ## Plot!
    ggplot(aes(family, value)) +
    geom_bar(aes(fill = treatment), position = "dodge", stat = "identity") + 
    facet_wrap("group", nrow =  2) +
    xlab("Family") +
    ylab("Missing data proportion") +
    ylim(c(0, 1)) +
    theme(
      axis.text.x = element_text(angle = 90)
    ) )




# Here we will find consensus genotypes among the repeated parents and combine
# with non-repeated parents.


# Among repeated parents, check the concordance
repeated_parents <- s2c1_pedigree %>% 
  select(contains("parent")) %>% 
  distinct() %>% 
  unlist() %>% 
  table() %>%
  subset(., . > 1) %>% 
  names()



# Apply a function across parents
consensus_pars <- lapply(X = repeated_parents, FUN = function(par) {
  
  # Find the families of that parent
  par_fam <- s2c1_pedigree %>% 
    filter_at(vars(contains("parent")), any_vars(. == par)) %>%
    pull(family) %>% 
    unique()
  
  # Pull out the imputed data for those parents
  par_fam_impute <- sapply(X = par_fam, FUN = function(fam) {
    fam_list[[fam]]$founders_genos[par,] }) %>%
    t()
  
  ## Find the missingness per parent
  init_missing <- rowMeans(is.na(par_fam_impute))
    
  # Correlate
  init_cor <- cor(t(par_fam_impute), use = "complete.obs")
  
  # Iterate over markers to create the consensus genotype
  consensus <- apply(X = par_fam_impute, MARGIN = 2, FUN = function(snp) {
    
    # Table of genotypes
    tabl <- table(snp) 
    # If the length is 0 (all NA) return NA
    if (length(tabl) == 0) return(NA)
    
    # If there isn't a most frequent genotype, return NA
    if ( length(which(tabl == max(tabl))) > 1 ) return(NA)
    
    # Otherwise take the most frequent genotype
    which.max(tabl) %>% names() %>% as.numeric() }) 
  
  ## Calculate post-consensus missingness
  post_missing <- mean(is.na(consensus))
  
  # Return a list
  list(consensus = consensus, init_cor = init_cor, init_missing = init_missing, post_missing = post_missing) 
  
})

# Display the correlations
sapply(consensus_pars, function(comp) comp$init_cor %>% .[upper.tri(.)])


## Plot before/after missingness
consensus_missing <- consensus_pars %>% 
  setNames(., repeated_parents) %>%
  lapply("[", c("init_missing", "post_missing")) %>%
  map(., ~map(., mean) %>% as.data.frame()) %>%
  map2_df(.x = ., .y = names(.), ~mutate(.x, line_name = .y)) %>%
  gather(stage, missingness, -line_name)

## Plot
consensus_missing %>% 
  ggplot(aes(x = line_name, y = missingness, fill = stage)) +
  geom_col(position = position_dodge(0.9))
    


# Extract the non-repeated parent genotypes
non_repeated_parents <- s2c1_pedigree %>% 
  select(contains("parent")) %>% 
  unlist() %>% 
  unique() %>%
  setdiff(., repeated_parents)

non_repeated_genos <- lapply(X = non_repeated_parents, FUN = function(par) {
  
  # Find the families of that parent
  par_fam <- s2c1_pedigree %>% 
    filter_at(vars(contains("parent")), any_vars(. == par)) %>%
    pull(family) %>% 
    unique()
  
  # Pull out the imputed data for those parents
  fam_list[[par_fam]]$founders_genos[par,] }) %>%
  
  # Bind into a matrix
  do.call("rbind", .)

row.names(non_repeated_genos) <- non_repeated_parents

# Extract the genos from the consensus
consensus_genos <- lapply(X = consensus_pars, FUN = function(comp) comp$consensus) %>%
  do.call("rbind", .)
row.names(consensus_genos) <- repeated_parents

# Combine consensus and non-repeated genos
parents_recode_new <- rbind(consensus_genos, non_repeated_genos)


# Recode and combine with the SNP info
parents_imputed_hmp <- t(parents_recode_new - 1) %>%
  as.data.frame() %>%
  # Combine with SNP info
  bind_cols(select(geno_hmp2, 1:5), .)

# Parent names
parents <- parents_imputed_hmp %>%
  select(-`rs#`:-cM_pos) %>%
  names()

# Gather the information for the remaining TP individuals
non_parent_genos <- geno_hmp2 %>%
  select(`rs#`:cM_pos, which(names(.) %in% setdiff(s2tp, parents)))

# Merge and sort
s2tp_new_genos <- full_join(parents_imputed_hmp, non_parent_genos) %>%
  select(`rs#`:cM_pos, sort(names(.)))






### fastPHASE

# fastPHASE will be used to impute the marker genotypes in the TP

#### Preparation

# Recode for TASSEL
## Add the other variables required by TASSEL
s2tp_new_genos1 <- s2tp_new_genos %>% 
  mutate(strand = "+", `assembly#` = NA, center = NA, 
         protLSID = NA, assayLSID = NA, panel = NA, 
         QCcode = NA) %>%
  select(-cM_pos)

# Rearrange columns
s2tp_new_genos2 <- s2tp_new_genos1 %>%
  select(`rs#`:pos, strand:QCcode, names(.))

# Convert to TASSEL
genos_tassel <- s2tp_new_genos2 %>%
  apply(MARGIN = 1, FUN = function(snp) {
    alleles <- snp[2] %>%
      str_split("/") %>%
      unlist() %>%
      rev() %>% 
      # Duplicate each nucleotide twice
      str_dup(2) %>%
      # Concatenate the duplicate alleles, a reverse of the het, and NN for missing
      c(str_trunc(., 1, ellipsis = "") %>%
          rev() %>%
          str_c(collapse = ""), 
        "NN") # For hets and missing
    # Assign names to the alleles
    names(alleles) <- c('-1', '1', '0', "NA")
    
    # Change to character
    snp <- as.character(snp[-1:-11])
    
    snp %>%
      str_replace_na() %>%
      str_replace_all(alleles) %>%
      str_trim()
  })




# Recombined with the metadata
s2tp_new_genos3 <- s2tp_new_genos2 %>%
  select(`rs#`:QCcode) %>%
  bind_cols(., as.data.frame(t(genos_tassel))) %>%
  `names<-`(., names(s2tp_new_genos2)) %>%
  mutate_if(is.factor, as.character)

# Iterate over chromosomes
# # These files should be saved in the `imp_dir` directory (see above)
for (chr in unique(s2tp_new_genos3$chrom)) {
  
  s2tp_new_genos3 %>%
    filter(chrom == chr) %>%
    write_tsv(x = ., path = file.path(imp_dir, "fastPHASE", paste0("s2tp_imputed_genos_chr", chr, "_TASSEL_hmp.txt")))
}



# Output a file of population assignments
# Extract breeding program designation
program_code <- s2tp %>%
  str_extract("[A-Z]{1,2}[0-9]{0,1}") %>%
  # Transform to factor and then numeric
  as.factor() %>%
  as.numeric()

## Write lines to a file
## # These files should be saved in the `imp_dir` directory (see above)
con <- file(description = file.path(imp_dir, "fastPHASE", "s2tp_program_subpop_code.txt"))
writeLines(text = as.character(program_code), sep = " ", con = con)
close(con = con)




# Execute fastPHASE imputation

## This is done on the command line using shell

## Here is a sample command line to implement fastPHASE for the markers on chromsome 1.
## It can be altered to impute markers for the remaining chromosomes:
## 
## First, the 'TASSEL'-encoded marker genotypes need to be converted to the inpute for 
## fastPHASE. This can be accomplished using the python script tassel2fastPHASE.py from
## the GitHub page https://github.com/neyhartj/bioinformatic-utils. It is executed
## as such:
## 
## python tassel2fastPHASE.py -i s2tp_imputed_genos_chr1_TASSEL_hmp.txt -o s2tp_imputed_genos_chr1_fastphase
## 
## The output of this script is then fed into fastPHASE using the following command:
## 
## fastPHASE -o s2tp_fastphase_imputed_genos_chr1 -T10 -H200 -K10 -q0.7 -us2tp_program_subpop_code.txt s2tp_imputed_genos_chr1_fastphase.imp
## 
## Last, the output from fastPHASE must be converted back to "TASSEL" format. This is accomplished using
## the fastPHASE2tassel.py script in the GitHub page https://github.com/neyhartj/bioinformatic-utils. It is
## executed as such:
## 
## 	python fastPHASE2tassel.py -i s2tp_fastphase_imputed_genos_chr1_hapguess_switch.out \
## 	-t s2tp_imputed_genos_chr1_TASSEL_hmp.txt -f -o s2tp_fastphase_imputed_genos_chr1
##  



# Follow-Up

# Here we will manage the output from fastPHASE on the genotypes of the S2TP lines.
# We need to convert them into a manageable matrix, then combine with the S2C1 lines
# for EM imputation.

# Find the files and read in, then bind rows
## # These files should be saved in the `imp_dir` directory (see above)
s2tp_imputed_genos_hmp <- list.files(path = file.path(imp_dir, "fastPHASE"), full.names = T) %>%
  str_subset('fastphase_imputed_genos') %>%
  map(read_tsv) %>%
  map_df(~rename(., allele = alleles)) %>%
  left_join(select(s2tp_new_genos3, 1:4), .)

# Results from fsimpute
## # These files should be saved in the `imp_dir` directory (see above)
load(file.path(imp_dir, "fsimpute", "fsimpute_family_imputed.RData"))



# Convert to rrBLUP format
genos_rrblup <- s2tp_imputed_genos_hmp %>%
  apply(MARGIN = 1, FUN = function(snp) {
    
    # Grab the alleles
    alleles <- snp[2] %>% str_split("/") %>% unlist()
    # Create a dictionary for conversion
    allele.names <- alleles %>%
      # Duplicate each nucleotide
      str_dup(2) %>%
      # Concatenate the duplicate alleles, a reverse of the het, and NN for missing
      c(., 
        str_trunc(., 1, ellipsis = "") %>% 
          c(str_c(., collapse = ""), str_c(rev(.), collapse = "")), 
        "NN") %>% .[-3:-4]
    
    # Create the dictionary
    allele.dict <- c(1, -1, 0, 0, "NA")
    names(allele.dict) <- allele.names
    
    # Change to character
    snp <- as.character(snp[-1:-11])
    
    # Replace the SNP genotypes
    snp %>% str_replace_all(allele.dict) %>% parse_number() }) %>% t()

# Add names
colnames(genos_rrblup) <- s2tp_imputed_genos_hmp %>% 
  select(-`rs#`:-QCcode) %>%
  names()

# Merge with snp info
s2tp_imputed_genos_hmp1 <- bind_cols(filter(snp_info, `rs#` %in% s2tp_imputed_genos_hmp$`rs#`), as.data.frame(genos_rrblup))


## What is the missingness
marker_na <- s2tp_imputed_genos_hmp1 %>%
  select(-`rs#`:-cM_pos) %>%
  is.na() %>% 
  rowMeans()

boxplot(marker_na)


# Wow pretty good.

# Save
## # These files should be saved in the `imp_dir` directory (see above)
write_tsv(x = s2tp_imputed_genos_hmp1, path = file.path(imp_dir, "fastPHASE", "s2tp_fp_imputed_genos_hmp.txt"))

### Manage the progeny

# Gather the imputed progeny and row-bind
s2c1_imputed_genos <- lapply(X = fam_list, FUN = function(fam) fam$finals_genos) %>%
  do.call("rbind", .) %>%
  
  # Recode to -1, 0, 1
  {. - 1} %>%
  t()

# Merge with the snp info
s2c1_imputed_genos_hmp <- bind_cols(filter(snp_info, `rs#` %in% s2tp_imputed_genos_hmp$`rs#`), as.data.frame(s2c1_imputed_genos))

# Average missingness
marker_na <- s2c1_imputed_genos_hmp %>%
  select(-`rs#`:-cM_pos) %>%
  is.na() %>% 
  rowMeans()

boxplot(marker_na)

# Save
## # These files should be saved in the `imp_dir` directory (see above)
write_tsv(x = s2c1_imputed_genos_hmp, path = file.path(imp_dir, "fsimpute", "s2c1_imputed_genos_hmp.txt"))





### Expectation-Maximization

# Preparation

# First remove markers that are in very high LD ($r^2 \ge 0.95$)

# Read in the data
## # These files should be saved in the `imp_dir` directory (see above)
s2tp_imputed_genos_hmp <- read_tsv(file = file.path(imp_dir, "fastPHASE", "s2tp_fp_imputed_genos_hmp.txt"))

s2c1_imputed_genos_hmp <- read_tsv(file = file.path(imp_dir, "fsimpute", "s2c1_imputed_genos_hmp.txt"))

# Set a correlation threshold
corr_threshold <- 0.95

# Calculate correlation of adjacent markers in the TP
## First split by chromosome and convert to matrices
chrom_mat <- s2tp_imputed_genos_hmp %>%
  as.data.frame() %>%
  column_to_rownames("rs#") %>%
  split(.$chrom) %>%
  map(select, -allele:-cM_pos) %>%
  map(as.matrix) %>%
  map(t)


# Remove the first column and grab the diagonal (adjacent correlation), then find the absolute value and square it
adjacent_LD <- chrom_mat %>%
  map(cor, use = "pairwise.complete") %>%
  lapply("[", ,-1, drop = FALSE) %>%
  map(diag) %>%
  map(abs) %>%
  map(~.^2)

# Plot the distribution of these LD calculations
data_frame(chrom = names(adjacent_LD), adjacent_LD) %>% 
  unnest() %>% 
  qplot(x = adjacent_LD, data = ., facets = ~ chrom)

# Apply over the chrom_mat and adjacent cor
markers_keep <- list(chrom_mat, adjacent_LD) %>%
  pmap(function(cm, ac) {
    
    # Get the markers names
    marker_names <- colnames(cm)
    
    # Which markers are > the threshold or NA?
    exclude_mar <- which(ac >= corr_threshold | is.na(ac)) + 1
    
    # Exclude those markers
    marker_names[-exclude_mar] })

# Re-calculate LD
adjacent_LD1 <- list(chrom_mat, markers_keep) %>%
  pmap(function(cm, mk) cm[,mk] ) %>%
  map(cor, use = "pairwise.complete") %>%
  lapply("[", ,-1, drop = FALSE) %>%
  map(diag) %>%
  map(abs) %>%
  map(function(x) x^2)

# Re-plot
data_frame(chrom = names(adjacent_LD1), adjacent_LD1) %>% 
  unnest() %>% 
  qplot(x = adjacent_LD1, data = ., facets = ~ chrom)


# Unlist the kept markers
markers_keep <- unlist(markers_keep)






### Combine the datasets and prep for final imputation

# Merge
s2_semifinal_genos_hmp <- full_join(s2tp_imputed_genos_hmp, s2c1_imputed_genos_hmp) %>%
  filter(`rs#` %in% markers_keep)


# Run missingness stats
mar_missing <- s2_semifinal_genos_hmp %>% 
  select(-`rs#`:-cM_pos) %>% 
  is.na() %>% 
  rowMeans()

hist(mar_missing)


# Filter on missing data
s2_semifinal_genos_hmp1 <- s2_semifinal_genos_hmp %>%
  filter(mar_missing <= 0.80)


mar_missing1 <- s2_semifinal_genos_hmp1 %>% 
  select(-`rs#`:-cM_pos) %>% 
  is.na() %>% 
  rowMeans()

hist(mar_missing1)

# Plot marker distribution
s2_semifinal_genos_hmp1 %>% 
  qplot(x = pos, data = ., facets = ~ chrom, bins = 50)

s2_geno_mat <- s2_semifinal_genos_hmp1 %>%
  select(-`rs#`:-cM_pos) %>%
  t()
colnames(s2_geno_mat) <- s2_semifinal_genos_hmp1$`rs#`

# Save
## # These files should be saved in the `imp_dir` directory (see above)
save("s2_geno_mat", file = file.path(imp_dir, "s2_semifinal_genos.RData")) 

## # These files should be saved in the `final_dir` directory (see above)
write_tsv(x = s2_semifinal_genos_hmp1, path = file.path(final_dir, "s2_final_discrete_genos_hmp.txt"))


## Execution
 
# Load data for imputation
## # These files should be saved in the `imp_dir` directory (see above)
load(file.path(imp_dir, "s2_semifinal_genos.RData"))

# Run EM imputation
imputed <- A.mat(X = s2_geno_mat, min.MAF = 0, max.missing = 1, impute.method = "EM", return.imputed = T)$imputed

## # These files should be saved in the `imp_dir` directory (see above)
save("imputed", file = file.path(imp_dir, "s2_final_imputed_mat.RData"))



#### Follow-Up

# Load the imputed genotpyes
## # These files should be saved in the `imp_dir` directory (see above)
load(file.path(imp_dir, "EM", "s2_final_imputed_mat.RData"))

# Load the snp info
## # These files should be saved in the `final_dir` directory (see above)
s2_final_discrete_genos_hmp <- read_tsv(file = file.path(final_dir, "s2_final_discrete_genos_hmp.txt"))

# Extract the snp info
snp_info <- s2_final_discrete_genos_hmp %>% 
  select(`rs#`:cM_pos)


imputed <- imputed$imputed

# Transpose the matrix and merge with the SNP info
s2_imputed_genos <- imputed %>%
  t() %>%
  cbind(snp_info, .) %>%
  tbl_df()

# Save
## # These files should be saved in the `final_dir` directory (see above)
write_tsv(s2_imputed_genos, path = file.path(final_dir, "s2_final_imputed_genos_hmp.txt"))

# Load the discrete hapmap
s2_discrete_genos <- s2_final_discrete_genos_hmp

# Save as Rdata
## # These files should be saved in the `final_dir` directory (see above)
save("s2_discrete_genos", "s2_imputed_genos", file = file.path(final_dir, "s2_genos_hmp.RData"))

# Convert to useable matrices and save
s2_discrete_mat <- s2_discrete_genos %>%
  select(-`rs#`:-cM_pos) %>%
  t()
colnames(s2_discrete_mat) <- s2_discrete_genos$`rs#`

s2_imputed_mat <- s2_imputed_genos %>%
  select(-`rs#`:-cM_pos) %>%
  t()
colnames(s2_imputed_mat) <- s2_imputed_genos$`rs#`

## # These files should be saved in the `final_dir` directory (see above)
save("s2_discrete_mat", "s2_imputed_mat", "snp_info", file = file.path(final_dir, "s2_genos_mat.RData"))