## Genotype imputation ## ## This script outlines the imputation steps for the spring two-row genotyping-by ## -sequencing experiments. ## ## Author: Jeff Neyhart ## Last modified: 24 July 2019 ## # Load some packages and set some directories library(fsimpute) # Available from https://github.com/neyhartj/fsimpute library(tidyverse) library(qtl) library(rrBLUP) ## Set directories ## ## These need to be adjusted by the user prior to running the script ## # Location of the processed hapmap files from the `s2_marker_data_processing_use.R` script proc_geno_dir <- "path/to/folder/storing/processed/hapmap" # Location of physical/genetic interpolation data map_dir <- "path/to/folder/containing/map/interpolation" # Location of entry data with pedigree information entry_dir <- "path/to/folder/containing/pedigee" # Location of imputation data imp_dir <- "path/to/folder/storing/imputed/genotype/data" ## Final data directory final_dir <- "path/to/final/data/directory" # Read in the entry data # This CSV should be placed in the `entry_dir` directory (see above). entry_dat <- read_csv(file.path(entry_dir, "UMN_S2_pedigree.csv")) # Create character vectors with the S2TP lines, S2C1 lines, and S2C1.5 lines s2tp <- subset(entry_dat, population == "S2TP", line_name, drop = T) s2c1 <- subset(entry_dat, population == "S2C1", line_name, drop = T) s2c15 <- subset(entry_dat, population == "S2C1.5", line_name, drop = T) ## Read in the filtered genotype data for the S2TP and S2C1 ## These data files should be read in (the below command looks for both): ## s2tp_filtered_genos_hmp.txt ## s2c1_filtered_genos_hmp.txt ## ## Note: these files are created by the `s2_marker_data_processing_use.R` script ## ## geno_hmp <- list.files(proc_geno_dir, full.names = TRUE, pattern = "s2tp|s2c1_") %>% map(read_tsv) %>% Reduce(f = full_join, x = .) %>% arrange(chrom, pos) # Read in the interpolated genetic map data # This CSV should be the `map_dir` directory (see above) snp_info <- read_csv(file = file.path(map_dir, "GBS_marker_info.csv")) ### fsimpute # We first start with `fsimpute` to impute individuals using the known pedigree structure # Preparation # Merge the new SNP data with the genotype data, removing pesky chromosome 8 geno_hmp2 <- inner_join(geno_hmp, snp_info) %>% select(1:4, cM_pos, names(.)) # Make a map map <- geno_hmp2 %>% select(1:5) %>% split(.$chrom) %>% map(function(chr) structure(chr$cM_pos, names = chr$`rs#`, class = "A")) %>% structure(class = "map") # Jitter the map map <- jittermap(map) ## C1 parents c1_parents <- entry_dat %>% filter(population == "S2TP", parent) %>% pull(line_name) all(c1_parents %in% names(geno_hmp2)) ## All of them! Good ## Create a pedigree file for the S2C1 s2c1_pedigree <- entry_dat %>% filter(population == "S2C1") %>% separate(pedigree, c("parent1", "parent2"), sep = "/") %>% select(line_name, parent1, parent2) %>% mutate(family = str_extract(line_name, "3[0-9]{3}")) ## Execute imputation fam_list <- list() # Iterate over each family number for (fam in unique(s2c1_pedigree$family)) { # Get the parents and the progeny parents <- s2c1_pedigree %>% filter(family == fam) %>% select(contains("parent")) %>% distinct() %>% unlist() %>% sort() progeny <- s2c1_pedigree %>% filter(family == fam) %>% distinct(line_name) %>% unlist() %>% sort() # Subset the genotype matrix fam_genos <- geno_hmp2 %>% select(1:5, which(names(.) %in% c(parents, progeny))) # Separate the parent matrix and the progeny matrix and reformat founders_recode <- fam_genos %>% # Split by chrosome split(.$chrom) %>% # Apply a function map(function(chr) { chr.recode <- chr %>% select(which(names(.) %in% parents)) %>% t() %>% {. + 1} colnames(chr.recode) <- chr$`rs#` return(chr.recode) }) finals_recode <- fam_genos %>% # Split by chrosome split(.$chrom) %>% # Apply a function map(function(chr) { chr.recode <- chr %>% select(which(names(.) %in% progeny)) %>% t() %>% {. + 1} colnames(chr.recode) <- chr$`rs#` return(chr.recode) }) # Prep the cross fam_cross <- prep_cross(map = map, founders = founders_recode, finals = finals_recode, selfing.gen = 2) # Impute imputed_cross <- fsimpute(cross = fam_cross, prob.threshold = 0.7) # Reassemble the matrix fam_list[[fam]] <- list( founders_genos = lapply(X = imputed_cross$geno, FUN = function(chr) chr$founders_imputed) %>% do.call("cbind", .), finals_genos = lapply(X = imputed_cross$geno, FUN = function(chr) chr$finals_imputed) %>% do.call("cbind", .), missingness = imputed_cross$missingness_stats ) } # Save the list # This file should be saved in the `imp_dir` directory (see above) save("fam_list", file = file.path(imp_dir, "fsimpute", "fsimpute_family_imputed.RData")) #### Follow-Up # Load the data # These files should be located in the `imp_dir` directory (see above) load(file.path(imp_dir, "fsimpute", "fsimpute_family_imputed.RData") ) # Check the stats fam_missing <- lapply(X = names(fam_list), FUN = function(fam) { # Extract the data for that family fam_data <- fam_list[[fam]]$missingness # Add the family number as a variable fam_data %>% mutate(family = fam) %>% select(family, group:post_imputation) }) %>% # Row bind bind_rows() # Perform some analysis (g <- fam_missing %>% group_by(family, group) %>% summarize(Before = mean(pre_imputation), After = mean(post_imputation)) %>% # Gather gather(treatment, value, -family, -group) %>% ## Plot! ggplot(aes(family, value)) + geom_bar(aes(fill = treatment), position = "dodge", stat = "identity") + facet_wrap("group", nrow = 2) + xlab("Family") + ylab("Missing data proportion") + ylim(c(0, 1)) + theme( axis.text.x = element_text(angle = 90) ) ) # Here we will find consensus genotypes among the repeated parents and combine # with non-repeated parents. # Among repeated parents, check the concordance repeated_parents <- s2c1_pedigree %>% select(contains("parent")) %>% distinct() %>% unlist() %>% table() %>% subset(., . > 1) %>% names() # Apply a function across parents consensus_pars <- lapply(X = repeated_parents, FUN = function(par) { # Find the families of that parent par_fam <- s2c1_pedigree %>% filter_at(vars(contains("parent")), any_vars(. == par)) %>% pull(family) %>% unique() # Pull out the imputed data for those parents par_fam_impute <- sapply(X = par_fam, FUN = function(fam) { fam_list[[fam]]$founders_genos[par,] }) %>% t() ## Find the missingness per parent init_missing <- rowMeans(is.na(par_fam_impute)) # Correlate init_cor <- cor(t(par_fam_impute), use = "complete.obs") # Iterate over markers to create the consensus genotype consensus <- apply(X = par_fam_impute, MARGIN = 2, FUN = function(snp) { # Table of genotypes tabl <- table(snp) # If the length is 0 (all NA) return NA if (length(tabl) == 0) return(NA) # If there isn't a most frequent genotype, return NA if ( length(which(tabl == max(tabl))) > 1 ) return(NA) # Otherwise take the most frequent genotype which.max(tabl) %>% names() %>% as.numeric() }) ## Calculate post-consensus missingness post_missing <- mean(is.na(consensus)) # Return a list list(consensus = consensus, init_cor = init_cor, init_missing = init_missing, post_missing = post_missing) }) # Display the correlations sapply(consensus_pars, function(comp) comp$init_cor %>% .[upper.tri(.)]) ## Plot before/after missingness consensus_missing <- consensus_pars %>% setNames(., repeated_parents) %>% lapply("[", c("init_missing", "post_missing")) %>% map(., ~map(., mean) %>% as.data.frame()) %>% map2_df(.x = ., .y = names(.), ~mutate(.x, line_name = .y)) %>% gather(stage, missingness, -line_name) ## Plot consensus_missing %>% ggplot(aes(x = line_name, y = missingness, fill = stage)) + geom_col(position = position_dodge(0.9)) # Extract the non-repeated parent genotypes non_repeated_parents <- s2c1_pedigree %>% select(contains("parent")) %>% unlist() %>% unique() %>% setdiff(., repeated_parents) non_repeated_genos <- lapply(X = non_repeated_parents, FUN = function(par) { # Find the families of that parent par_fam <- s2c1_pedigree %>% filter_at(vars(contains("parent")), any_vars(. == par)) %>% pull(family) %>% unique() # Pull out the imputed data for those parents fam_list[[par_fam]]$founders_genos[par,] }) %>% # Bind into a matrix do.call("rbind", .) row.names(non_repeated_genos) <- non_repeated_parents # Extract the genos from the consensus consensus_genos <- lapply(X = consensus_pars, FUN = function(comp) comp$consensus) %>% do.call("rbind", .) row.names(consensus_genos) <- repeated_parents # Combine consensus and non-repeated genos parents_recode_new <- rbind(consensus_genos, non_repeated_genos) # Recode and combine with the SNP info parents_imputed_hmp <- t(parents_recode_new - 1) %>% as.data.frame() %>% # Combine with SNP info bind_cols(select(geno_hmp2, 1:5), .) # Parent names parents <- parents_imputed_hmp %>% select(-`rs#`:-cM_pos) %>% names() # Gather the information for the remaining TP individuals non_parent_genos <- geno_hmp2 %>% select(`rs#`:cM_pos, which(names(.) %in% setdiff(s2tp, parents))) # Merge and sort s2tp_new_genos <- full_join(parents_imputed_hmp, non_parent_genos) %>% select(`rs#`:cM_pos, sort(names(.))) ### fastPHASE # fastPHASE will be used to impute the marker genotypes in the TP #### Preparation # Recode for TASSEL ## Add the other variables required by TASSEL s2tp_new_genos1 <- s2tp_new_genos %>% mutate(strand = "+", `assembly#` = NA, center = NA, protLSID = NA, assayLSID = NA, panel = NA, QCcode = NA) %>% select(-cM_pos) # Rearrange columns s2tp_new_genos2 <- s2tp_new_genos1 %>% select(`rs#`:pos, strand:QCcode, names(.)) # Convert to TASSEL genos_tassel <- s2tp_new_genos2 %>% apply(MARGIN = 1, FUN = function(snp) { alleles <- snp[2] %>% str_split("/") %>% unlist() %>% rev() %>% # Duplicate each nucleotide twice str_dup(2) %>% # Concatenate the duplicate alleles, a reverse of the het, and NN for missing c(str_trunc(., 1, ellipsis = "") %>% rev() %>% str_c(collapse = ""), "NN") # For hets and missing # Assign names to the alleles names(alleles) <- c('-1', '1', '0', "NA") # Change to character snp <- as.character(snp[-1:-11]) snp %>% str_replace_na() %>% str_replace_all(alleles) %>% str_trim() }) # Recombined with the metadata s2tp_new_genos3 <- s2tp_new_genos2 %>% select(`rs#`:QCcode) %>% bind_cols(., as.data.frame(t(genos_tassel))) %>% `names<-`(., names(s2tp_new_genos2)) %>% mutate_if(is.factor, as.character) # Iterate over chromosomes # # These files should be saved in the `imp_dir` directory (see above) for (chr in unique(s2tp_new_genos3$chrom)) { s2tp_new_genos3 %>% filter(chrom == chr) %>% write_tsv(x = ., path = file.path(imp_dir, "fastPHASE", paste0("s2tp_imputed_genos_chr", chr, "_TASSEL_hmp.txt"))) } # Output a file of population assignments # Extract breeding program designation program_code <- s2tp %>% str_extract("[A-Z]{1,2}[0-9]{0,1}") %>% # Transform to factor and then numeric as.factor() %>% as.numeric() ## Write lines to a file ## # These files should be saved in the `imp_dir` directory (see above) con <- file(description = file.path(imp_dir, "fastPHASE", "s2tp_program_subpop_code.txt")) writeLines(text = as.character(program_code), sep = " ", con = con) close(con = con) # Execute fastPHASE imputation ## This is done on the command line using shell ## Here is a sample command line to implement fastPHASE for the markers on chromsome 1. ## It can be altered to impute markers for the remaining chromosomes: ## ## First, the 'TASSEL'-encoded marker genotypes need to be converted to the inpute for ## fastPHASE. This can be accomplished using the python script tassel2fastPHASE.py from ## the GitHub page https://github.com/neyhartj/bioinformatic-utils. It is executed ## as such: ## ## python tassel2fastPHASE.py -i s2tp_imputed_genos_chr1_TASSEL_hmp.txt -o s2tp_imputed_genos_chr1_fastphase ## ## The output of this script is then fed into fastPHASE using the following command: ## ## fastPHASE -o s2tp_fastphase_imputed_genos_chr1 -T10 -H200 -K10 -q0.7 -us2tp_program_subpop_code.txt s2tp_imputed_genos_chr1_fastphase.imp ## ## Last, the output from fastPHASE must be converted back to "TASSEL" format. This is accomplished using ## the fastPHASE2tassel.py script in the GitHub page https://github.com/neyhartj/bioinformatic-utils. It is ## executed as such: ## ## python fastPHASE2tassel.py -i s2tp_fastphase_imputed_genos_chr1_hapguess_switch.out \ ## -t s2tp_imputed_genos_chr1_TASSEL_hmp.txt -f -o s2tp_fastphase_imputed_genos_chr1 ## # Follow-Up # Here we will manage the output from fastPHASE on the genotypes of the S2TP lines. # We need to convert them into a manageable matrix, then combine with the S2C1 lines # for EM imputation. # Find the files and read in, then bind rows ## # These files should be saved in the `imp_dir` directory (see above) s2tp_imputed_genos_hmp <- list.files(path = file.path(imp_dir, "fastPHASE"), full.names = T) %>% str_subset('fastphase_imputed_genos') %>% map(read_tsv) %>% map_df(~rename(., allele = alleles)) %>% left_join(select(s2tp_new_genos3, 1:4), .) # Results from fsimpute ## # These files should be saved in the `imp_dir` directory (see above) load(file.path(imp_dir, "fsimpute", "fsimpute_family_imputed.RData")) # Convert to rrBLUP format genos_rrblup <- s2tp_imputed_genos_hmp %>% apply(MARGIN = 1, FUN = function(snp) { # Grab the alleles alleles <- snp[2] %>% str_split("/") %>% unlist() # Create a dictionary for conversion allele.names <- alleles %>% # Duplicate each nucleotide str_dup(2) %>% # Concatenate the duplicate alleles, a reverse of the het, and NN for missing c(., str_trunc(., 1, ellipsis = "") %>% c(str_c(., collapse = ""), str_c(rev(.), collapse = "")), "NN") %>% .[-3:-4] # Create the dictionary allele.dict <- c(1, -1, 0, 0, "NA") names(allele.dict) <- allele.names # Change to character snp <- as.character(snp[-1:-11]) # Replace the SNP genotypes snp %>% str_replace_all(allele.dict) %>% parse_number() }) %>% t() # Add names colnames(genos_rrblup) <- s2tp_imputed_genos_hmp %>% select(-`rs#`:-QCcode) %>% names() # Merge with snp info s2tp_imputed_genos_hmp1 <- bind_cols(filter(snp_info, `rs#` %in% s2tp_imputed_genos_hmp$`rs#`), as.data.frame(genos_rrblup)) ## What is the missingness marker_na <- s2tp_imputed_genos_hmp1 %>% select(-`rs#`:-cM_pos) %>% is.na() %>% rowMeans() boxplot(marker_na) # Wow pretty good. # Save ## # These files should be saved in the `imp_dir` directory (see above) write_tsv(x = s2tp_imputed_genos_hmp1, path = file.path(imp_dir, "fastPHASE", "s2tp_fp_imputed_genos_hmp.txt")) ### Manage the progeny # Gather the imputed progeny and row-bind s2c1_imputed_genos <- lapply(X = fam_list, FUN = function(fam) fam$finals_genos) %>% do.call("rbind", .) %>% # Recode to -1, 0, 1 {. - 1} %>% t() # Merge with the snp info s2c1_imputed_genos_hmp <- bind_cols(filter(snp_info, `rs#` %in% s2tp_imputed_genos_hmp$`rs#`), as.data.frame(s2c1_imputed_genos)) # Average missingness marker_na <- s2c1_imputed_genos_hmp %>% select(-`rs#`:-cM_pos) %>% is.na() %>% rowMeans() boxplot(marker_na) # Save ## # These files should be saved in the `imp_dir` directory (see above) write_tsv(x = s2c1_imputed_genos_hmp, path = file.path(imp_dir, "fsimpute", "s2c1_imputed_genos_hmp.txt")) ### Expectation-Maximization # Preparation # First remove markers that are in very high LD ($r^2 \ge 0.95$) # Read in the data ## # These files should be saved in the `imp_dir` directory (see above) s2tp_imputed_genos_hmp <- read_tsv(file = file.path(imp_dir, "fastPHASE", "s2tp_fp_imputed_genos_hmp.txt")) s2c1_imputed_genos_hmp <- read_tsv(file = file.path(imp_dir, "fsimpute", "s2c1_imputed_genos_hmp.txt")) # Set a correlation threshold corr_threshold <- 0.95 # Calculate correlation of adjacent markers in the TP ## First split by chromosome and convert to matrices chrom_mat <- s2tp_imputed_genos_hmp %>% as.data.frame() %>% column_to_rownames("rs#") %>% split(.$chrom) %>% map(select, -allele:-cM_pos) %>% map(as.matrix) %>% map(t) # Remove the first column and grab the diagonal (adjacent correlation), then find the absolute value and square it adjacent_LD <- chrom_mat %>% map(cor, use = "pairwise.complete") %>% lapply("[", ,-1, drop = FALSE) %>% map(diag) %>% map(abs) %>% map(~.^2) # Plot the distribution of these LD calculations data_frame(chrom = names(adjacent_LD), adjacent_LD) %>% unnest() %>% qplot(x = adjacent_LD, data = ., facets = ~ chrom) # Apply over the chrom_mat and adjacent cor markers_keep <- list(chrom_mat, adjacent_LD) %>% pmap(function(cm, ac) { # Get the markers names marker_names <- colnames(cm) # Which markers are > the threshold or NA? exclude_mar <- which(ac >= corr_threshold | is.na(ac)) + 1 # Exclude those markers marker_names[-exclude_mar] }) # Re-calculate LD adjacent_LD1 <- list(chrom_mat, markers_keep) %>% pmap(function(cm, mk) cm[,mk] ) %>% map(cor, use = "pairwise.complete") %>% lapply("[", ,-1, drop = FALSE) %>% map(diag) %>% map(abs) %>% map(function(x) x^2) # Re-plot data_frame(chrom = names(adjacent_LD1), adjacent_LD1) %>% unnest() %>% qplot(x = adjacent_LD1, data = ., facets = ~ chrom) # Unlist the kept markers markers_keep <- unlist(markers_keep) ### Combine the datasets and prep for final imputation # Merge s2_semifinal_genos_hmp <- full_join(s2tp_imputed_genos_hmp, s2c1_imputed_genos_hmp) %>% filter(`rs#` %in% markers_keep) # Run missingness stats mar_missing <- s2_semifinal_genos_hmp %>% select(-`rs#`:-cM_pos) %>% is.na() %>% rowMeans() hist(mar_missing) # Filter on missing data s2_semifinal_genos_hmp1 <- s2_semifinal_genos_hmp %>% filter(mar_missing <= 0.80) mar_missing1 <- s2_semifinal_genos_hmp1 %>% select(-`rs#`:-cM_pos) %>% is.na() %>% rowMeans() hist(mar_missing1) # Plot marker distribution s2_semifinal_genos_hmp1 %>% qplot(x = pos, data = ., facets = ~ chrom, bins = 50) s2_geno_mat <- s2_semifinal_genos_hmp1 %>% select(-`rs#`:-cM_pos) %>% t() colnames(s2_geno_mat) <- s2_semifinal_genos_hmp1$`rs#` # Save ## # These files should be saved in the `imp_dir` directory (see above) save("s2_geno_mat", file = file.path(imp_dir, "s2_semifinal_genos.RData")) ## # These files should be saved in the `final_dir` directory (see above) write_tsv(x = s2_semifinal_genos_hmp1, path = file.path(final_dir, "s2_final_discrete_genos_hmp.txt")) ## Execution # Load data for imputation ## # These files should be saved in the `imp_dir` directory (see above) load(file.path(imp_dir, "s2_semifinal_genos.RData")) # Run EM imputation imputed <- A.mat(X = s2_geno_mat, min.MAF = 0, max.missing = 1, impute.method = "EM", return.imputed = T)$imputed ## # These files should be saved in the `imp_dir` directory (see above) save("imputed", file = file.path(imp_dir, "s2_final_imputed_mat.RData")) #### Follow-Up # Load the imputed genotpyes ## # These files should be saved in the `imp_dir` directory (see above) load(file.path(imp_dir, "EM", "s2_final_imputed_mat.RData")) # Load the snp info ## # These files should be saved in the `final_dir` directory (see above) s2_final_discrete_genos_hmp <- read_tsv(file = file.path(final_dir, "s2_final_discrete_genos_hmp.txt")) # Extract the snp info snp_info <- s2_final_discrete_genos_hmp %>% select(`rs#`:cM_pos) imputed <- imputed$imputed # Transpose the matrix and merge with the SNP info s2_imputed_genos <- imputed %>% t() %>% cbind(snp_info, .) %>% tbl_df() # Save ## # These files should be saved in the `final_dir` directory (see above) write_tsv(s2_imputed_genos, path = file.path(final_dir, "s2_final_imputed_genos_hmp.txt")) # Load the discrete hapmap s2_discrete_genos <- s2_final_discrete_genos_hmp # Save as Rdata ## # These files should be saved in the `final_dir` directory (see above) save("s2_discrete_genos", "s2_imputed_genos", file = file.path(final_dir, "s2_genos_hmp.RData")) # Convert to useable matrices and save s2_discrete_mat <- s2_discrete_genos %>% select(-`rs#`:-cM_pos) %>% t() colnames(s2_discrete_mat) <- s2_discrete_genos$`rs#` s2_imputed_mat <- s2_imputed_genos %>% select(-`rs#`:-cM_pos) %>% t() colnames(s2_imputed_mat) <- s2_imputed_genos$`rs#` ## # These files should be saved in the `final_dir` directory (see above) save("s2_discrete_mat", "s2_imputed_mat", "snp_info", file = file.path(final_dir, "s2_genos_mat.RData"))