## Spring two-row GBS marker data processing ## ## This script will take GBS data for the spring two-row (S2) barley population, ## perform filtration and imputation, and output processed files. ## ## Author: Jeff Neyhart ## Last Modified: 24 July 2019 ## # Libraries library(tidyverse) ## Functions # Function to measure marker/line missingness measure.missing <- function(x, type = c("numeric", "nucleotide")) { if (type == "numeric") { return( sum(is.na(x)) / length(x) ) } if (type == "nucleotide") { return( sum(x == "NN") / length(x) ) } } # Function to measure marker/line heterozygosity measure.het <- function(x, type = "numeric") { if (type == "numeric") { snp.recode <- na.omit(x) return( sum(snp.recode == 0) / length(snp.recode) ) } if (type == "nucleotide") { snp.recode <- x[x != "NN"] n.het <- sum(sapply(X = strsplit(snp.recode, ""), FUN = function(site) length(unique(site)) == 2)) return( n.het / length(snp.recode) ) } } ## Set directories ## ## These need to be adjusted by the user prior to running the script ## # Location of hmp file that was converted from the VCF file geno_dir <- "path/to/folder/containing/raw/hapmap" # Location to store the processed hapmap files proc_geno_dir <- "path/to/folder/storing/processed/hapmap" # Location of entry data with pedigree information entry_dir <- "path/to/folder/containing/pedigee" # Read in the entry data # This CSV should be placed in the `entry_dir` directory (see above). entry_dat <- read_csv(file.path(entry_dir, "UMN_S2_pedigree.csv")) # Create character vectors with the S2TP lines, S2C1 lines, and S2C1.5 lines s2tp <- subset(entry_dat, population == "S2TP", line_name, drop = T) s2c1 <- subset(entry_dat, population == "S2C1", line_name, drop = T) s2c15 <- subset(entry_dat, population == "S2C1.5", line_name, drop = T) ## Read in the HapMap file with all samples and variants # This file should be saved in the `geno_dir` directory (see above). gbs_hmp <- read_tsv(file = file.path(geno_dir, "2row_GBS_final_snps_newpos_renamed_hmp.txt")) %>% mutate_if(is.logical, as.numeric) ## Prepare data for each specific population ### S2TP # Subset the larger hapmap for S2TP lines (and duplicates) s2tp_gbs_hmp <- gbs_hmp %>% select(1:4, which(str_detect(names(.), "^[0-9]{2}"))) # In this dataset, several entries in the training population were genotyped two # or three times. Since these lines are highly inbred and homogenous, it is useful # to combine the data across these replicates. # # The procedure will be: 1. Find the duplicate with the lowest missing # data and keep that one or 2. Combine the marker data in duplicates # First find duplicates dup_entries <- names(s2tp_gbs_hmp) %>% str_replace_all(string = ., pattern = "\\.[0-9]*$", "") %>% {table(.) > 1} %>% which() %>% names() # Define a function to take a genotype matrix and perform site-wise tests of agreement # Returns the proportion of agreement among non-missing sites identical_sites <- function(genos) { site.compare <- apply(X = genos, MARGIN = 1, FUN = function(site) { if (any(is.na(site))) { return(NA) } else { if (length(unique(as.character(site))) == 1) { return(1) } else { return(0) }} }) sum(site.compare, na.rm = T) / sum(!is.na(site.compare)) } # Iterate over duplicates and find the number of non-missing site pairs that are the same dup_comparison <- lapply(X = dup_entries, FUN = function(ent) { ent_genos <- s2tp_gbs_hmp %>% select(matches(ent)) # Perform an agreement test on all duplicates prop_sites_identical <- identical_sites(ent_genos) # If there are more than 2 duplicates, also perform pairwise concurrence tests if (ncol(ent_genos) > 2) { comb_mat <- combn(colnames(ent_genos), 2) %>% t() %>% as.data.frame() %>% rename_all(~c("entry1", "entry2")) # Iterate over the combinations pairwise_conc <- apply(X = comb_mat, MARGIN = 1, FUN = function(combo) identical_sites(genos = ent_genos[,combo]) ) comb_mat <- comb_mat %>% mutate(pairwise_conc = pairwise_conc) } else { comb_mat <- NA } return(list(prop_sites_identical = prop_sites_identical, comb_mat = comb_mat)) }) names(dup_comparison) <- dup_entries # The result is a list of comparisons only for those lines that are duplicated. If # we look at the third duplicate, we see that more than 99% of sites are identical # between the duplicates. dup_comparison[[3]] # There are others where two duplicates are present (i.e. the entry was genotyped # three times). Sometimes, one of the three is an outlier, so the pairwise comparisons # of site agreement can be thrown off. Here is an instance where the entry `08N2-77` # was probably poorly genotyped dup_comparison$`08N2-77` # Other times there is high agreement among all three duplicates dup_comparison$`06N2-15` # If we look at all of the comparisons, we can see that most have very high agreement. hist(sapply(X = dup_comparison, FUN = function(comp) comp[[1]])) # Now we will go through and merge the duplicates that are in high agreement. The # procedure for doing this is as follows: 1. Determine among the duplicates which # has the lowest heterozygosity. Entries with high heterozygosity are likely errors, # since the lines were inbred to at least the F_6 generation, 2. Iterate over the # duplicates and apply these steps: a. if the proportion of agreement is >= 0.95, # iterate over sites, b. if sites among all duplicates are missing, output missing, # c. if one duplicate is missing, but the other is not, output the non-missing call, # d. if both duplicates are non-missing and agree, output that call, e. if both # duplicates are non-missing but do not agree, prioritize the duplicate with the # lowest heterozygosity, f. if the agreement proportion is not >= 0.95, output all # of the genotype information from the duplicate with the lowest heterozygosity. # Set a comparison cutoff for merging merge_cutoff = 0.95 # Let's see how many of the comparisons meet that cutoff sum(map_dbl(dup_comparison, "prop_sites_identical") >= merge_cutoff) # Create merged genotype data from duplicates merged_genos <- lapply(X = dup_entries, FUN = function(ent) { # Pull out the comparison data ent_comparison <- dup_comparison[[ent]] # Pull out the genotype data ent_genos <- s2tp_gbs_hmp %>% select(matches(ent)) # Determine which entry has the lowest heterozygosity # Measure missingness (as number of missing) and hets (as number of hets) ent_stats <- apply(X = ent_genos, MARGIN = 2, FUN = function(entry) { missing <- measure.missing(entry) het <- measure.het(entry) cbind(missing = missing, het = het) }) # Measure heterozygosity # Proportion of hets per non-missing site least_het <- which.min(ent_stats[2,]) # If the comb_mat is empty, look for the comparison across all duplicates if (all(is.na(ent_comparison$comb_mat))) { # If the level of concordance is >= 0.95, then merge if (ent_comparison$prop_sites_identical >= merge_cutoff) { apply(X = ent_genos, MARGIN = 1, FUN = function(site) { # If both are missing, output missing if (all(is.na(site))) { return(NA) } else { # If one site over the entries is non-missing, assign that site if (sum(!is.na(site)) == 1) { return(site[!is.na(site)]) } else { # The only other option is that both sites are non-missing # If the sites are the same, output that call if ((length(unique(as.character(site))) == 1)) { return(site[1]) # If not return the call from the entry with the least het } else { return(site[least_het]) }}}}) # If the concordance is less than 0.95, pick the entry will the least het } else { return(ent_genos[,least_het]) } # The other scenario is that the entry was duplicated > 1 (i.e. 3 or more) } else { # If all comparisons are greater than 0.95, merge together if (all(ent_comparison$comb_mat[,"pairwise_conc"] >= merge_cutoff)) { apply(X = ent_genos, MARGIN = 1, FUN = function(site) { # If both are missing, output missing if (all(is.na(site))) { return(NA) } else { # If one site over the entries is non-missing, assign that site if (sum(!is.na(site)) == 1) { return(site[!is.na(site)]) } else { # The only other option is that both sites are non-missing # If the sites are the same, output that call if ((length(unique(as.character(site))) == 1)) { return(site[1]) # If not return the call from the entry with the least het } else { return(site[least_het]) }}}}) # If not all of the concordance comparisons are >= the merge cutoff, find ## comparisons with the greatest concordance above the cutoff } else { good_comparison <- subset(ent_comparison$comb_mat, pairwise_conc >= merge_cutoff) # Find the good entries good_ent <- as.character(unlist(select(good_comparison, -pairwise_conc), use.names = FALSE)) # Subset the ent_genos matrix good_ent_genos <- ent_genos[,good_ent] ent_stats <- apply(X = good_ent_genos, MARGIN = 2, FUN = function(entry) { missing <- measure.missing(entry) het <- measure.het(entry) cbind(missing, het) }) # Measure heterozygosity # Proportion of hets per non-missing site least_het <- which.min(ent_stats[2,]) # Perform the regular merging procedure apply(X = good_ent_genos, MARGIN = 1, FUN = function(site) { # If both are missing, output missing if (all(is.na(site))) { return(NA) } else { # If one site over the entries is non-missing, assign that site if (sum(!is.na(site)) == 1) { return(site[!is.na(site)]) } else { # The only other option is that both sites are non-missing # If the sites are the same, output that call if ((length(unique(as.character(site))) == 1)) { return(site[1]) # If not return the call from the entry with the least het } else { return(site[least_het]) }}}}) }}}); names(merged_genos) <- dup_entries # Create a matrix merged_dups <- do.call("cbind", merged_genos) # Rename colnames(merged_dups) <- str_remove(colnames(merged_dups), "\\.[0-9]*$") # Now we can merge all the data together for the S2TP # Combine the hapmap info, the genotypes from non-duplicated entries, and the ## merged / unmerged genotypes from the duplicated entries s2tp_gbs_hmp1 <- s2tp_gbs_hmp %>% select(which(!str_detect(string = names(s2tp_gbs_hmp), pattern = str_c(dup_entries, collapse = "|")))) %>% bind_cols(merged_dups) %>% select(1:4, sort(names(.))) # # The next step to prepare the data for imputation is to filter the marker dataset # for extreme missingness and minor allele frequency. # SNP missingness snp_missingness <- s2tp_gbs_hmp1 %>% select(-`rs#`:-pos) %>% is.na() %>% rowMeans() hist(snp_missingness) # Remove complete missing snps s2tp_gbs_hmp2 <- s2tp_gbs_hmp1 %>% filter(snp_missingness < 1) # Remeasure missingness snp_missingness1 <- s2tp_gbs_hmp2 %>% select(-`rs#`:-pos) %>% is.na() %>% rowMeans() hist(snp_missingness1) # Measure maf snp_maf <- s2tp_gbs_hmp2 %>% select(-`rs#`:-pos) %>% {. + 1 } %>% # Add one to get the number of 1 alleles at each site rowMeans(na.rm = T) %>% # Divide by 2 to get the frequency {. / 2} %>% pmin(., 1 - .) # SFS hist(snp_maf) # Measure SNP heterozygosity snp_het <- s2tp_gbs_hmp2 %>% select(-`rs#`:-pos) %>% apply(MARGIN = 1, FUN = function(snp) sum(snp == 0, na.rm = T) / length(snp)) hist(snp_het) # This looks really good # Establish cutoffs for missingness and MAF # Filter SNPs on a heterzygostiy filter of 0.08, which is close to the expected heterozygosity # of F_5 plants (0.0625) to_keep <- which(snp_maf >= 0.05 & snp_missingness1 <= 0.80 & snp_het <= 0.08) # Filter the dataset s2tp_gbs_hmp3 <- s2tp_gbs_hmp2[to_keep,] # Measure the entry missingness entry_missing <- s2tp_gbs_hmp3 %>% select(-`rs#`:-pos) %>% is.na() %>% colMeans() hist(entry_missing) # Measure entry heterzygosity entry_het <- s2tp_gbs_hmp3 %>% summarise_at(vars(-`rs#`:-pos), list(~sum(. == 0, na.rm = T) / length(.))) %>% as.numeric() hist(entry_het) # Filter entries for missingness to_keep <- which(entry_missing <= 0.80) # Get the entry list line_names <- s2tp_gbs_hmp3 %>% select(-`rs#`:-pos) %>% names() # Looks like we lost 8 entries (lines_to_remove <- setdiff(line_names, line_names[to_keep])) s2tp_gbs_hmp4 <- s2tp_gbs_hmp3 %>% select(which(!names(.) %in% lines_to_remove)) # Remeasure SNP missingness, MAF snp_maf1 <- s2tp_gbs_hmp4 %>% select(-`rs#`:-pos) %>% {. + 1 } %>% # Add one to get the number of 1 alleles at each site rowMeans(na.rm = T) %>% # Divide by 2 to get the frequency {. / 2} %>% pmin(., 1 - .) hist(snp_maf1) snp_missingness2 <- s2tp_gbs_hmp4 %>% select(-`rs#`:-pos) %>% is.na() %>% rowMeans() hist(snp_missingness2) # Save this file # This file should be saved in the `proc_geno_dir` directory (see above). write_tsv(x = s2tp_gbs_hmp4, path = file.path(proc_geno_dir, "s2tp_filtered_genos_hmp.txt")) ### S2C1 # The genotypes for the S2C1 are nicely structured: we have 40 families of 30 # individuals each. We need to perform some filtering before running imputation. # Subset the larger hapmap for S2C1 lines s2c1_gbs_hmp <- gbs_hmp %>% select(1:4, which(names(.) %in% s2c1)) # The next step to prepare the data for imputation is to filter the marker dataset # for extreme missingness and minor allele frequency. # SNP missingness snp_missingness <- s2c1_gbs_hmp %>% select(-`rs#`:-pos) %>% is.na() %>% rowMeans() hist(snp_missingness) # Remove complete missing snps s2c1_gbs_hmp2 <- s2c1_gbs_hmp %>% filter(snp_missingness < 1) # Remeasure missingness snp_missingness1 <- s2c1_gbs_hmp2 %>% select(-`rs#`:-pos) %>% is.na() %>% rowMeans() hist(snp_missingness1) # Measure maf snp_maf <- s2c1_gbs_hmp2 %>% select(-`rs#`:-pos) %>% {. + 1 } %>% # Add one to get the number of 1 alleles at each site rowMeans(na.rm = T) %>% # Divide by 2 to get the frequency {. / 2} %>% pmin(., 1 - .) # SFS hist(snp_maf) # Measure SNP heterozygosity snp_het <- s2c1_gbs_hmp2 %>% select(-`rs#`:-pos) %>% apply(MARGIN = 1, FUN = function(snp) sum(snp == 0, na.rm = T) / length(snp)) hist(snp_het) # This looks really good # Establish cutoffs for missingness and MAF # Filter SNPs on a heterzygostiy filter of 0.35 to_keep <- which(snp_maf >= 0.02 & snp_missingness1 <= 0.80 & snp_het <= 0.35) # Filter the dataset s2c1_gbs_hmp3 <- s2c1_gbs_hmp2[to_keep,] # Measure the entry missingness entry_missing <- s2c1_gbs_hmp3 %>% select(-`rs#`:-pos) %>% is.na() %>% colMeans() hist(entry_missing) # Measure entry heterzygosity entry_het <- s2c1_gbs_hmp3 %>% summarise_at(vars(-`rs#`:-pos), funs(sum(. == 0, na.rm = T) / length(.))) %>% as.numeric() hist(entry_het) # Filter entries for missingness and heterozygosity to_keep <- which(entry_missing <= 0.80 & entry_het <= 0.35) # Get the entry list line_names <- s2c1_gbs_hmp3 %>% select(-`rs#`:-pos) %>% names() # Looks like we lost 86 entries (lines_to_remove <- setdiff(line_names, line_names[to_keep])) s2c1_gbs_hmp4 <- s2c1_gbs_hmp3 %>% select(which(!names(.) %in% lines_to_remove)) # Remeasure SNP missingness, MAF snp_maf1 <- s2c1_gbs_hmp4 %>% select(-`rs#`:-pos) %>% {. + 1 } %>% # Add one to get the number of 1 alleles at each site rowMeans(na.rm = T) %>% # Divide by 2 to get the frequency {. / 2} %>% pmin(., 1 - .) hist(snp_maf1) snp_missingness2 <- s2c1_gbs_hmp4 %>% select(-`rs#`:-pos) %>% is.na() %>% rowMeans() hist(snp_missingness2) # Save this file # This file should be saved in the `proc_geno_dir` directory (see above). write_tsv(x = s2c1_gbs_hmp4, path = file.path(proc_geno_dir, "s2c1_filtered_genos_hmp.txt")) ### S2C1.5 # Subset the larger hapmap for S2C1.5 lines s2c15_gbs_hmp <- gbs_hmp %>% select(1:4, which(names(.) %in% s2c15)) # Again we don't need to combine any data, so we can # simply delve into quality measurements and filters ## SNP missingness # First let's look at snp missingness snp.missingness <- s2c15_gbs_hmp %>% select(-`rs#`:-pos) %>% is.na() %>% rowMeans() hist(snp.missingness) ## Remove completely missing SNPs and remeasure missingness # Remove completely missing SNPs s2c15_gbs_hmp1 <- s2c15_gbs_hmp %>% filter(snp.missingness < 1) # Remeasure missingness, and also measure MAF snp.missingness1 <- s2c15_gbs_hmp1 %>% select(-`rs#`:-pos) %>% is.na() %>% rowMeans() hist(snp.missingness1) ## Measure SNP MAF # MAF snp.maf <- s2c15_gbs_hmp1 %>% select(-`rs#`:-pos) %>% + 1 %>% # Add one to get the number of 1 alleles at each site rowMeans(na.rm = T) %>% # Divide by 2 to get the frequency {. / 2} %>% sapply(FUN = function(freq) min(freq, 1 - freq)) hist(snp.maf) ## Measure SNP heterozygosity snp.het <- s2c15_gbs_hmp1 %>% select(-`rs#`:-pos) %>% apply(MARGIN = 1, FUN = function(snp) sum(snp == 0, na.rm = T) / sum(!is.na(snp))) hist(snp.het) ## Remove SNPs with high missigness, very low MAF, and very high het snps.to.keep <- which(snp.maf >= 0.1 & snp.het <= 0.35 & snp.missingness1 <= 0.8) # Keep the designated snps s2c15_gbs_hmp2 <- s2c15_gbs_hmp1 %>% slice(snps.to.keep) # Remeasure missingness snp.missingness2 <- s2c15_gbs_hmp2 %>% select(-`rs#`:-pos) %>% is.na() %>% rowMeans() hist(snp.missingness2) ## Remeasure MAF snp.maf1 <- s2c15_gbs_hmp2 %>% select(-`rs#`:-pos) %>% + 1 %>% # Add one to get the number of 1 alleles at each site rowMeans(na.rm = T) %>% # Divide by 2 to get the frequency {. / 2} %>% sapply(FUN = function(freq) min(freq, 1 - freq)) hist(snp.maf1) ## Remeasure heterozygosity snp.het1 <- s2c15_gbs_hmp2 %>% select(-`rs#`:-pos) %>% apply(MARGIN = 1, FUN = function(snp) sum(snp == 0, na.rm = T) / sum(!is.na(snp))) hist(snp.het1) ## Measure entry missingness entry.missing <- s2c15_gbs_hmp2 %>% select(-`rs#`:-pos) %>% summarize_each(funs = funs(mean(is.na(.))) ) %>% as.numeric() hist(entry.missing) ## Measure entry heterozygosity entry.het <- s2c15_gbs_hmp2 %>% select(-`rs#`:-pos) %>% summarise_each(funs = funs(sum(. == 0, na.rm = T) / sum(!is.na(.)))) %>% as.numeric() hist(entry.het) ## Filter entries for missingness and heterozygosity - then save entry.to.remove <- which(entry.missing > 0.80 | entry.het > 0.50) # Looks like we lost 10 entries length(entry.to.remove) # Pull out the lines lines.to.remove <- s2c15_gbs_hmp2 %>% select(-`rs#`:-pos) %>% names() %>% .[entry.to.remove] # Filter s2c15_gbs_hmp3 <- s2c15_gbs_hmp2 %>% select(which(!names(.) %in% lines.to.remove)) # Save this file # This file should be saved in the `proc_geno_dir` directory (see above). write_tsv(x = s2c15_gbs_hmp3, path = file.path(proc_geno_dir, "s2c15_filtered_genos_hmp.txt")) ## Combine the filtered marker sets # Read in the filtered hapmap datasets # This file should be saved in the `proc_geno_dir` directory (see above). s2tp_hmp <- read_tsv(file = file.path(proc_geno_dir, "s2tp_filtered_genos_hmp.txt"), progress = F) s2c1_hmp <- read_tsv(file = file.path(proc_geno_dir, "s2c1_filtered_genos_hmp.txt"), progress = F) s2c15_hmp <- read_tsv(file = file.path(proc_geno_dir, "s2c15_filtered_genos_hmp.txt"), progress = F) # Combine on marker name combined_hmp <- Reduce(full_join, list(s2tp_hmp, s2c1_hmp, s2c15_hmp)) # combined_hmp <- full_join(s2tp_hmp, s2c1_hmp) ## Sort # Sort on chromosome and position combined_hmp1 <- combined_hmp %>% arrange(chrom, pos) %>% mutate(chrom = replace(chrom, chrom == "UNK", 8)) # Save as one file # This file should be saved in the `proc_geno_dir` directory (see above). write_tsv(x = combined_hmp1, path = file.path(proc_geno_dir, "s2_filtered_genos_hmp.txt"))