This pipeline consolidates results from various fine-mapping tools to uniform format, add rsID as necessary, and perform a simple "liftover" via rsID (not the formal UCSC liftOver) to generate output in HG37 and HG38 builds.
This pipeline was devised by Gao Wang (UChicago), with core implemention done by Kushal Dey (Harvard).
where snp_id will be rsID if some annotation files on rsID are provided and the update_variant_id workflow below was executed. Otherwise it will take the default format of chr:pos:ref:alt.
Because each fine-mapping software can have different output, the columns after the first 6 are arbitary. For SuSiE for example they are
chr pos ref alt variant_id locus_id pip cs cs_size cs_purity
For CAVIAR they are
chr pos ref alt variant_id locus_id pip cs_size
Naturally per locus cs related information will be redundant. This is one draw-back to using a flat table output format.
Here we also provide additional routines to process the data,
variant_id column using external annotations, for example by rsID.Notice that only the first 5 columns are necessary for these additional operations. The columns after the fifth can be arbitary and will be kept during the process.
--id-map-prefix and --id-map-suffix have to be valid.--coordinate-map-prefix and --coordinate-map-suffix have to be valid.%cd ~/GIT/github/fine-mapping
sos run workflow/finemapping_results_wrangler.ipynb -h
[global]
# Loci file
parameter: loci = path()
# summary statistics file prefix
parameter: ss_data_prefix = path()
# identifier for fine-mapping results to be extracted
parameter: pattern = "uniform.SuSiE_B.L_5.prior_0p005.res_var_false"
fail_if(not loci.is_file(), msg = 'Please specify valid path for --loci')
fail_if(ss_data_prefix.is_file(), msg = '--ss-data-prefix should be a path not a file (usually file without extension, if using input from my data wrangling pipeline)')
ss_data_prefix = ss_data_prefix.absolute()
chunks = [x.strip().split() for x in open(f'{loci:a}').readlines() if not x.strip().startswith('#')]
chunks = [x[3] if len(x) == 4 else "%s_%s_%s" % (x[0], x[1], x[2]) for x in chunks]
data = [f'{ss_data_prefix}/{x}/{x}.summary_stats.gz' for x in chunks]
if 'SuSiE' in pattern:
source = 'susie'
elif 'CAVIAR' in pattern:
source = 'caviar'
elif 'DAP' in pattern:
source = 'dap'
elif 'FINEMAP' in pattern:
source = 'finemap'
else:
raise ValueError("Invalid --pattern specification")
[default]
sos_run('consolidate+update_variant_id+update_coordinate')
# Consolidate fine-mapping results to a single file
[consolidate]
depends: R_library('rlang>=0.3.0'), R_library('dplyr'), R_library('data.table'), R_library('R.utils'), R_library('dscrutils') # can be installed via `devtools::install_github("stephenslab/dsc",subdir = "dscrutils", force = TRUE)`
# Keep PIP above these thresholds
parameter: pip_thresh = 0.05
# Round PIP to given digits
parameter: round_off = 6
ext = 'pkl' if source in ['dap'] else 'rds'
input: [x for x in paths([f'{ss_data_prefix}/{c}/{c}.{pattern}.{ext}' for c in chunks]) if x.is_file()]
output: f"{ss_data_prefix}.{pattern}.{loci:bn}.gz"
fail_if(len(_input) == 0, msg = f'Cannot find valid input files for given loci')
R: expand = "${ }", workdir = ss_data_prefix, stdout = f'{_output:n}.log'
# Here we define get_*_output functions for different output format
get_susie_output = function(unit, rds_file) {
cs_id = cs_size = cs_purity = rep(NA, length(rds_file$var_names))
num_cs = length(rds_file$sets$cs)
for(id in 1:num_cs){
idx = rds_file$sets$cs[[id]]
cs_id[idx] = names(rds_file$sets$cs)[id]
cs_size[idx] = length(rds_file$sets$cs[[id]])
cs_purity[idx] = rds_file$sets$purity[id,1]
}
out = cbind.data.frame(rep(unit, length(rds_file$var_names)),
rds_file$var_names,
rds_file$pip, cs_id, cs_size, cs_purity)
colnames(out) = c("locus_id", "variant_id", "pip", "cs", "cs_size", "cs_purity")
out[which(out[,3] >= ${pip_thresh} | !is.na(out[,4])), ]
}
get_caviar_output = function(unit, rds_file) {
snp = rds_file$snp
snp = snp[match(rds_file$var_names, snp$snp),]
cs_annot = rep(0, length(rds_file$var_names))
cs_annot[match(rds_file$set, rds_file$snp$snp)] = 1
out = cbind.data.frame(rep(unit, length(rds_file$var_names)),
rds_file$var_names,
snp$snp_prob,
cs_annot)
colnames(out) = c("gene_id", "var_id", "pip", "cs")
out[which(out[,3] >= ${pip_thresh} | out[,4] > 0), ]
}
get_dap_output = function(unit, rds_file) {
out = cbind.data.frame(rep(unit, length(rds_file$var_names)),
rds_file$var_names,
rds_file$snp$snp_prob,
rds_file$snp$cluster)
colnames(out) = c("gene_id", "var_id", "pip", "cs")
out[which(out[,3] >= ${pip_thresh} | out[,4] > 0),]
}
get_finemap_output = function(unit, rds_file) {
snp = rds_file$snp
snp = snp[match(rds_file$var_names, snp$snp),]
n = sum(cumsum(rds_file$set$config_prob) < 0.95) + 1
if(n > nrow(rds_file$set)){
rds_file$set = 0
}else{
rds_file$set = unlist(lapply(1:n, function(i) strsplit(as.character(rds_file$set[i,2]), ",")[[1]]))
rds_file$set = unique(as.vector(unlist(rds_file$set)))
}
cs_annot = rep(0, length(rds_file$var_names))
cs_annot[match(rds_file$set, rds_file$snp$snp)] = 1
out = cbind.data.frame(rep(unit, length(rds_file$var_names)),
rds_file$var_names,
snp$snp_prob,
cs_annot)
colnames(out) = c("gene_id", "var_id", "pip", "cs")
out[which(out[,3] >= ${pip_thresh} | out[,4] > 0), ]
}
is_float = function(x) {is.numeric(x) && !is.integer(x)}
# Data extraction script
suppressMessages(library(dplyr))
suppressMessages(library(data.table))
files = c(${paths([x.relative_to(ss_data_prefix) for x in _input]):r,})
processed_dat = list()
idx = 0
for (f in files) {
idx = idx + 1
rds_file = dscrutils::read_dsc(f)
unit = dirname(f)
processed_dat[[idx]] = get_${source}_output(unit, rds_file)
cat("We are at unit #", idx, ":", unit, "\n")
}
processed_dat = data.frame(rbindlist(processed_dat))
extract_coord = data.frame(do.call(rbind, lapply(processed_dat[,2], function(x) return(strsplit(as.character(x), ":")[[1]]))))
df = data.frame("chr" = extract_coord[,1], "pos" = extract_coord[,2], "ref" = extract_coord[,3], "alt" = extract_coord[,4],
"variant_id" = processed_dat[,2], "locus_id" = processed_dat[,1])
df = cbind.data.frame(df, processed_dat[, 3:ncol(processed_dat)])
df %>% mutate_if(is_float, ~round(., ${round_off})) -> df
df_sorted = df [order(df[,1], df[,2]),]
fwrite(df_sorted, file = "${_output:n}.tmp", sep = "\t", quote=FALSE, row.names=FALSE)
R.utils::gzip("${_output:n}.tmp", destname=${_output:r}, overwrite=TRUE, remove=TRUE)
# Update variant ID based on chrom and pos, from per chrom files (optional)
[update_variant_id]
depends: R_library('rlang>=0.3.0'), R_library('dplyr'), R_library('R.utils'), R_library('data.table')
# Path containing files for variant ID update rule
# Each file is a separate chromsome
parameter: id_map_prefix = path()
parameter: id_map_suffix = 'bim'
# columns first element for variant ID, 2nd element for genomic position
# is [2,4] for BIM files
parameter: columns = [2,4]
# chromosome identifiers
parameter: chroms = [x+1 for x in range(22)]
import glob
skip_if(len(glob.glob(f"{id_map_prefix:a}.*.{id_map_suffix}")) == 0, msg = 'Variant ID are not updated because no valid file is found using --id-map-prefix and --id-map-suffix')
output: f'{_input:n}.id_updated.gz'
R: expand = "${ }", stdout = f'{_output:n}.log'
suppressMessages(library(dplyr))
suppressMessages(library(data.table))
out = data.frame(fread("zcat ${_input}"))
out %>% mutate_if(is.factor, as.character) -> out
chroms = c(${paths(chroms):r,})
for(numchr in chroms){
which_chr = which(out$chr == numchr)
out_sub = out[which_chr, ]
dbfile = data.frame(fread(paste0("${id_map_prefix}.", numchr, ".${id_map_suffix}")))
out_sub_new = out_sub
idx1 = match(out_sub$pos, dbfile$V${columns[1]})
idx2 = idx1[which(!is.na(idx1))]
idx3 = 1:length(out_sub$pos)
idx4 = idx3[which(!is.na(idx1))]
out_sub_new[idx4, "variant_id"] = dbfile[idx2,"V${columns[0]}"]
out[which_chr, ] = out_sub_new
cat("Variant IDs updated for chromosome", numchr, "\n")
}
fwrite(out, file = "${_output:n}.tmp", sep = "\t", quote=FALSE, row.names=FALSE)
R.utils::gzip("${_output:n}.tmp", destname=${_output:r}, overwrite=TRUE, remove=TRUE)
# Update genomic coordinates based on variant ID, from per chrom files (optional)
[update_coordinate]
depends: R_library('rlang>=0.3.0'), R_library('dplyr'), R_library('data.table'), R_library('R.utils')
# Path containing files for coordinate update rule
# Each file is a separate chromsome
parameter: coordinate_map_prefix = path()
parameter: coordinate_map_suffix = 'bim'
# coordinate identifier
parameter: coordinate_version_id = 'hgX'
# columns first element for variant ID, 2nd element for genomic position
# 3rd for reference allele and 4th for alternative allele
# is [2,4,5,6] for BIM files
parameter: columns = [2,4,5,6]
# chromosome identifiers
parameter: chroms = [x+1 for x in range(22)]
import glob
skip_if(len(glob.glob(f"{coordinate_map_prefix:a}.*.{coordinate_map_suffix}")) == 0, msg = 'Genomic coordinates are not updated because no valid file is found using --coordinate-map-prefix and --coordinate-map-suffix')
output: f'{_input:n}.{coordinate_version_id}.gz'.replace('.id_updated.', '.')
R: expand = "${ }", stdout = f'{_output:n}.log'
suppressMessages(library(dplyr))
suppressMessages(library(data.table))
out = data.frame(fread("zcat ${_input}"))
out %>% mutate_if(is.factor, as.character) -> out
chroms = c(${paths(chroms):r,})
out2 = c()
for(numchr in chroms){
dbfile = data.frame(fread(paste0("${coordinate_map_prefix}.", numchr, ".${coordinate_map_suffix}")))
out2_sub = out[which(out$chr == numchr), ]
idx1 = match(out2_sub$variant_id, dbfile$V${columns[0]})
idx2 = idx1[which(!is.na(idx1))]
idx3 = 1:length(out2_sub$variant_id)
idx4 = idx3[which(!is.na(idx1))]
out2_sub[idx4, c("pos", "ref", "alt")] = dbfile[idx2, c("V${columns[1]}", "V${columns[2]}", "V${columns[3]}")]
out2 = rbind.data.frame(out2, out2_sub[idx4,])
cat("Genomic coordinate updated for chromosome", numchr, "\n")
}
fwrite(out2, file = "${_output:n}.tmp", sep = "\t", quote=FALSE, row.names=FALSE)
R.utils::gzip("${_output:n}.tmp", destname=${_output:r}, overwrite=TRUE, remove=TRUE)
sos run workflow/finemapping_results_wrangler.ipynb \
--ss-data-prefix ~/tmp/01-Jan-2019 \
--pattern uniform.SuSiE_B.L_5.prior_0p005.res_var_false \
--id-map-prefix /project2/mstephens/SuSIE_gtex_CPP/rsID_map/Hg38/1000G.EUR.hg38 \
--coordinate-map-prefix /project2/mstephens/SuSIE_gtex_CPP/rsID_map/Hg37/1000G.EUR.QC \
--coordinate-version-id hg37
Exported from workflow/finemapping_results_wrangler.ipynb committed by minqiao on Mon Jul 1 18:51:56 2019 revision 27, 69d4049