This pipeline consolidates results from various fine-mapping tools to uniform format, add rsID as necessary, and perform a simple "liftover" via rsID (not the formal UCSC liftOver
) to generate output in HG37 and HG38 builds.
This pipeline was devised by Gao Wang (UChicago), with core implemention done by Kushal Dey (Harvard).
where snp_id
will be rsID if some annotation files on rsID are provided and the update_variant_id
workflow below was executed. Otherwise it will take the default format of chr:pos:ref:alt
.
Because each fine-mapping software can have different output, the columns after the first 6 are arbitary. For SuSiE for example they are
chr pos ref alt variant_id locus_id pip cs cs_size cs_purity
For CAVIAR they are
chr pos ref alt variant_id locus_id pip cs_size
Naturally per locus cs
related information will be redundant. This is one draw-back to using a flat table output format.
Here we also provide additional routines to process the data,
variant_id
column using external annotations, for example by rsID.Notice that only the first 5 columns are necessary for these additional operations. The columns after the fifth can be arbitary and will be kept during the process.
--id-map-prefix
and --id-map-suffix
have to be valid.--coordinate-map-prefix
and --coordinate-map-suffix
have to be valid.%cd ~/GIT/github/fine-mapping
sos run workflow/finemapping_results_wrangler.ipynb -h
[global]
# Loci file
parameter: loci = path()
# summary statistics file prefix
parameter: ss_data_prefix = path()
# identifier for fine-mapping results to be extracted
parameter: pattern = "uniform.SuSiE_B.L_5.prior_0p005.res_var_false"
fail_if(not loci.is_file(), msg = 'Please specify valid path for --loci')
fail_if(ss_data_prefix.is_file(), msg = '--ss-data-prefix should be a path not a file (usually file without extension, if using input from my data wrangling pipeline)')
ss_data_prefix = ss_data_prefix.absolute()
chunks = [x.strip().split() for x in open(f'{loci:a}').readlines() if not x.strip().startswith('#')]
chunks = [x[3] if len(x) == 4 else "%s_%s_%s" % (x[0], x[1], x[2]) for x in chunks]
data = [f'{ss_data_prefix}/{x}/{x}.summary_stats.gz' for x in chunks]
if 'SuSiE' in pattern:
source = 'susie'
elif 'CAVIAR' in pattern:
source = 'caviar'
elif 'DAP' in pattern:
source = 'dap'
elif 'FINEMAP' in pattern:
source = 'finemap'
else:
raise ValueError("Invalid --pattern specification")
[default]
sos_run('consolidate+update_variant_id+update_coordinate')
# Consolidate fine-mapping results to a single file
[consolidate]
depends: R_library('rlang>=0.3.0'), R_library('dplyr'), R_library('data.table'), R_library('R.utils'), R_library('dscrutils') # can be installed via `devtools::install_github("stephenslab/dsc",subdir = "dscrutils", force = TRUE)`
# Keep PIP above these thresholds
parameter: pip_thresh = 0.05
# Round PIP to given digits
parameter: round_off = 6
ext = 'pkl' if source in ['dap'] else 'rds'
input: [x for x in paths([f'{ss_data_prefix}/{c}/{c}.{pattern}.{ext}' for c in chunks]) if x.is_file()]
output: f"{ss_data_prefix}.{pattern}.{loci:bn}.gz"
fail_if(len(_input) == 0, msg = f'Cannot find valid input files for given loci')
R: expand = "${ }", workdir = ss_data_prefix, stdout = f'{_output:n}.log'
# Here we define get_*_output functions for different output format
get_susie_output = function(unit, rds_file) {
cs_id = cs_size = cs_purity = rep(NA, length(rds_file$var_names))
num_cs = length(rds_file$sets$cs)
for(id in 1:num_cs){
idx = rds_file$sets$cs[[id]]
cs_id[idx] = names(rds_file$sets$cs)[id]
cs_size[idx] = length(rds_file$sets$cs[[id]])
cs_purity[idx] = rds_file$sets$purity[id,1]
}
out = cbind.data.frame(rep(unit, length(rds_file$var_names)),
rds_file$var_names,
rds_file$pip, cs_id, cs_size, cs_purity)
colnames(out) = c("locus_id", "variant_id", "pip", "cs", "cs_size", "cs_purity")
out[which(out[,3] >= ${pip_thresh} | !is.na(out[,4])), ]
}
get_caviar_output = function(unit, rds_file) {
snp = rds_file$snp
snp = snp[match(rds_file$var_names, snp$snp),]
cs_annot = rep(0, length(rds_file$var_names))
cs_annot[match(rds_file$set, rds_file$snp$snp)] = 1
out = cbind.data.frame(rep(unit, length(rds_file$var_names)),
rds_file$var_names,
snp$snp_prob,
cs_annot)
colnames(out) = c("gene_id", "var_id", "pip", "cs")
out[which(out[,3] >= ${pip_thresh} | out[,4] > 0), ]
}
get_dap_output = function(unit, rds_file) {
out = cbind.data.frame(rep(unit, length(rds_file$var_names)),
rds_file$var_names,
rds_file$snp$snp_prob,
rds_file$snp$cluster)
colnames(out) = c("gene_id", "var_id", "pip", "cs")
out[which(out[,3] >= ${pip_thresh} | out[,4] > 0),]
}
get_finemap_output = function(unit, rds_file) {
snp = rds_file$snp
snp = snp[match(rds_file$var_names, snp$snp),]
n = sum(cumsum(rds_file$set$config_prob) < 0.95) + 1
if(n > nrow(rds_file$set)){
rds_file$set = 0
}else{
rds_file$set = unlist(lapply(1:n, function(i) strsplit(as.character(rds_file$set[i,2]), ",")[[1]]))
rds_file$set = unique(as.vector(unlist(rds_file$set)))
}
cs_annot = rep(0, length(rds_file$var_names))
cs_annot[match(rds_file$set, rds_file$snp$snp)] = 1
out = cbind.data.frame(rep(unit, length(rds_file$var_names)),
rds_file$var_names,
snp$snp_prob,
cs_annot)
colnames(out) = c("gene_id", "var_id", "pip", "cs")
out[which(out[,3] >= ${pip_thresh} | out[,4] > 0), ]
}
is_float = function(x) {is.numeric(x) && !is.integer(x)}
# Data extraction script
suppressMessages(library(dplyr))
suppressMessages(library(data.table))
files = c(${paths([x.relative_to(ss_data_prefix) for x in _input]):r,})
processed_dat = list()
idx = 0
for (f in files) {
idx = idx + 1
rds_file = dscrutils::read_dsc(f)
unit = dirname(f)
processed_dat[[idx]] = get_${source}_output(unit, rds_file)
cat("We are at unit #", idx, ":", unit, "\n")
}
processed_dat = data.frame(rbindlist(processed_dat))
extract_coord = data.frame(do.call(rbind, lapply(processed_dat[,2], function(x) return(strsplit(as.character(x), ":")[[1]]))))
df = data.frame("chr" = extract_coord[,1], "pos" = extract_coord[,2], "ref" = extract_coord[,3], "alt" = extract_coord[,4],
"variant_id" = processed_dat[,2], "locus_id" = processed_dat[,1])
df = cbind.data.frame(df, processed_dat[, 3:ncol(processed_dat)])
df %>% mutate_if(is_float, ~round(., ${round_off})) -> df
df_sorted = df [order(df[,1], df[,2]),]
fwrite(df_sorted, file = "${_output:n}.tmp", sep = "\t", quote=FALSE, row.names=FALSE)
R.utils::gzip("${_output:n}.tmp", destname=${_output:r}, overwrite=TRUE, remove=TRUE)
# Update variant ID based on chrom and pos, from per chrom files (optional)
[update_variant_id]
depends: R_library('rlang>=0.3.0'), R_library('dplyr'), R_library('R.utils'), R_library('data.table')
# Path containing files for variant ID update rule
# Each file is a separate chromsome
parameter: id_map_prefix = path()
parameter: id_map_suffix = 'bim'
# columns first element for variant ID, 2nd element for genomic position
# is [2,4] for BIM files
parameter: columns = [2,4]
# chromosome identifiers
parameter: chroms = [x+1 for x in range(22)]
import glob
skip_if(len(glob.glob(f"{id_map_prefix:a}.*.{id_map_suffix}")) == 0, msg = 'Variant ID are not updated because no valid file is found using --id-map-prefix and --id-map-suffix')
output: f'{_input:n}.id_updated.gz'
R: expand = "${ }", stdout = f'{_output:n}.log'
suppressMessages(library(dplyr))
suppressMessages(library(data.table))
out = data.frame(fread("zcat ${_input}"))
out %>% mutate_if(is.factor, as.character) -> out
chroms = c(${paths(chroms):r,})
for(numchr in chroms){
which_chr = which(out$chr == numchr)
out_sub = out[which_chr, ]
dbfile = data.frame(fread(paste0("${id_map_prefix}.", numchr, ".${id_map_suffix}")))
out_sub_new = out_sub
idx1 = match(out_sub$pos, dbfile$V${columns[1]})
idx2 = idx1[which(!is.na(idx1))]
idx3 = 1:length(out_sub$pos)
idx4 = idx3[which(!is.na(idx1))]
out_sub_new[idx4, "variant_id"] = dbfile[idx2,"V${columns[0]}"]
out[which_chr, ] = out_sub_new
cat("Variant IDs updated for chromosome", numchr, "\n")
}
fwrite(out, file = "${_output:n}.tmp", sep = "\t", quote=FALSE, row.names=FALSE)
R.utils::gzip("${_output:n}.tmp", destname=${_output:r}, overwrite=TRUE, remove=TRUE)
# Update genomic coordinates based on variant ID, from per chrom files (optional)
[update_coordinate]
depends: R_library('rlang>=0.3.0'), R_library('dplyr'), R_library('data.table'), R_library('R.utils')
# Path containing files for coordinate update rule
# Each file is a separate chromsome
parameter: coordinate_map_prefix = path()
parameter: coordinate_map_suffix = 'bim'
# coordinate identifier
parameter: coordinate_version_id = 'hgX'
# columns first element for variant ID, 2nd element for genomic position
# 3rd for reference allele and 4th for alternative allele
# is [2,4,5,6] for BIM files
parameter: columns = [2,4,5,6]
# chromosome identifiers
parameter: chroms = [x+1 for x in range(22)]
import glob
skip_if(len(glob.glob(f"{coordinate_map_prefix:a}.*.{coordinate_map_suffix}")) == 0, msg = 'Genomic coordinates are not updated because no valid file is found using --coordinate-map-prefix and --coordinate-map-suffix')
output: f'{_input:n}.{coordinate_version_id}.gz'.replace('.id_updated.', '.')
R: expand = "${ }", stdout = f'{_output:n}.log'
suppressMessages(library(dplyr))
suppressMessages(library(data.table))
out = data.frame(fread("zcat ${_input}"))
out %>% mutate_if(is.factor, as.character) -> out
chroms = c(${paths(chroms):r,})
out2 = c()
for(numchr in chroms){
dbfile = data.frame(fread(paste0("${coordinate_map_prefix}.", numchr, ".${coordinate_map_suffix}")))
out2_sub = out[which(out$chr == numchr), ]
idx1 = match(out2_sub$variant_id, dbfile$V${columns[0]})
idx2 = idx1[which(!is.na(idx1))]
idx3 = 1:length(out2_sub$variant_id)
idx4 = idx3[which(!is.na(idx1))]
out2_sub[idx4, c("pos", "ref", "alt")] = dbfile[idx2, c("V${columns[1]}", "V${columns[2]}", "V${columns[3]}")]
out2 = rbind.data.frame(out2, out2_sub[idx4,])
cat("Genomic coordinate updated for chromosome", numchr, "\n")
}
fwrite(out2, file = "${_output:n}.tmp", sep = "\t", quote=FALSE, row.names=FALSE)
R.utils::gzip("${_output:n}.tmp", destname=${_output:r}, overwrite=TRUE, remove=TRUE)
sos run workflow/finemapping_results_wrangler.ipynb \
--ss-data-prefix ~/tmp/01-Jan-2019 \
--pattern uniform.SuSiE_B.L_5.prior_0p005.res_var_false \
--id-map-prefix /project2/mstephens/SuSIE_gtex_CPP/rsID_map/Hg38/1000G.EUR.hg38 \
--coordinate-map-prefix /project2/mstephens/SuSIE_gtex_CPP/rsID_map/Hg37/1000G.EUR.QC \
--coordinate-version-id hg37
Exported from workflow/finemapping_results_wrangler.ipynb
committed by minqiao on Mon Jul 1 18:51:56 2019 revision 27, 69d4049