54 lines 2.7 kB
            #!/usr/bin/env sos-runner
# fileformat=SOS1.0
# Author: Kushal Dey and Gao Wang

# This is a pipeline to convert fine-mapping results to
# LDSC input. The fine-mapping results are generated by
# the `finemapping_results_wrangler.ipynb` pipeline
# To see how it works, please run `./finemapping_to_ldsc.sos -h`

[global]
# Input data file, the `.gz` file from `finemapping_results_wrangler.ipynb` output
parameter: data = path()
# Chromosomes to work on
parameter: chroms = [x+1 for x in range(22)]
# Prefix for the BIM reference file
parameter: reference_snp_prefix = path()
# Output directory
parameter: output_dir = path("somewhere/over_the_rainbow")

fail_if(not data.is_file(), msg = "Please specify valid data file via --data")
fail_if(len(glob.glob(f"{reference_snp_prefix:a}.*.bim")) == 0, msg = 'Please specify valid --reference-snp-prefix')

[default]
depends: R_library('data.table'), R_library('R.utils')
input: data, for_each = 'chroms'
output: f"{output_dir:a}/{output_dir:b}.{_chroms}.annot.gz"
task: trunk_workers = 1, trunk_size = 1, walltime = '3m', mem = '2G', cores = 1, tags = f'{step_name}_{_output:bn}'
R: expand = "${ }"
    library(data.table)
    postprocessed_dat = data.frame(fread("zcat ${_input}"))
    numchr = ${_chroms}
    chr_idx = which(postprocessed_dat$chr == numchr)
    if(length(chr_idx) > 0){
    postprocessed_dat_chr = postprocessed_dat[which(postprocessed_dat$chr == numchr),]
    extract_ids_sub = postprocessed_dat_chr$variant_id
    unique_ids = unique(extract_ids_sub)
    max_pip = c()
    for(uid in 1:length(unique_ids)){
        max_pip = c(max_pip, max(postprocessed_dat_chr[which(extract_ids_sub == unique_ids[uid]),"pip"]))
    }
    pre_annot_maxpip = cbind.data.frame(rep(numchr, length(max_pip)), unique_ids, max_pip)
    colnames(pre_annot_maxpip) = c("CHR", "SNP", "maxcpp")
    bimfile = data.frame(fread(paste0("${reference_snp_prefix}.", numchr, ".bim")))
    annot_new = cbind.data.frame(bimfile[,1], bimfile[,4], bimfile[,2], bimfile[,3], rep(0, dim(bimfile)[1]))
    colnames(annot_new) = c("CHR", "BP", "SNP", "CM", "AN")
    common_snps = intersect(pre_annot_maxpip$SNP, annot_new$SNP)
    annot_new[match(common_snps, annot_new$SNP),5] = pre_annot_maxpip[match(common_snps, pre_annot_maxpip$SNP), "maxcpp"]
    }else{
    bimfile = data.frame(fread(paste0("${reference_snp_prefix}.", numchr, ".bim")))
    annot_new = cbind.data.frame(bimfile[,1], bimfile[,4], bimfile[,2], bimfile[,3], rep(0, dim(bimfile)[1]))
    colnames(annot_new) = c("CHR", "BP", "SNP", "CM", "AN")
    }
    fwrite(annot_new, file = "${_output:n}.tmp", sep = "\t", quote=FALSE, row.names=FALSE)
    R.utils::gzip("${_output:n}.tmp", destname=${_output:r}, overwrite=TRUE, remove=TRUE)