finemapping_to_ldsc.sos
54 lines
2.7 kB
#!/usr/bin/env sos-runner # fileformat=SOS1.0 # Author: Kushal Dey and Gao Wang # This is a pipeline to convert fine-mapping results to # LDSC input. The fine-mapping results are generated by # the `finemapping_results_wrangler.ipynb` pipeline # To see how it works, please run `./finemapping_to_ldsc.sos -h` [global] # Input data file, the `.gz` file from `finemapping_results_wrangler.ipynb` output parameter: data = path() # Chromosomes to work on parameter: chroms = [x+1 for x in range(22)] # Prefix for the BIM reference file parameter: reference_snp_prefix = path() # Output directory parameter: output_dir = path("somewhere/over_the_rainbow") fail_if(not data.is_file(), msg = "Please specify valid data file via --data") fail_if(len(glob.glob(f"{reference_snp_prefix:a}.*.bim")) == 0, msg = 'Please specify valid --reference-snp-prefix') [default] depends: R_library('data.table'), R_library('R.utils') input: data, for_each = 'chroms' output: f"{output_dir:a}/{output_dir:b}.{_chroms}.annot.gz" task: trunk_workers = 1, trunk_size = 1, walltime = '3m', mem = '2G', cores = 1, tags = f'{step_name}_{_output:bn}' R: expand = "${ }" library(data.table) postprocessed_dat = data.frame(fread("zcat ${_input}")) numchr = ${_chroms} chr_idx = which(postprocessed_dat$chr == numchr) if(length(chr_idx) > 0){ postprocessed_dat_chr = postprocessed_dat[which(postprocessed_dat$chr == numchr),] extract_ids_sub = postprocessed_dat_chr$variant_id unique_ids = unique(extract_ids_sub) max_pip = c() for(uid in 1:length(unique_ids)){ max_pip = c(max_pip, max(postprocessed_dat_chr[which(extract_ids_sub == unique_ids[uid]),"pip"])) } pre_annot_maxpip = cbind.data.frame(rep(numchr, length(max_pip)), unique_ids, max_pip) colnames(pre_annot_maxpip) = c("CHR", "SNP", "maxcpp") bimfile = data.frame(fread(paste0("${reference_snp_prefix}.", numchr, ".bim"))) annot_new = cbind.data.frame(bimfile[,1], bimfile[,4], bimfile[,2], bimfile[,3], rep(0, dim(bimfile)[1])) colnames(annot_new) = c("CHR", "BP", "SNP", "CM", "AN") common_snps = intersect(pre_annot_maxpip$SNP, annot_new$SNP) annot_new[match(common_snps, annot_new$SNP),5] = pre_annot_maxpip[match(common_snps, pre_annot_maxpip$SNP), "maxcpp"] }else{ bimfile = data.frame(fread(paste0("${reference_snp_prefix}.", numchr, ".bim"))) annot_new = cbind.data.frame(bimfile[,1], bimfile[,4], bimfile[,2], bimfile[,3], rep(0, dim(bimfile)[1])) colnames(annot_new) = c("CHR", "BP", "SNP", "CM", "AN") } fwrite(annot_new, file = "${_output:n}.tmp", sep = "\t", quote=FALSE, row.names=FALSE) R.utils::gzip("${_output:n}.tmp", destname=${_output:r}, overwrite=TRUE, remove=TRUE)