A procedure useful to create toy data from data bundle for methods development and closer look at real data of interest.
As an example we focus on genes selected due to the LD show-case table in the mash paper: see this table for motivation that these genes get selected.
Note that this workflow itself can take an arbitary list of genes down the line, via
sos run this_notebook.ipynb extract_gene_data --glist /path/to/genelist.txt
This can be useful to select genes of interest for specific analysis. See implementation below.
[global]
parameter: cwd = '~/Documents/GTEx'
parameter: glist = '~/Documents/GIT/github/gtex-eqtls/data/3mashgenes.txt'
rna_cnts = "~/Documents/GTEx/gtex7/rna-seq/GTEx_Data_2016-01-15_v7_RNA-seq_RNA-SeQCv1.1.8_gene_reads.gct.gz"
parameter: ann_file = "${cwd!a}/h5_formatted/${rna_cnts!bn}.annotation"
parameter: geno_file = "${cwd!a}/h5_formatted/GTEx7.Imputed.genotyped.filtered.cis.h5"
parameter: expr_file = "${cwd!a}/rna_processed/GTEx_Data_2016-01-15_v7_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.qnorm.std.h5"
[extract_gene_data]
# Given gene list, extract SNP and expression data from formatted database
input: geno_file, expr_file
output: "${cwd!a}/ToyExample/${glist!bn}.genotype.h5", "${cwd!a}/ToyExample/${glist!bn}.expr.h5"
task: workdir = cwd
python:
import pandas as pd
import os
# load gene annotation
ann = pd.read_csv(${ann_file!ar}, header = None, names = ['chr', 'gene'], sep = ' ', usecols = (0,3))
ann = {g:c for g, c in zip(ann['gene'], ann['chr'])}
# load gene list
genes = pd.read_csv(${glist!ar}, header = None)[0]
# purge existing toy
gfile = ${output[0]!r}
efile = ${output[1]!r}
if os.path.isfile(gfile):
os.remove(gfile)
if os.path.isfile(efile):
os.remove(efile)
# build & save genotypes
for g in genes:
g = g.split('.')[0]
if g not in ann:
print("Gene {} not found".format(g))
continue
chrom = ann[g]
geno = pd.read_hdf(${input[0]!r}, 'chr{}/{}'.format(chrom, g))
geno.to_hdf(gfile, 'chr{}/{}'.format(chrom, g), mode = 'a', complevel = 9, complib = 'zlib')
# extract & save expression
for key in pd.HDFStore(${input[1]!r}).keys():
expr = pd.read_hdf(${input[1]!r}, key).loc[genes]
expr.to_hdf(efile, key, mode = 'a', complevel = 9, complib = 'zlib')
%sosrun extract_gene_data
%sessioninfo