Adapted from my eQTL analysis pipeline for generating summary stats for use with other projects.
cwd = path('/home/gaow/Documents/GTExV8/')
prefix = 'GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm'
%expand
import pandas as pd
import numpy as np
import re, os
def load_data(fdata, fsample, dtype = np.float32):
'''First col of expression data is ENCODE gene name, 2nd col is HUGO name'''
head = pd.read_csv(fdata, skiprows = 2, sep = '\t', nrows = 1)
dt = dict([('Description', str), ('Name', str)])
dt.update(dict([(x, dtype) for x in head.columns if x not in dt]))
data = pd.read_csv(fdata, compression='gzip', skiprows=2,
index_col=0, header=0, dtype = dt, sep='\t')#.drop('Description', 1)
samples = pd.read_csv(fsample, dtype=str, delimiter='\t', header=0)
sample_dict = dict()
for row in samples[['SAMPID', 'SMTSD', 'SMAFRZE']].values:
if row[2] == 'EXCLUDE':
continue
if row[1] not in sample_dict:
sample_dict[row[1]] = []
if row[0] in data.columns:
sample_dict[row[1]].append(row[0])
return data, dict((re.sub("[\W\d]+", "_", k.strip()).strip('_'), v) for k, v in sample_dict.items() if len(v))
def calc_expr_sumstats(data, sample, gene):
mu_hat = dict([('Description', data.loc[gene, 'Description'])])
sigma_hat = dict([('Description', data.loc[gene, 'Description'])])
median = dict([('Description', data.loc[gene, 'Description'])])
n = dict([('Description', data.loc[gene, 'Description'])])
for k in sample:
mu_hat[k] = np.mean(data.loc[gene, sample[k]])
median[k] = np.median(data.loc[gene, sample[k]])
sigma_hat[k] = np.std(data.loc[gene, sample[k]])
n[k] = len(data.loc[gene, sample[k]])
return mu_hat, median, sigma_hat, n
data, sample = load_data('{cwd:a}/rna_seq/{prefix}.gct.gz',
'{cwd:a}/sample_annotations/GTEx_Analysis_2017-06-05_v8_Annotations_SampleAttributesDS.txt')
# for all genes
mu_hat = pd.DataFrame()
median = pd.DataFrame()
sigma_hat = pd.DataFrame()
sample_size = pd.DataFrame()
for count, gene in enumerate(data.index):
sumstats = calc_expr_sumstats(data, sample, gene)
tmp = pd.DataFrame(dict((k,[v]) for k, v in sumstats[0].items()), columns=sumstats[0])
mu_hat = mu_hat.append(tmp.set_index([[gene]]))
tmp = pd.DataFrame(dict((k,[v]) for k, v in sumstats[1].items()), columns=sumstats[1])
median = median.append(tmp.set_index([[gene]]))
tmp = pd.DataFrame(dict((k,[v]) for k, v in sumstats[2].items()), columns=sumstats[2])
sigma_hat = sigma_hat.append(tmp.set_index([[gene]]))
tmp = pd.DataFrame(dict((k,[v]) for k, v in sumstats[3].items()), columns=sumstats[3])
sample_size = sample_size.append(tmp.set_index([[gene]]))
if count % 1000 == 0:
print(count)
mu_hat.to_csv('{cwd:a}/rna_seq/{prefix}.mean.csv.gz', na_rep = 'NA', compression = 'gzip')
median.to_csv('{cwd:a}/rna_seq/{prefix}.median.csv.gz', na_rep = 'NA', compression = 'gzip')
sigma_hat.to_csv('{cwd:a}/rna_seq/{prefix}.se.csv.gz', na_rep = 'NA', compression = 'gzip')
sample_size.to_csv('{cwd:a}/rna_seq/{prefix}.sample_size.csv.gz', na_rep = 'NA', compression = 'gzip')