This generates per gene data-set and surveys how many SNPs in each generated dataset.
%revisions -s -n 10
[global]
parameter: cwd = path('~/Documents/GTExV8')
parameter: genotype_file = f"{cwd:a}/GTExV7.genotype.cis.h5"
parameter: expr_file = f"{cwd:a}/GTExV7.expression.h5"
parameter: cov_file = f"{cwd:a}/GTExV7.covariates.h5"
parameter: outdir = f'{cwd:a}/Toys'
# we can extract single or multiple tissue data
parameter: tissues = []
# list of genes to extract data for
parameter: genes = []
# if list of genes is not provided we'll get from a fixed file `n` genes
parameter: n = 200
tissues_default = '''
Adipose_Subcutaneous
Adipose_Visceral_Omentum
Adrenal_Gland
Artery_Aorta
Artery_Coronary
Artery_Tibial
Brain_Amygdala
Brain_Anterior_cingulate_cortex_BA24
Brain_Caudate_basal_ganglia
Brain_Cerebellar_Hemisphere
Brain_Cerebellum
Brain_Cortex
Brain_Frontal_Cortex_BA9
Brain_Hippocampus
Brain_Hypothalamus
Brain_Nucleus_accumbens_basal_ganglia
Brain_Putamen_basal_ganglia
Brain_Spinal_cord_cervical_c-1
Brain_Substantia_nigra
Breast_Mammary_Tissue
Cells_Cultured_fibroblasts
Cells_EBV-transformed_lymphocytes
Colon_Sigmoid
Colon_Transverse
Esophagus_Gastroesophageal_Junction
Esophagus_Mucosa
Esophagus_Muscularis
Heart_Atrial_Appendage
Heart_Left_Ventricle
Kidney_Cortex
Liver
Lung
Minor_Salivary_Gland
Muscle_Skeletal
Nerve_Tibial
Ovary
Pancreas
Pituitary
Prostate
Skin_Not_Sun_Exposed_Suprapubic
Skin_Sun_Exposed_Lower_leg
Small_Intestine_Terminal_Ileum
Spleen
Stomach
Testis
Thyroid
Uterus
Vagina
Whole_Blood
'''
if len(tissues) == 0:
tissues = tissues_default.strip().split("\n")
[extract]
if len(genes) == 0:
genes = get_output(f'shuf {cwd:a}/all_genes.txt | sort -u | head -{n} ').strip().split('\n')
tissue_tables = 'c(' + ','.join([f'"/{item}"' for item in tissues]) + ')'
depends: R_library('rhdf5'), R_library('plyr')
input: for_each = 'genes', concurrent = True
output: f"{outdir}/{tissues[0] if len(tissues) == 1 else 'Multi_Tissues'}.{path(_genes):b}.RDS"
R: expand = '${ }'
library(rhdf5)
load_genotype = function(genotype_file, geno_table) {
geno = h5read(genotype_file, geno_table)
gdata = data.frame(geno$block0_values)
colnames(gdata) = geno$axis1
rownames(gdata) = geno$axis0
return(gdata)
}
load_expression = function(expr_file, expr_table, gene) {
expr = h5read(expr_file, expr_table)
edata = data.frame(expr$block0_values)
colnames(edata) = tools::file_path_sans_ext(expr$axis1)
edata = data.frame(edata[, gene])
rownames(edata) = expr$axis0
colnames(edata) = basename(expr_table)
return(edata)
}
load_covariates = function(cov_file, cov_table) {
covariate = h5read(cov_file, cov_table)
cdata = data.frame(covariate$block0_values)
colnames(cdata) = covariate$axis1
rownames(cdata) = covariate$axis0
cdata = t(cdata)
return(cdata)
}
load_data_single = function(genotype_file, expr_file, cov_file, geno_table, expr_table, cov_table) {
gdata = load_genotype(genotype_file, geno_table)
edata = load_expression(expr_file, expr_table, basename(geno_table))
cdata = load_covariates(cov_file, cov_table)
# reorder
edata = edata[match(rownames(cdata), rownames(edata)),]
gdata = gdata[rownames(cdata),]
return(list(X = as.matrix(gdata), y = as.vector(edata), Z = as.matrix(cdata), chrom = strsplit(colnames(gdata)[1], '_')[[1]][1],
pos = as.integer(do.call(rbind, strsplit(colnames(gdata), '_'))[,2])))
}
load_data_multi = function(genotype_file, expr_file, cov_file, geno_table, expr_tables, cov_tables) {
gdata = load_genotype(genotype_file, geno_table)
edata = lapply(1:length(expr_tables), function(i) load_expression(expr_file, expr_tables[[i]], basename(geno_table)))
cdata = lapply(1:length(cov_tables), function(i) load_covariates(cov_file, cov_tables[[i]]))
y_res = list()
# reorder and remove covariates
for (i in 1:length(edata)) {
idx = match(rownames(cdata[[i]]), rownames(edata[[i]]))
row_str = rownames(edata[[i]])[idx]
col_str = colnames(edata[[i]])
edata[[i]] = edata[[i]][idx,]
Z = scale(cdata[[i]], center=TRUE, scale=FALSE)
y_res[[i]] = .lm.fit(Z, edata[[i]])$residuals
y_res[[i]] = data.frame(matrix(y_res[[i]], length(y_res[[i]]), 1))
y_res[[i]] = cbind(row_str, y_res[[i]])
colnames(y_res[[i]]) = c('sample.id', col_str)
}
# merge residuals into one matrix
# need to ensure same rows
# are matched and unmatched entries are filled with NA
y_res_mat = plyr::join_all(y_res, by = "sample.id", type = "full", match = "all")
y_res_mat = y_res_mat[match(rownames(gdata), y_res_mat$sample.id),]
rownames(y_res_mat) = y_res_mat$sample.id
y_res_mat = y_res_mat[,-1]
# and match its order with genotype data
#gdata = gdata[rownames(y_res_mat),]
return(list(X = as.matrix(gdata), y = edata, Z = cdata, y_res = y_res_mat,
chrom = strsplit(colnames(gdata)[1], '_')[[1]][1],
pos = as.integer(do.call(rbind, strsplit(colnames(gdata), '_'))[,2])))
}
#
geno_table = "${_genes}"
expr_table = cov_table = ${tissue_tables}
if (length(expr_table) == 1) {
dat = load_data_single("${genotype_file}", "${expr_file}", "${cov_file}", geno_table, expr_table, cov_table)
} else {
dat = load_data_multi("${genotype_file}", "${expr_file}", "${cov_file}", geno_table, expr_table, cov_table)
}
saveRDS(dat, ${_output:r})
For (one) example,
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr4/ENSG00000145214
To get an idea how many variants are there:
[count_var]
input: glob.glob('~/Documents/GTExV8/Toys/*.RDS'), group_by = 1
output: '~/Documents/GTExV8/Toys/counts.txt'
R: expand = True, stdout = _output
cat(paste0({_input:br}, ncol(readRDS({_input:r})$X),'\n'))
%cd ~/Documents/GTExV8/Toys
dat = scan('counts.txt')
summary(dat)
For all the 27 genes in question in MASH paper,
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr20/ENSG00000025772
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr17/ENSG00000056661
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr2/ENSG00000064012
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr16/ENSG00000089486
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr8/ENSG00000104472
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr17/ENSG00000108384
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr5/ENSG00000112977
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr17/ENSG00000120088
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr1/ENSG00000135744
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr3/ENSG00000136059
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr15/ENSG00000140265
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr4/ENSG00000145214
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr11/ENSG00000149054
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr1/ENSG00000160766
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr4/ENSG00000164124
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr12/ENSG00000177084
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr19/ENSG00000181240
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr17/ENSG00000187824
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr7/ENSG00000188732
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr1/ENSG00000189171
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr7/ENSG00000189316
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr15/ENSG00000198794
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr2/ENSG00000225439
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr3/ENSG00000249846
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr18/ENSG00000264247
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr19/ENSG00000267508
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr1/ENSG00000272030