Multivariate Bayesian variable selection regression

Extract per gene dataset

This generates per gene data-set and surveys how many SNPs in each generated dataset.

In [1]:
%revisions -s -n 10
Revision Author Date Message
d4e8a8c Gao Wang 2018-05-24 Update documentation
In [ ]:
[global]
parameter: cwd = path('~/Documents/GTExV8')
parameter: genotype_file = f"{cwd:a}/GTExV7.genotype.cis.h5"
parameter: expr_file = f"{cwd:a}/GTExV7.expression.h5"
parameter: cov_file = f"{cwd:a}/GTExV7.covariates.h5"
parameter: outdir = f'{cwd:a}/Toys'
# we can extract single or multiple tissue data
parameter: tissues = []
# list of genes to extract data for
parameter: genes = []
# if list of genes is not provided we'll get from a fixed file `n` genes
parameter: n = 200

tissues_default = '''
Adipose_Subcutaneous
Adipose_Visceral_Omentum
Adrenal_Gland
Artery_Aorta
Artery_Coronary
Artery_Tibial
Brain_Amygdala
Brain_Anterior_cingulate_cortex_BA24
Brain_Caudate_basal_ganglia
Brain_Cerebellar_Hemisphere
Brain_Cerebellum
Brain_Cortex
Brain_Frontal_Cortex_BA9
Brain_Hippocampus
Brain_Hypothalamus
Brain_Nucleus_accumbens_basal_ganglia
Brain_Putamen_basal_ganglia
Brain_Spinal_cord_cervical_c-1
Brain_Substantia_nigra
Breast_Mammary_Tissue
Cells_Cultured_fibroblasts
Cells_EBV-transformed_lymphocytes
Colon_Sigmoid
Colon_Transverse
Esophagus_Gastroesophageal_Junction
Esophagus_Mucosa
Esophagus_Muscularis
Heart_Atrial_Appendage
Heart_Left_Ventricle
Kidney_Cortex
Liver
Lung
Minor_Salivary_Gland
Muscle_Skeletal
Nerve_Tibial
Ovary
Pancreas
Pituitary
Prostate
Skin_Not_Sun_Exposed_Suprapubic
Skin_Sun_Exposed_Lower_leg
Small_Intestine_Terminal_Ileum
Spleen
Stomach
Testis
Thyroid
Uterus
Vagina
Whole_Blood
'''

if len(tissues) == 0:
    tissues = tissues_default.strip().split("\n")
In [ ]:
[extract]
if len(genes) == 0:
    genes = get_output(f'shuf {cwd:a}/all_genes.txt | sort -u | head -{n} ').strip().split('\n')
tissue_tables = 'c(' + ','.join([f'"/{item}"' for item in tissues]) + ')'
depends: R_library('rhdf5'), R_library('plyr')
input: for_each = 'genes', concurrent = True
output: f"{outdir}/{tissues[0] if len(tissues) == 1 else 'Multi_Tissues'}.{path(_genes):b}.RDS"
R: expand = '${ }'
    library(rhdf5)
    load_genotype = function(genotype_file, geno_table) {
            geno = h5read(genotype_file, geno_table)
            gdata = data.frame(geno$block0_values)
            colnames(gdata) = geno$axis1
            rownames(gdata) = geno$axis0
            return(gdata)
    }
    load_expression = function(expr_file, expr_table, gene) {
            expr = h5read(expr_file, expr_table)
            edata = data.frame(expr$block0_values)
            colnames(edata) = tools::file_path_sans_ext(expr$axis1)
            edata = data.frame(edata[, gene])
            rownames(edata) = expr$axis0
            colnames(edata) = basename(expr_table)
            return(edata)
    }
    load_covariates = function(cov_file, cov_table) {
            covariate = h5read(cov_file, cov_table)
            cdata = data.frame(covariate$block0_values)
            colnames(cdata) = covariate$axis1
            rownames(cdata) = covariate$axis0
            cdata = t(cdata)
            return(cdata)
    }
    load_data_single = function(genotype_file, expr_file, cov_file, geno_table, expr_table, cov_table) {
            gdata = load_genotype(genotype_file, geno_table)
            edata = load_expression(expr_file, expr_table, basename(geno_table))
            cdata = load_covariates(cov_file, cov_table)
            # reorder
            edata = edata[match(rownames(cdata), rownames(edata)),]
            gdata = gdata[rownames(cdata),]
            return(list(X = as.matrix(gdata), y = as.vector(edata), Z = as.matrix(cdata), chrom = strsplit(colnames(gdata)[1], '_')[[1]][1], 
                        pos = as.integer(do.call(rbind, strsplit(colnames(gdata), '_'))[,2])))
    }
    load_data_multi = function(genotype_file, expr_file, cov_file, geno_table, expr_tables, cov_tables) {
            gdata = load_genotype(genotype_file, geno_table)
            edata = lapply(1:length(expr_tables), function(i) load_expression(expr_file, expr_tables[[i]], basename(geno_table)))
            cdata = lapply(1:length(cov_tables), function(i) load_covariates(cov_file, cov_tables[[i]]))
            y_res = list()
            # reorder and remove covariates
            for (i in 1:length(edata)) {
                idx = match(rownames(cdata[[i]]), rownames(edata[[i]]))
                row_str = rownames(edata[[i]])[idx]
                col_str = colnames(edata[[i]])
                edata[[i]] = edata[[i]][idx,]
                Z = scale(cdata[[i]], center=TRUE, scale=FALSE)
                y_res[[i]] = .lm.fit(Z, edata[[i]])$residuals
                y_res[[i]] = data.frame(matrix(y_res[[i]], length(y_res[[i]]), 1))
                y_res[[i]] = cbind(row_str, y_res[[i]])
                colnames(y_res[[i]]) = c('sample.id', col_str)
            }
            # merge residuals into one matrix
            # need to ensure same rows 
            # are matched and unmatched entries are filled with NA
            y_res_mat = plyr::join_all(y_res, by = "sample.id", type = "full", match = "all")
            y_res_mat = y_res_mat[match(rownames(gdata), y_res_mat$sample.id),]
            rownames(y_res_mat) = y_res_mat$sample.id
            y_res_mat = y_res_mat[,-1]
            # and match its order with genotype data
            #gdata = gdata[rownames(y_res_mat),]
            return(list(X = as.matrix(gdata), y = edata, Z = cdata, y_res = y_res_mat,
                        chrom = strsplit(colnames(gdata)[1], '_')[[1]][1], 
                        pos = as.integer(do.call(rbind, strsplit(colnames(gdata), '_'))[,2])))
    }
    #
    geno_table = "${_genes}"
    expr_table = cov_table = ${tissue_tables}
    if (length(expr_table) == 1) {
        dat = load_data_single("${genotype_file}", "${expr_file}", "${cov_file}", geno_table, expr_table, cov_table)
    } else {
        dat = load_data_multi("${genotype_file}", "${expr_file}", "${cov_file}", geno_table, expr_table, cov_table)        
    }
    saveRDS(dat, ${_output:r})

For (one) example,

In [ ]:
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr4/ENSG00000145214

To get an idea how many variants are there:

In [ ]:
[count_var]
input: glob.glob('~/Documents/GTExV8/Toys/*.RDS'), group_by = 1
output: '~/Documents/GTExV8/Toys/counts.txt'
R: expand = True, stdout = _output
  cat(paste0({_input:br}, ncol(readRDS({_input:r})$X),'\n'))
In [1]:
%cd ~/Documents/GTExV8/Toys
/home/gaow/Documents/GTExV8/Toys
In [2]:
dat = scan('counts.txt')
summary(dat)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   3022    6450    7226    7217    8033   11999 

For all the 27 genes in question in MASH paper,

In [ ]:
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr20/ENSG00000025772
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr17/ENSG00000056661
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr2/ENSG00000064012
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr16/ENSG00000089486
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr8/ENSG00000104472
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr17/ENSG00000108384
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr5/ENSG00000112977
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr17/ENSG00000120088
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr1/ENSG00000135744
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr3/ENSG00000136059
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr15/ENSG00000140265
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr4/ENSG00000145214
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr11/ENSG00000149054
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr1/ENSG00000160766
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr4/ENSG00000164124
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr12/ENSG00000177084
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr19/ENSG00000181240
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr17/ENSG00000187824
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr7/ENSG00000188732
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr1/ENSG00000189171
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr7/ENSG00000189316
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr15/ENSG00000198794
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr2/ENSG00000225439
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr3/ENSG00000249846
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr18/ENSG00000264247
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr19/ENSG00000267508
sos run analysis/20180515_Extract_Benchmark_Data.ipynb extract --genes /chr1/ENSG00000272030

Copyright © 2016-2020 Gao Wang et al at Stephens Lab, University of Chicago