Multivariate Bayesian variable selection regression

MR ASH on single tissue data

cis-eQTL analysis using mr-ash, for GTEx V8.

In [ ]:
[global]
parameter: seed = 999
parameter: cwd = '~/Documents/GTExV8'
parameter: geno = "${cwd!a}/GTExV8.genotype.cis.h5"
parameter: expr = "${cwd!a}/GTExV8.expression.h5"
parameter: covar = "${cwd!a}/GTExV8.covariates.orth.h5"
parameter: tissue = "Thyroid"
parameter: gene_list = ''
In [ ]:
[default_0: shared = {'genes': 'genes'}]
depends: Py_Module('h5py')
import h5py
all_genes = []
with h5py.File(geno, libver='latest') as f:
    for k in f:
        all_genes.extend(['{}/{}'.format(k, x) for x in f[k]])
if os.path.isfile(gene_list):
    genes = [x for x in get_output("cat {}".format(gene_list)).strip().split() if x in all_genes]
else:
    genes = all_genes
expr_genes = [os.path.splitext(x.decode())[0] for x in h5py.File(expr, libver = 'latest')[tissue]['axis1']]
genes = [y for y in genes if os.path.basename(y) in expr_genes]
fail_if(len(genes) == 0, "No genes to analyze!")
print('{} genes to analyze for {}'.format(len(genes), tissue))
In [ ]:
[default_1]
# utils
output: "${cwd!a}/.utils.R"
report: output = "${output}"
load_data = function(genotype_file, expr_file, cov_file, geno_table, expr_table, cov_table) {
    library(rhdf5)
    geno = h5read(genotype_file, geno_table)
    gdata = data.frame(geno$block0_values)
    colnames(gdata) = geno$axis1
    rownames(gdata) = geno$axis0
    #
    expr = h5read(expr_file, expr_table)
    edata = data.frame(expr$block0_values)
    colnames(edata) = tools::file_path_sans_ext(expr$axis1)
    edata = data.frame(edata[, basename(geno_table)])
    rownames(edata) = expr$axis0
    colnames(edata) = basename(geno_table)
    #
    covariate <- h5read(cov_file, cov_table)
    cdata = data.frame(covariate$block0_values)
    colnames(cdata) = covariate$axis1
    rownames(cdata) = covariate$axis0
    cdata = t(cdata)
    # reorder
    edata = edata[match(rownames(cdata), rownames(edata)),]
    gdata = gdata[rownames(cdata),]
    return(list(X = as.matrix(gdata), y = as.vector(edata), Z = as.matrix(cdata)))
}

autoselect.mixsd = function(betahat,sebetahat,mult = sqrt(2)){
    # To avoid exact measure causing (usually by mistake)
    sebetahat = sebetahat[sebetahat!=0] 
    # so that the minimum is small compared with measurement precision
    sigmaamin = min(sebetahat)/10 
    if (all(betahat^2 <= sebetahat^2)) {
        # to deal with the occassional odd case where this could happen; 8 is arbitrary
        sigmaamax = 8*sigmaamin 
    } else {
        # this computes a rough largest value you'd want to use, 
        # based on idea that sigmaamax^2 + sebetahat^2 should be at least betahat^2   
        sigmaamax = 2*sqrt(max(betahat^2-sebetahat^2)) 
    }
    if(mult==0){
        return(c(0,sigmaamax/2))
    } else {
        npoint = ceiling(log2(sigmaamax/sigmaamin)/log2(mult))
        return(mult^((-npoint):0) * sigmaamax)
    }
}

univariate_regression = function(X, y, Z = NULL){
    if (!is.null(Z)) {
        y = .lm.fit(Z, y)$residuals
    }
    calc_stderr = function(X, residuals) { sqrt(diag(sum(residuals^2) / (nrow(X) - 2) * chol2inv(chol(t(X) %*% X)))) }
    output = do.call(rbind, 
                  lapply(c(1:ncol(X)), function(i) { 
                      g = .lm.fit(cbind(1, X[,i]), y)
                      return(c(coef(g)[2], calc_stderr(cbind(1, X[,i]), g$residuals)[2]))
                  })
                 )
    return(list(betahat = output[,1], sebetahat = output[,2], 
                residuals = y))
}

lasso_reorder = function(X, y) {
    # perform lasso regression and reorder regressors by "importance"
    fit.glmnet <- glmnet::glmnet(X, y, intercept = F)
    beta_path = coef(fit.glmnet)[-1,]
    K = dim(beta_path)[2]
    path_order = c()
    for (k in 1:K) {
        crt_path = which(beta_path[,k] != 0)
        if (length(crt_path) != 0 & length(path_order) == 0) {
            path_order = c(path_order, crt_path)
        } else if(length(crt_path) != 0) {
            path_order = c(path_order, crt_path[-which(crt_path %in% path_order)] )
        }
    }
    path_order = unname(path_order)
    index_order = c(path_order, seq(1,dim(beta_path)[1])[-path_order])
    return(index_order)
}


analyze = function(genename = '/chr4/ENSG00000145214', tissue = '/Thyroid', out = 'test.rds'){
    dat = load_data(genotype_file = ${geno!ar},
                  expr_file = ${expr!ar},
                  cov_file = ${covar!ar},
                  geno_table = genename,
                  expr_table = tissue,
                  cov_table = tissue)
    X = as.matrix(dat$X)
    X = X[,which(colSums(X)!=0)]
    if ((nrow(X) == 0) || (ncol(X) == 0)) {
    saveRDS(list(), out)
    } else {
    storage.mode(X) <- "double"
    y = as.vector(dat$y)
    Z = as.matrix(dat$Z)
    # univariate results
    initial = univariate_regression(X, y, Z)
    mixsd = autoselect.mixsd(initial$betahat, initial$sebetahat)
    mu_zero = matrix(0, ncol = length(mixsd)+1, nrow = ncol(X))
    alpha_zero = matrix(1/ncol(X), ncol = length(mixsd)+1,nrow = ncol(X))
    alpha_zero[,1] = 1 - length(mixsd) / ncol(X)
    index_order = lasso_reorder(X, initial$residuals)
    logdata = capture.output({ fit = varbvs::varbvsmix(X[, index_order], Z, y, 
                                                      sa = c(0,mixsd^2), 
                                                      mu = mu_zero,
                                                      alpha = alpha_zero,
                                                      verbose = F) })
    betahat = rowSums(fit$alpha * fit$mu)
    names(betahat) = colnames(X)
    mr_ash_out = list(betahat = betahat, fit = fit)
    # ash_out = ashr::ash(initial$betahat,initial$sebetahat,mixcompdist = "normal")
    # saveRDS(list(ash = ash_out, uni = initial, mr_ash = mr_ash_out, logdata = logdata), out)
    saveRDS(list(univariate = initial, mr_ash = mr_ash_out, logdata = logdata), out)
    }
}
In [2]:
[default_2]
depends: R_library('rhdf5'), R_library('tools'), R_library('glmnet')
input: None, for_each = ['genes']
output: "${cwd!a}/MR_ASH/${tissue}/${tissue}_${_genes!b}.rds"
task: workdir = cwd, walltime = '5m', cores = 1, mem = "2G", 
        trunk_size = 400, trunk_workers = 1
R:
    source('${cwd!a}/.utils.R')                                    
    set.seed(${seed})
    analyze(genename = "/${_genes}", tissue = "/${tissue}", out = ${_output!r})

Run workflow on remote computer

For given tissue:

sos run analysis/20171002_MR_ASH_V8.ipynb -r midway2 -W \
    --tissue Thyroid \
    --gene_list optional_gene_list.txt

Note that the remote computer (login node) as already been configured with

sos config --global --set localhost midway2
In [1]:
%sessioninfo

SoS

SoS Version
0.9.9.1

Copyright © 2016-2020 Gao Wang et al at Stephens Lab, University of Chicago