cis-eQTL analysis using mr-ash
, for GTEx V8.
[global]
parameter: seed = 999
parameter: cwd = '~/Documents/GTExV8'
parameter: geno = "${cwd!a}/GTExV8.genotype.cis.h5"
parameter: expr = "${cwd!a}/GTExV8.expression.h5"
parameter: covar = "${cwd!a}/GTExV8.covariates.orth.h5"
parameter: tissue = "Thyroid"
parameter: gene_list = ''
[default_0: shared = {'genes': 'genes'}]
depends: Py_Module('h5py')
import h5py
all_genes = []
with h5py.File(geno, libver='latest') as f:
for k in f:
all_genes.extend(['{}/{}'.format(k, x) for x in f[k]])
if os.path.isfile(gene_list):
genes = [x for x in get_output("cat {}".format(gene_list)).strip().split() if x in all_genes]
else:
genes = all_genes
expr_genes = [os.path.splitext(x.decode())[0] for x in h5py.File(expr, libver = 'latest')[tissue]['axis1']]
genes = [y for y in genes if os.path.basename(y) in expr_genes]
fail_if(len(genes) == 0, "No genes to analyze!")
print('{} genes to analyze for {}'.format(len(genes), tissue))
[default_1]
# utils
output: "${cwd!a}/.utils.R"
report: output = "${output}"
load_data = function(genotype_file, expr_file, cov_file, geno_table, expr_table, cov_table) {
library(rhdf5)
geno = h5read(genotype_file, geno_table)
gdata = data.frame(geno$block0_values)
colnames(gdata) = geno$axis1
rownames(gdata) = geno$axis0
#
expr = h5read(expr_file, expr_table)
edata = data.frame(expr$block0_values)
colnames(edata) = tools::file_path_sans_ext(expr$axis1)
edata = data.frame(edata[, basename(geno_table)])
rownames(edata) = expr$axis0
colnames(edata) = basename(geno_table)
#
covariate <- h5read(cov_file, cov_table)
cdata = data.frame(covariate$block0_values)
colnames(cdata) = covariate$axis1
rownames(cdata) = covariate$axis0
cdata = t(cdata)
# reorder
edata = edata[match(rownames(cdata), rownames(edata)),]
gdata = gdata[rownames(cdata),]
return(list(X = as.matrix(gdata), y = as.vector(edata), Z = as.matrix(cdata)))
}
autoselect.mixsd = function(betahat,sebetahat,mult = sqrt(2)){
# To avoid exact measure causing (usually by mistake)
sebetahat = sebetahat[sebetahat!=0]
# so that the minimum is small compared with measurement precision
sigmaamin = min(sebetahat)/10
if (all(betahat^2 <= sebetahat^2)) {
# to deal with the occassional odd case where this could happen; 8 is arbitrary
sigmaamax = 8*sigmaamin
} else {
# this computes a rough largest value you'd want to use,
# based on idea that sigmaamax^2 + sebetahat^2 should be at least betahat^2
sigmaamax = 2*sqrt(max(betahat^2-sebetahat^2))
}
if(mult==0){
return(c(0,sigmaamax/2))
} else {
npoint = ceiling(log2(sigmaamax/sigmaamin)/log2(mult))
return(mult^((-npoint):0) * sigmaamax)
}
}
univariate_regression = function(X, y, Z = NULL){
if (!is.null(Z)) {
y = .lm.fit(Z, y)$residuals
}
calc_stderr = function(X, residuals) { sqrt(diag(sum(residuals^2) / (nrow(X) - 2) * chol2inv(chol(t(X) %*% X)))) }
output = do.call(rbind,
lapply(c(1:ncol(X)), function(i) {
g = .lm.fit(cbind(1, X[,i]), y)
return(c(coef(g)[2], calc_stderr(cbind(1, X[,i]), g$residuals)[2]))
})
)
return(list(betahat = output[,1], sebetahat = output[,2],
residuals = y))
}
lasso_reorder = function(X, y) {
# perform lasso regression and reorder regressors by "importance"
fit.glmnet <- glmnet::glmnet(X, y, intercept = F)
beta_path = coef(fit.glmnet)[-1,]
K = dim(beta_path)[2]
path_order = c()
for (k in 1:K) {
crt_path = which(beta_path[,k] != 0)
if (length(crt_path) != 0 & length(path_order) == 0) {
path_order = c(path_order, crt_path)
} else if(length(crt_path) != 0) {
path_order = c(path_order, crt_path[-which(crt_path %in% path_order)] )
}
}
path_order = unname(path_order)
index_order = c(path_order, seq(1,dim(beta_path)[1])[-path_order])
return(index_order)
}
analyze = function(genename = '/chr4/ENSG00000145214', tissue = '/Thyroid', out = 'test.rds'){
dat = load_data(genotype_file = ${geno!ar},
expr_file = ${expr!ar},
cov_file = ${covar!ar},
geno_table = genename,
expr_table = tissue,
cov_table = tissue)
X = as.matrix(dat$X)
X = X[,which(colSums(X)!=0)]
if ((nrow(X) == 0) || (ncol(X) == 0)) {
saveRDS(list(), out)
} else {
storage.mode(X) <- "double"
y = as.vector(dat$y)
Z = as.matrix(dat$Z)
# univariate results
initial = univariate_regression(X, y, Z)
mixsd = autoselect.mixsd(initial$betahat, initial$sebetahat)
mu_zero = matrix(0, ncol = length(mixsd)+1, nrow = ncol(X))
alpha_zero = matrix(1/ncol(X), ncol = length(mixsd)+1,nrow = ncol(X))
alpha_zero[,1] = 1 - length(mixsd) / ncol(X)
index_order = lasso_reorder(X, initial$residuals)
logdata = capture.output({ fit = varbvs::varbvsmix(X[, index_order], Z, y,
sa = c(0,mixsd^2),
mu = mu_zero,
alpha = alpha_zero,
verbose = F) })
betahat = rowSums(fit$alpha * fit$mu)
names(betahat) = colnames(X)
mr_ash_out = list(betahat = betahat, fit = fit)
# ash_out = ashr::ash(initial$betahat,initial$sebetahat,mixcompdist = "normal")
# saveRDS(list(ash = ash_out, uni = initial, mr_ash = mr_ash_out, logdata = logdata), out)
saveRDS(list(univariate = initial, mr_ash = mr_ash_out, logdata = logdata), out)
}
}
[default_2]
depends: R_library('rhdf5'), R_library('tools'), R_library('glmnet')
input: None, for_each = ['genes']
output: "${cwd!a}/MR_ASH/${tissue}/${tissue}_${_genes!b}.rds"
task: workdir = cwd, walltime = '5m', cores = 1, mem = "2G",
trunk_size = 400, trunk_workers = 1
R:
source('${cwd!a}/.utils.R')
set.seed(${seed})
analyze(genename = "/${_genes}", tissue = "/${tissue}", out = ${_output!r})
For given tissue:
sos run analysis/20171002_MR_ASH_V8.ipynb -r midway2 -W \
--tissue Thyroid \
--gene_list optional_gene_list.txt
Note that the remote computer (login node) as already been configured with
sos config --global --set localhost midway2
%sessioninfo