cis-eQTL analysis using mr-ash
.
This notebook analyzes a toy example previously created. I'm creating this workflow to the best of my knowledge to make it general enough for use with later analysis. Ideally by running:
sos run 20170814_MR_ASH_GTEx.ipynb mr_ash \
--tissues Lung \
--geno /path/to/geno.h5 \
--expr /path/to/expr.h5 \
--covar /path/to/covar.h5
(@wei it is generally difficult to write reusuable bioinformatics workflows unless with lots of engineering that I cannot afford, but we'll see how far we can go from here!)
Code chunk below implements the workflow.
[global]
depends: Py_Module('h5py')
import h5py
parameter: cwd = '~/Documents/GTEx'
parameter: geno = "${cwd!a}/ToyExample/3mashgenes.genotype.h5"
parameter: expr = "${cwd!a}/ToyExample/3mashgenes.expr.h5"
parameter: covar = "${cwd!a}/h5_formatted/GTEx7.covariates.orth.h5"
parameter: tissues = list(h5py.File(expr, libver='latest').keys())
f = h5py.File(geno, libver='latest')
parameter: genes = []
for k in f:
genes.extend(['{}/{}'.format(k, x) for x in f[k]])
f.close()
parameter: seed = 999
%sosrun mr_ash --tissues Lung
[mr_ash_1]
depends: R_library('rhdf5'), R_library('tools')
input: for_each = ['genes', 'tissues']
output: "${cwd!a}/mr.ash/${geno!bnn}/${_tissues}_${_genes!b}.rds"
task: workdir = cwd
R:
load_data = function(genotype_file, expr_file, cov_file, geno_table, expr_table, cov_table) {
library(rhdf5)
geno = h5read(genotype_file, geno_table)
gdata = data.frame(geno$block0_values)
colnames(gdata) = geno$axis1
rownames(gdata) = geno$axis0
expr = h5read(expr_file, expr_table)
edata = data.frame(expr$block0_values)
# colnames(edata) = expr$axis1
colnames(edata) = tools::file_path_sans_ext(expr$axis1)
# rownames(edata) = expr$axis0
rownames(edata) = apply(sapply(strsplit(expr$axis0,"-"), `[`, c(1,2)), 2, function(x) paste(x, collapse = '-'))
covariate <- h5read(cov_file, cov_table)
cdata = data.frame(covariate$block0_values)
colnames(cdata) = apply(sapply(strsplit(covariate$axis1,"-"), `[`, c(1,2)), 2, function(x) paste(x, collapse = '-'))
rownames(cdata) = covariate$axis0
# extract gene of interest and ensure samples from expression table and genotype table match
edata = data.frame(edata[, basename(geno_table)])
edata$ID = rownames(edata)
gdata$ID = rownames(gdata)
output = merge(x = edata, y = gdata, by = "ID", all.x = TRUE)
return(list(X=as.matrix(output[,-c(1,2)]), y = as.vector(output$edata), Z = as.matrix(cdata)))
}
autoselect.mixsd = function(betahat,sebetahat,mult = sqrt(2)){
# To avoid exact measure causing (usually by mistake)
sebetahat = sebetahat[sebetahat!=0]
# so that the minimum is small compared with measurement precision
sigmaamin = min(sebetahat)/10
if (all(betahat^2 <= sebetahat^2)) {
# to deal with the occassional odd case where this could happen; 8 is arbitrary
sigmaamax = 8*sigmaamin
} else {
# this computes a rough largest value you'd want to use,
# based on idea that sigmaamax^2 + sebetahat^2 should be at least betahat^2
sigmaamax = 2*sqrt(max(betahat^2-sebetahat^2))
}
if(mult==0){
return(c(0,sigmaamax/2))
} else {
npoint = ceiling(log2(sigmaamax/sigmaamin)/log2(mult))
return(mult^((-npoint):0) * sigmaamax)
}
}
initial_step = function(X,y,Z = NULL){
P = dim(X)[2]
output = matrix(0,nrow = P,ncol = 2)
for(i in 1:P){
if(is.null(Z)){
g = summary(lm(y~X[,i]))
} else{
g = summary(lm(y~X[,i]+Z))
}
output[i,] = g$coefficients[2,1:2]
}
return(list(betahat = output[,1],sebetahat = output[,2]))
}
analyze = function(genename = '/chr4/ENSG00000145214', tissue = '/Lung', out = 'test.rds'){
dat = load_data(genotype_file = ${geno!ar},
expr_file = ${expr!ar},
cov_file = ${covar!ar},
geno_table = genename,
expr_table = tissue,
cov_table = tissue)
X = as.matrix(dat$X)
X = X[,which(colSums(X)!=0)]
if ((nrow(X) == 0) || (ncol(X) == 0)) {
saveRDS(list(), out)
} else {
storage.mode(X) <- "double"
y = as.vector(dat$y)
Z = as.matrix(dat$Z)
initial = initial_step(X,y,Z)
mixsd = autoselect.mixsd(initial$betahat,initial$sebetahat)
logdata = capture.output({res = varbvs::varbvsmix(X, Z, y, sa = c(0,mixsd^2)) })
betahat = rowSums(res$alpha * res$mu)
names(betahat) = colnames(X)
mrash_out = list(betahat = betahat, lfsr = res$lfsr)
ash_out = ashr::ash(initial$betahat,initial$sebetahat,mixcompdist = "normal")
saveRDS(list(ash = ash_out, uni = initial, mr_ash = mrash_out, logdata = logdata), out)
}
}
set.seed(${seed})
analyze(genename = "/${_genes}", tissue = "/${_tissues}", out = ${_output!r})
%sessioninfo