This analysis uses some early prototypes of M&M. The current implementation of M&M in `libgaow` is coded in Python and, inspired by this version of the algorithm in the univariate case, encapsulates much of the computation in MASH. It does not yet implement the ELBO calculation. For convenience of prototyping I use the Python implementation of the MASH core computations (also in `libgaow`), and I run the analysis interactively rather than via pipelines.
We start with $N \approx 800$ samples and $R = 2$ conditions (thyroid and lung), with about $J = 7500$ candidate effects for the given gene, FMO2.
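To make these dimensions concrete, here is a minimal toy sketch (plain NumPy with simulated data only; the real `X` and `Y` are loaded from the GTEx files further below) of the multivariate regression setup assumed here: genotypes `X` are $N \times J$, expression `Y` is $N \times R$, and the effects to be estimated form a $J \times R$ matrix.
import numpy as np
# Toy dimensions mirroring the real data: N samples, J candidate SNPs, R conditions
N, J, R = 800, 7500, 2
rng = np.random.default_rng(0)
X_toy = rng.integers(0, 3, size=(N, J)).astype(float)  # genotype matrix, N x J
B_toy = np.zeros((J, R))                                # effect matrix, J x R, mostly zero
B_toy[1000, :] = [0.5, 0.4]                             # a single eQTL shared across both tissues
Y_toy = X_toy @ B_toy + rng.normal(size=(N, R))         # expression matrix, N x R
print(X_toy.shape, Y_toy.shape, B_toy.shape)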
MASH on $\hat{\beta}_{P \times R}$ and $S_{P \times R}$, where $P$ is the number of genes in the GTEx data for these tissues: this is the standard MASH procedure, where we learn the prior covariance matrices from the top eQTLs in the data and their weights from the null eQTLs.
input_data = "~/Documents/GTExV8/MASH/GTExV8.ciseQTL.4MASH.rds"
setwd('~/Documents/GTExV8/MNM')
z = readRDS(input_data)$max$z
z = z[,c('Thyroid', 'Lung')]
write.table(z, "/tmp/mnm.max.txt", col.names=F,row.names=F)
# Run SFA (sparse factor analysis) with K = 2 factors on the top-eQTL z-scores
cmd = paste0('~/Documents/GTExV8/utils/sfa/bin/sfa_linux -gen /tmp/mnm.max.txt -g ', dim(z)[1], ' -n ', dim(z)[2],
' -k 2 -iter 50 -rand 999 -o /tmp/mnm.max')
system(cmd)
saveRDS(list(F = read.table("/tmp/mnm.max_F.out"),
lambda = read.table("/tmp/mnm.max_lambda.out"),
sigma2 = read.table("/tmp/mnm.max_sigma2.out"),
alpha = read.table("/tmp/mnm.max_alpha.out")), 'TL_MASH.sfa.rds')
sfa_data = readRDS("TL_MASH.sfa.rds")
mash_data = mashr::set_mash_data(z, matrix(1, nrow(z), ncol(z)))
sfa_res = as.matrix(sfa_data$lambda) %*% as.matrix(sfa_data$F)
# SFA matrices
U.sfa = c(mashr::cov_from_factors(as.matrix(sfa_data$F), "sfa2"), list("tSFA" = t(sfa_res) %*% sfa_res / nrow(z)))
# SVD matrices
U.pca = mashr::cov_pca(mash_data, 2)
# Empirical data matrices
# `cov_ed` will take significantly longer when this empirical covariance matrix is added
D.center = apply(as.matrix(z), 2, function(x) x - mean(x))
# Denoised data-driven matrices
U.dd = c(U.sfa, U.pca, list("XX" = t(D.center) %*% D.center / nrow(z)))
U.ed = mashr::cov_ed(mash_data, U.dd)
# Canonical matrices
U.can = mashr::cov_canonical(mash_data)
saveRDS(list(Ulist = c(U.ed, U.can), DD_raw = U.dd), 'TL_MASH.priors.rds')
dat = readRDS(input_data)
z = as.matrix(dat$null$z)[,c('Thyroid', 'Lung')]
# Estimate the residual correlation V from effects whose |z| < 2 in both tissues
V = cor(z[which(apply(abs(z),1, max) < 2),])
mash_data = mashr::set_mash_data(as.matrix(z), matrix(1, nrow(z), ncol(z)), as.matrix(V))
saveRDS(mashr::mash(mash_data, Ulist = readRDS('TL_MASH.priors.rds')$Ulist, usepointmass = FALSE, outputlevel = 1), 'TL_MASH.fit.rds')
dat = readRDS('TL_MASH.fit.rds')
str(dat)
Ulist = mashr::expand_cov(dat$fitted_g$Ulist, dat$fitted_g$grid, usepointmass = dat$fitted_g$usepointmass)
U_names = names(Ulist)
pi_s = dat$fitted_g$pi
which.comp = (pi_s > 1E-10)
U_names = U_names[which.comp]
pi_s = pi_s[which.comp]
print(length(Ulist))
print(length(pi_s))
%get Ulist U_names pi_s --from R
from collections import OrderedDict
# Keys of Ulist had '.' converted to '_' in the transfer from R; index with the converted names
U = OrderedDict([(k, Ulist[k.replace('.', '_')]) for k in U_names])
pi_s
U
dat = readRDS('/home/gaow/Documents/GTExV8/Thyroid.Lung.FMO2.filled.rds')
attach(dat)
%get X Y --from R
Y = Y.as_matrix()
These weights are learned from the 2 tissues in the MASH EB framework, without the `null` component, because otherwise the `null` component gets almost 100% weight in MASH! I need to figure out what's going on there. Here, without the `null` component, as shown in the 2 cells above, the thyroid-specific prior has the highest weight (over 96%).
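To see at a glance which components carry the weight, one can pair the retained component names with their weights, for example (a small sketch; it reuses `U_names` and `pi_s` from the cells above, which were already restricted to components with weight above 1e-10):
# Sort the retained MASH components by their learned weight
for name, w in sorted(zip(U_names, pi_s), key=lambda kv: kv[1], reverse=True):
    print('%s\t%.4f' % (name, w))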
from libgaow.regression_data import MNMASH
# Fit M&M using the MASH-learned prior covariances U and weights pi_s
model = MNMASH(X=X,Y=Y)
model.set_prior(U, pi = pi_s)
model.fit(niter = 100)
import numpy as np
np.save('mnm_post_mean.npy', model.post_mean_mat)
import seaborn as sns
import matplotlib.pyplot as plt
# M&M posterior mean effects, condition 1 (Thyroid)
plt.scatter([x+1 for x in range(len(model.post_mean_mat[:,0]))], model.post_mean_mat[:,0], cmap="viridis")
ax = plt.gca()
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
# M&M posterior mean effects, condition 2 (Lung)
plt.scatter([x+1 for x in range(len(model.post_mean_mat[:,1]))], model.post_mean_mat[:,1], cmap="viridis")
ax = plt.gca()
plt.show()
model.post_mean_mat
Now assign the `null` component a weight of 0.5, then modify `U` and `pi_s` accordingly and re-fit.
U['null'] = np.zeros((2,2))
U
pi_s = [0.15,0.25,0.1,0.5]
# Re-fit with the modified priors: null component included, weights assigned manually
model2 = MNMASH(X=X,Y=Y)
model2.set_prior(U, pi = pi_s)
model2.fit(niter = 100)
np.save('mnm_assigned_post_mean.npy', model2.post_mean_mat)
# Re-fitted model: posterior mean effects, condition 1 (Thyroid)
plt.scatter([x+1 for x in range(len(model2.post_mean_mat[:,0]))], model2.post_mean_mat[:,0], cmap="viridis")
ax = plt.gca()
plt.show()
# Re-fitted model: posterior mean effects, condition 2 (Lung)
plt.scatter([x+1 for x in range(len(model2.post_mean_mat[:,1]))], model2.post_mean_mat[:,1], cmap="viridis")
ax = plt.gca()
plt.show()
model2.mash.U
model2.mash.pi
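As a quick check of how much the manually assigned weights change the fit, the two saved posterior mean matrices can be reloaded and compared (a minimal sketch using the file names saved above):
import numpy as np
pm_learned = np.load('mnm_post_mean.npy')            # fit with MASH-learned weights (no null)
pm_assigned = np.load('mnm_assigned_post_mean.npy')  # fit with manually assigned weights incl. null
print(np.abs(pm_learned - pm_assigned).max())        # largest absolute difference in posterior means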
varbvs
For comparison, fit each tissue separately with varbvs (univariate Bayesian variable selection regression).
n <- nrow(X)
p <- ncol(X)
set.seed(1)
# Fit varbvs to each tissue separately; posterior mean effects are alpha (inclusion probability)
# times mu (conditional posterior mean), summed over the hyperparameter grid
varbvs_fit1 <- varbvs::varbvs(X, NULL, Y[,1], verbose = FALSE)
varbvs_bhat1 = rowSums(varbvs_fit1$alpha*varbvs_fit1$mu)
varbvs_fit2 <- varbvs::varbvs(X, NULL, Y[,2], verbose = FALSE)
varbvs_bhat2 = rowSums(varbvs_fit2$alpha*varbvs_fit2$mu)
varbvs_bhat1 = as.vector(varbvs_bhat1)
varbvs_bhat2 = as.vector(varbvs_bhat2)
%get varbvs_bhat1 varbvs_bhat2 --from R
# varbvs posterior mean effects, tissue 1 (Thyroid)
plt.scatter([x+1 for x in range(len(varbvs_bhat1))], varbvs_bhat1, cmap="viridis")
ax = plt.gca()
plt.show()
# varbvs posterior mean effects, tissue 2 (Lung)
plt.scatter([x+1 for x in range(len(varbvs_bhat2))], varbvs_bhat2, cmap="viridis")
ax = plt.gca()
plt.show()