varbvs
and mr-ash
¶In this example we analyze a data set where there is potentially multiple SNPs associated with gene expression. We use both bvs
and mr-ash
methods, initializing both of them with results from lasso
.
autoselect.mixsd = function(betahat,sebetahat,mult = sqrt(2)){
# To avoid exact measure causing (usually by mistake)
sebetahat = sebetahat[sebetahat!=0]
# so that the minimum is small compared with measurement precision
sigmaamin = min(sebetahat)/10
if (all(betahat^2 <= sebetahat^2)) {
# to deal with the occassional odd case where this could happen; 8 is arbitrary
sigmaamax = 8*sigmaamin
} else {
# this computes a rough largest value you'd want to use,
# based on idea that sigmaamax^2 + sebetahat^2 should be at least betahat^2
sigmaamax = 2*sqrt(max(betahat^2-sebetahat^2))
}
if(mult==0){
return(c(0,sigmaamax/2))
} else {
npoint = ceiling(log2(sigmaamax/sigmaamin)/log2(mult))
return(mult^((-npoint):0) * sigmaamax)
}
}
univariate_regression = function(X,y,Z = NULL){
P = dim(X)[2]
if (!is.null(Z)) {
y = lm(y~Z)$residuals
}
output = matrix(0,nrow = P,ncol = 2)
for(i in 1:P){
g = summary(lm(y ~ X[,i]))
output[i,] = g$coefficients[2,1:2]
}
return(list(betahat = output[,1], sebetahat = output[,2],
residuals = y))
}
lasso_reorder = function(X, y) {
# perform lasso regression and reorder regressors by "importance"
fit.glmnet <- glmnet::glmnet(X, y, intercept = F)
beta_path = coef(fit.glmnet)[-1,]
K = dim(beta_path)[2]
path_order = c()
for (k in 1:K) {
crt_path = which(beta_path[,k] != 0)
if (length(crt_path) != 0 & length(path_order) == 0) {
path_order = c(path_order, crt_path)
} else if(length(crt_path) != 0) {
path_order = c(path_order, crt_path[-which(crt_path %in% path_order)] )
}
}
path_order = unname(path_order)
index_order = c(path_order, seq(1,dim(beta_path)[1])[-path_order])
return(index_order)
}
# compute fitted values from varbvsmix
fitfv = function(X,Z,fit){
bhat = rowSums(fit$alpha*fit$mu)
return(X %*% bhat + fit$mu.cov[1] + Z %*% fit$mu.cov[-1])
}
input_file = "../data/Thyroid.FMO2.1Mb.RDS"
dat = readRDS(input_file)
X = as.matrix(dat$X)
X = X[,which(colSums(X)!=0)]
storage.mode(X) <- "double"
y = as.vector(dat$y)
Z = as.matrix(dat$Z)
# univariate results
initial = univariate_regression(X, y ,Z)
mixsd = autoselect.mixsd(initial$betahat, initial$sebetahat)
mu_zero = matrix(0, ncol = length(mixsd)+1, nrow = ncol(X))
alpha_zero = matrix(1/ncol(X), ncol = length(mixsd)+1,nrow = ncol(X))
alpha_zero[,1] = 1 - length(mixsd) / ncol(X)
index_order = lasso_reorder(X, initial$residuals)
# bvsmix with lasso init vs random init
fit11 = varbvs::varbvsmix(X[, index_order],
Z, y, sa = c(0,mixsd^2),
mu = mu_zero,
alpha = alpha_zero,
verbose = F)
fit12 = varbvs::varbvsmix(X,
Z, y, sa = c(0,mixsd^2),
verbose = F)
# bvsmix with lasso init vs random init
fit21 = varbvs::varbvs(X[, index_order], Z, y, mu = mu_zero, alpha = alpha_zero, verbose = F)
fit22 = varbvs::varbvs(X, Z, y, verbose = F)
mix.top = names(sort(fit11$lfsr)[1:5])
cbind(1 - fit11$lfsr[mix.top], fit21$alpha[,1][mix.top])
bvs.top = names(sort(fit21$alpha[,1], decreasing=T)[1:5])
cbind(1 - fit11$lfsr[bvs.top], fit21$alpha[,1][bvs.top])
Check if chr1_171122735_A_G_b38
is in credible set of varbvs
result
get.credible.set <- function (bf, pr) {
pp <- bf/sum(bf)
markers <- order(bf,decreasing = TRUE)
n <- min(which(cumsum(pp[markers]) > credint.prob))
return(markers[1:n])
}
top.markers <- summary(fit21,nv = 20)$top.vars
top.markers <- subset(top.markers,prob > 0.95)$variable
m <- length(top.markers)
R <- cor(X)
credint.prob <- 0.95
ld.threshold <- 0.2
cr <- vector("list",m)
plots <- vector("list",m)
names(cr) <- top.markers
names(plots) <- top.markers
for (i in top.markers) {
idx <- which(colnames(X) == i)
vars <- which(R[,i]^2 >= ld.threshold)
out <- varbvs::varbvsproxybf(X,Z,y,fit21,idx,vars)
cr[[i]] <- vars[get.credible.set(out$BF[,1],credint.prob)]
j <- names(which.max(out$BF[,1]))
}
print(cr)
No, it is not ...
Next I'll try to come up with a smaller set of SNPs, say 100, that includes top hits from both methods along with random SNPs being the rest. Will see if there is still difference in behavior.
X2 = X[,unique(c(mix.top[1:2], bvs.top[1:3], sample(colnames(X), 100)))]
dim(X2)
initial = univariate_regression(X2, y ,Z)
mixsd = autoselect.mixsd(initial$betahat, initial$sebetahat)
mu_zero = matrix(0, ncol = length(mixsd)+1, nrow = ncol(X2))
alpha_zero = matrix(1/ncol(X2), ncol = length(mixsd)+1,nrow = ncol(X2))
alpha_zero[,1] = 1 - length(mixsd) / ncol(X2)
index_order = lasso_reorder(X2, initial$residuals)
# bvsmix with lasso init vs random init
fit31 = varbvs::varbvsmix(X2[, index_order],
Z, y, sa = c(0,mixsd^2),
mu = mu_zero,
alpha = alpha_zero,
verbose = F)
fit32 = varbvs::varbvsmix(X2,
Z, y, sa = c(0,mixsd^2),
verbose = F)
# bvsmix with lasso init vs random init
fit41 = varbvs::varbvs(X2[, index_order], Z, y, verbose = F)
fit42 = varbvs::varbvs(X2, Z, y, verbose = F)
mix.top2 = names(sort(fit31$lfsr)[1:5])
cbind(1 - fit31$lfsr[mix.top2], fit41$alpha[,1][mix.top2])
bvs.top2 = names(sort(fit41$alpha[,1], decreasing=T)[1:5])
cbind(1 - fit31$lfsr[bvs.top2], fit41$alpha[,1][bvs.top2])
With 7.5K SNPs it identifies:
chr1_171172098_C_T_b38
chr1_171133158_A_G_b38
chr1_171199984_T_C_b38
and it missed one from varbvsmix
:
chr1_171122735_A_G_b38
With 100 SNPs it now picks up the one previously missed according to varbvsmix
:
chr1_171172098_C_T_b38
chr1_171199984_T_C_b38
chr1_171122735_A_G_b38
But dropped
chr1_171133158_A_G_b38
With 7.5K SNPs it identifies
chr1_171172098_C_T_b38
chr1_171122735_A_G_b38
which overlaps with varbvs
on only chr1_171172098_C_T_b38
.
With 100 SNPs it now picks up 4 SNPs
chr1_171172098_C_T_b38
chr1_171199984_T_C_b38
chr1_171122735_A_G_b38
chr1_171133158_A_G_b38
Which is in fact the union of hits identified by both varbvsmix
and varbvs
when input is 7.5K SNPs
R[c('chr1_171172098_C_T_b38', 'chr1_171199984_T_C_b38', 'chr1_171122735_A_G_b38', 'chr1_171133158_A_G_b38'),c('chr1_171172098_C_T_b38', 'chr1_171199984_T_C_b38', 'chr1_171122735_A_G_b38', 'chr1_171133158_A_G_b38')]
# This is the top 4 markers identified with random varbvs init
markers3 <- c("chr1_171168633_C_A_b38",
"chr1_171147265_C_A_b38",
"chr1_171164750_C_A_b38",
"chr1_171178589_C_T_b38")
R[markers3, markers3]