Multivariate Bayesian variable selection regression

Analyzing a toy example with varbvs and mr-ash

In this example we analyze a data set where there is potentially multiple SNPs associated with gene expression. We use both bvs and mr-ash methods, initializing both of them with results from lasso.

Utility functions

In [1]:
autoselect.mixsd = function(betahat,sebetahat,mult = sqrt(2)){
        # To avoid exact measure causing (usually by mistake)
        sebetahat = sebetahat[sebetahat!=0] 
        # so that the minimum is small compared with measurement precision
        sigmaamin = min(sebetahat)/10 
        if (all(betahat^2 <= sebetahat^2)) {
            # to deal with the occassional odd case where this could happen; 8 is arbitrary
            sigmaamax = 8*sigmaamin 
        } else {
            # this computes a rough largest value you'd want to use, 
            # based on idea that sigmaamax^2 + sebetahat^2 should be at least betahat^2   
            sigmaamax = 2*sqrt(max(betahat^2-sebetahat^2)) 
        }
        if(mult==0){
            return(c(0,sigmaamax/2))
        } else {
            npoint = ceiling(log2(sigmaamax/sigmaamin)/log2(mult))
            return(mult^((-npoint):0) * sigmaamax)
        }
    }
univariate_regression = function(X,y,Z = NULL){
        P = dim(X)[2]
        if (!is.null(Z)) {
            y = lm(y~Z)$residuals
        }
        output = matrix(0,nrow = P,ncol = 2)
        for(i in 1:P){
          g = summary(lm(y ~ X[,i]))
          output[i,] = g$coefficients[2,1:2]
        }
        return(list(betahat = output[,1], sebetahat = output[,2], 
                    residuals = y))
    }
lasso_reorder = function(X, y) {
        # perform lasso regression and reorder regressors by "importance"
        fit.glmnet <- glmnet::glmnet(X, y, intercept = F)
        beta_path = coef(fit.glmnet)[-1,]
        K = dim(beta_path)[2]
        path_order = c()
        for (k in 1:K) {
            crt_path = which(beta_path[,k] != 0)
            if (length(crt_path) != 0 & length(path_order) == 0) {
                path_order = c(path_order, crt_path)
            } else if(length(crt_path) != 0) {
                path_order = c(path_order, crt_path[-which(crt_path %in% path_order)] )
            }
        }
        path_order = unname(path_order)
        index_order = c(path_order, seq(1,dim(beta_path)[1])[-path_order])
        return(index_order)
    }

# compute fitted values from varbvsmix 
fitfv = function(X,Z,fit){
  bhat = rowSums(fit$alpha*fit$mu)
  return(X %*% bhat + fit$mu.cov[1] + Z %*% fit$mu.cov[-1])
}

Core analysis

In [2]:
input_file = "../data/Thyroid.FMO2.1Mb.RDS"
dat = readRDS(input_file)
X = as.matrix(dat$X)
X = X[,which(colSums(X)!=0)]
storage.mode(X) <- "double"
y = as.vector(dat$y)
Z = as.matrix(dat$Z)
# univariate results
initial = univariate_regression(X, y ,Z)
mixsd = autoselect.mixsd(initial$betahat, initial$sebetahat)
mu_zero = matrix(0, ncol = length(mixsd)+1, nrow = ncol(X))
alpha_zero = matrix(1/ncol(X), ncol = length(mixsd)+1,nrow = ncol(X))
alpha_zero[,1] = 1 - length(mixsd) / ncol(X)
index_order = lasso_reorder(X, initial$residuals)
# bvsmix with lasso init vs random init
fit11 = varbvs::varbvsmix(X[, index_order], 
                         Z, y, sa = c(0,mixsd^2),
                         mu = mu_zero,
                         alpha = alpha_zero,
                         verbose = F)
fit12 = varbvs::varbvsmix(X, 
                         Z, y, sa = c(0,mixsd^2),
                         verbose = F)
# bvsmix with lasso init vs random init
fit21 = varbvs::varbvs(X[, index_order], Z, y, mu = mu_zero, alpha = alpha_zero, verbose = F)
fit22 = varbvs::varbvs(X, Z, y, verbose = F)

Check top hits with lasso init

In [3]:
mix.top = names(sort(fit11$lfsr)[1:5])
cbind(1 - fit11$lfsr[mix.top], fit21$alpha[,1][mix.top])
chr1_171172098_C_T_b381.0000000 1.000000e+00
chr1_171122735_A_G_b381.0000000 5.495719e-03
chr1_171190872_G_A_b380.4840977 1.914625e-05
chr1_171219393_T_A_b380.4827524 2.446432e-05
chr1_171178705_A_G_b380.4825416 2.025857e-05
In [4]:
bvs.top = names(sort(fit21$alpha[,1], decreasing=T)[1:5])
cbind(1 - fit11$lfsr[bvs.top], fit21$alpha[,1][bvs.top])
chr1_171172098_C_T_b381.0000000 0.99999999
chr1_171133158_A_G_b380.4573966 0.99999951
chr1_171199984_T_C_b380.4605322 0.99843195
chr1_171252314_G_C_b380.4608848 0.03125060
chr1_171150061_G_C_b380.4147678 0.01523159

Credible sets

Check if chr1_171122735_A_G_b38 is in credible set of varbvs result

In [5]:
get.credible.set <- function (bf, pr) {
  pp      <- bf/sum(bf)
  markers <- order(bf,decreasing = TRUE)
  n       <- min(which(cumsum(pp[markers]) > credint.prob))
  return(markers[1:n])
}
top.markers <- summary(fit21,nv = 20)$top.vars
top.markers <- subset(top.markers,prob > 0.95)$variable
m           <- length(top.markers)
R <- cor(X)
In [6]:
credint.prob <- 0.95
ld.threshold <- 0.2
cr           <- vector("list",m)
plots        <- vector("list",m)
names(cr)    <- top.markers
names(plots) <- top.markers
for (i in top.markers) {
  idx        <- which(colnames(X) == i)
  vars       <- which(R[,i]^2 >= ld.threshold)
  out        <- varbvs::varbvsproxybf(X,Z,y,fit21,idx,vars)
  cr[[i]]    <- vars[get.credible.set(out$BF[,1],credint.prob)]
  j          <- names(which.max(out$BF[,1]))
}
In [7]:
print(cr)
$chr1_171172098_C_T_b38
 chr1_171169344_G_A_b38  chr1_171168633_C_A_b38 chr1_171185504_AG_A_b38 
                   3824                    3821                    3903 
 chr1_171184438_A_G_b38  chr1_171172098_C_T_b38  chr1_171179725_T_A_b38 
                   3899                    3841                    3877 
 chr1_171176942_T_C_b38  chr1_171165549_G_C_b38 
                   3864                    3800 

$chr1_171133158_A_G_b38
 chr1_171115021_G_A_b38  chr1_171134102_T_C_b38  chr1_171133158_A_G_b38 
                   3576                    3650                    3646 
 chr1_171133705_G_A_b38  chr1_171132358_C_A_b38  chr1_171139697_C_T_b38 
                   3649                    3643                    3680 
 chr1_171114034_C_T_b38  chr1_171136655_G_A_b38  chr1_171135001_G_A_b38 
                   3568                    3664                    3655 
 chr1_171134913_G_A_b38  chr1_171127868_C_G_b38  chr1_171128489_C_A_b38 
                   3654                    3626                    3631 
chr1_171128484_A_AG_b38  chr1_171128486_A_G_b38 chr1_171116174_A_AT_b38 
                   3629                    3630                    3582 
 chr1_171135804_C_T_b38 
                   3659 

$chr1_171199984_T_C_b38
   chr1_171190872_G_A_b38    chr1_171164750_C_A_b38    chr1_171178705_A_G_b38 
                     3938                      3798                      3871 
   chr1_171198564_G_A_b38    chr1_171193299_C_T_b38 chr1_171198249_T_TACA_b38 
                     3998                      3963                      3994 
   chr1_171199984_T_C_b38   chr1_171196663_TG_T_b38    chr1_171193596_T_A_b38 
                     4012                      3987                      3967 
   chr1_171191559_T_C_b38   chr1_171190729_AT_A_b38    chr1_171189360_T_C_b38 
                     3946                      3937                      3930 
chr1_171193412_T_TGAC_b38    chr1_171192026_C_A_b38    chr1_171191344_A_T_b38 
                     3965                      3953                      3945 
   chr1_171190982_C_T_b38    chr1_171193444_T_C_b38    chr1_171174538_G_C_b38 
                     3940                      3966                      3852 

No, it is not ...

Narrow X to 100 SNPs

Next I'll try to come up with a smaller set of SNPs, say 100, that includes top hits from both methods along with random SNPs being the rest. Will see if there is still difference in behavior.

In [13]:
X2 = X[,unique(c(mix.top[1:2], bvs.top[1:3], sample(colnames(X), 100)))]
In [14]:
dim(X2)
  1. 574
  2. 104
In [18]:
initial = univariate_regression(X2, y ,Z)
mixsd = autoselect.mixsd(initial$betahat, initial$sebetahat)
mu_zero = matrix(0, ncol = length(mixsd)+1, nrow = ncol(X2))
alpha_zero = matrix(1/ncol(X2), ncol = length(mixsd)+1,nrow = ncol(X2))
alpha_zero[,1] = 1 - length(mixsd) / ncol(X2)
index_order = lasso_reorder(X2, initial$residuals)
# bvsmix with lasso init vs random init
fit31 = varbvs::varbvsmix(X2[, index_order], 
                         Z, y, sa = c(0,mixsd^2),
                         mu = mu_zero,
                         alpha = alpha_zero,
                         verbose = F)
fit32 = varbvs::varbvsmix(X2, 
                         Z, y, sa = c(0,mixsd^2),
                         verbose = F)
# bvsmix with lasso init vs random init
fit41 = varbvs::varbvs(X2[, index_order], Z, y, verbose = F)
fit42 = varbvs::varbvs(X2, Z, y, verbose = F)
In [22]:
mix.top2 = names(sort(fit31$lfsr)[1:5])
cbind(1 - fit31$lfsr[mix.top2], fit41$alpha[,1][mix.top2])
chr1_171172098_C_T_b381.0000000 1.00000000
chr1_171199984_T_C_b380.9999999 0.99999997
chr1_171122735_A_G_b380.9999943 1.00000000
chr1_171133158_A_G_b380.9951271 0.03176375
chr1_171397048_C_T_b380.7077927 0.07585271
In [23]:
bvs.top2 = names(sort(fit41$alpha[,1], decreasing=T)[1:5])
cbind(1 - fit31$lfsr[bvs.top2], fit41$alpha[,1][bvs.top2])
chr1_171172098_C_T_b381.0000000 1.00000000
chr1_171122735_A_G_b380.9999943 1.00000000
chr1_171199984_T_C_b380.9999999 0.99999997
chr1_171397048_C_T_b380.7077927 0.07585271
chr1_171133158_A_G_b380.9951271 0.03176375

Compare ~7.5K vs ~100 SNPs

varbvs

With 7.5K SNPs it identifies:

chr1_171172098_C_T_b38
chr1_171133158_A_G_b38
chr1_171199984_T_C_b38

and it missed one from varbvsmix:

chr1_171122735_A_G_b38

With 100 SNPs it now picks up the one previously missed according to varbvsmix:

chr1_171172098_C_T_b38  
chr1_171199984_T_C_b38  
chr1_171122735_A_G_b38

But dropped

chr1_171133158_A_G_b38

varbvsmix

With 7.5K SNPs it identifies

chr1_171172098_C_T_b38
chr1_171122735_A_G_b38

which overlaps with varbvs on only chr1_171172098_C_T_b38.

With 100 SNPs it now picks up 4 SNPs

chr1_171172098_C_T_b38  
chr1_171199984_T_C_b38  
chr1_171122735_A_G_b38
chr1_171133158_A_G_b38

Which is in fact the union of hits identified by both varbvsmix and varbvs when input is 7.5K SNPs

The top 4

In [24]:
R[c('chr1_171172098_C_T_b38', 'chr1_171199984_T_C_b38', 'chr1_171122735_A_G_b38', 'chr1_171133158_A_G_b38'),c('chr1_171172098_C_T_b38', 'chr1_171199984_T_C_b38', 'chr1_171122735_A_G_b38', 'chr1_171133158_A_G_b38')]
chr1_171172098_C_T_b38chr1_171199984_T_C_b38chr1_171122735_A_G_b38chr1_171133158_A_G_b38
chr1_171172098_C_T_b38 1.00000000-0.106125860.08283952 -0.16129847
chr1_171199984_T_C_b38-0.10612586 1.000000000.16520398 -0.04275903
chr1_171122735_A_G_b38 0.08283952 0.165203981.00000000 0.51658953
chr1_171133158_A_G_b38-0.16129847-0.042759030.51658953 1.00000000
In [26]:
# This is the top 4 markers identified with random varbvs init
markers3 <- c("chr1_171168633_C_A_b38",
              "chr1_171147265_C_A_b38",
              "chr1_171164750_C_A_b38",
              "chr1_171178589_C_T_b38")
R[markers3, markers3]
chr1_171168633_C_A_b38chr1_171147265_C_A_b38chr1_171164750_C_A_b38chr1_171178589_C_T_b38
chr1_171168633_C_A_b38 1.000000000.08365715 -0.1100207 -0.09399136
chr1_171147265_C_A_b38 0.083657151.00000000 0.1357039 0.25503481
chr1_171164750_C_A_b38-0.110020670.13570387 1.0000000 -0.11299131
chr1_171178589_C_T_b38-0.093991360.25503481 -0.1129913 1.00000000

Copyright © 2016-2020 Gao Wang et al at Stephens Lab, University of Chicago