Analyzing a toy example with `varbvs` and `mr-ash`¶

In this example we analyze a data set where there is potentially multiple SNPs associated with gene expression. We use both bvs and mr-ash methods, initializing both of them with results from lasso.

Utility functions¶

autoselect.mixsd = function(betahat,sebetahat,mult = sqrt(2)){
        # To avoid exact measure causing (usually by mistake)
        sebetahat = sebetahat[sebetahat!=0] 
        # so that the minimum is small compared with measurement precision
        sigmaamin = min(sebetahat)/10 
        if (all(betahat^2 <= sebetahat^2)) {
            # to deal with the occassional odd case where this could happen; 8 is arbitrary
            sigmaamax = 8*sigmaamin 
        } else {
            # this computes a rough largest value you'd want to use, 
            # based on idea that sigmaamax^2 + sebetahat^2 should be at least betahat^2   
            sigmaamax = 2*sqrt(max(betahat^2-sebetahat^2)) 
        }
        if(mult==0){
            return(c(0,sigmaamax/2))
        } else {
            npoint = ceiling(log2(sigmaamax/sigmaamin)/log2(mult))
            return(mult^((-npoint):0) * sigmaamax)
        }
    }
univariate_regression = function(X,y,Z = NULL){
        P = dim(X)[2]
        if (!is.null(Z)) {
            y = lm(y~Z)$residuals
        }
        output = matrix(0,nrow = P,ncol = 2)
        for(i in 1:P){
          g = summary(lm(y ~ X[,i]))
          output[i,] = g$coefficients[2,1:2]
        }
        return(list(betahat = output[,1], sebetahat = output[,2], 
                    residuals = y))
    }
lasso_reorder = function(X, y) {
        # perform lasso regression and reorder regressors by "importance"
        fit.glmnet <- glmnet::glmnet(X, y, intercept = F)
        beta_path = coef(fit.glmnet)[-1,]
        K = dim(beta_path)[2]
        path_order = c()
        for (k in 1:K) {
            crt_path = which(beta_path[,k] != 0)
            if (length(crt_path) != 0 & length(path_order) == 0) {
                path_order = c(path_order, crt_path)
            } else if(length(crt_path) != 0) {
                path_order = c(path_order, crt_path[-which(crt_path %in% path_order)] )
            }
        }
        path_order = unname(path_order)
        index_order = c(path_order, seq(1,dim(beta_path)[1])[-path_order])
        return(index_order)
    }

# compute fitted values from varbvsmix 
fitfv = function(X,Z,fit){
  bhat = rowSums(fit$alpha*fit$mu)
  return(X %*% bhat + fit$mu.cov[1] + Z %*% fit$mu.cov[-1])
}

Core analysis¶

input_file = "../data/Thyroid.FMO2.1Mb.RDS"
dat = readRDS(input_file)
X = as.matrix(dat$X)
X = X[,which(colSums(X)!=0)]
storage.mode(X) <- "double"
y = as.vector(dat$y)
Z = as.matrix(dat$Z)
# univariate results
initial = univariate_regression(X, y ,Z)
mixsd = autoselect.mixsd(initial$betahat, initial$sebetahat)
mu_zero = matrix(0, ncol = length(mixsd)+1, nrow = ncol(X))
alpha_zero = matrix(1/ncol(X), ncol = length(mixsd)+1,nrow = ncol(X))
alpha_zero[,1] = 1 - length(mixsd) / ncol(X)
index_order = lasso_reorder(X, initial$residuals)
# bvsmix with lasso init vs random init
fit11 = varbvs::varbvsmix(X[, index_order], 
                         Z, y, sa = c(0,mixsd^2),
                         mu = mu_zero,
                         alpha = alpha_zero,
                         verbose = F)
fit12 = varbvs::varbvsmix(X, 
                         Z, y, sa = c(0,mixsd^2),
                         verbose = F)
# bvsmix with lasso init vs random init
fit21 = varbvs::varbvs(X[, index_order], Z, y, mu = mu_zero, alpha = alpha_zero, verbose = F)
fit22 = varbvs::varbvs(X, Z, y, verbose = F)

Check top hits with lasso init¶

mix.top = names(sort(fit11$lfsr)[1:5])
cbind(1 - fit11$lfsr[mix.top], fit21$alpha[,1][mix.top])

bvs.top = names(sort(fit21$alpha[,1], decreasing=T)[1:5])
cbind(1 - fit11$lfsr[bvs.top], fit21$alpha[,1][bvs.top])

Credible sets¶

Check if chr1_171122735_A_G_b38 is in credible set of varbvs result

get.credible.set <- function (bf, pr) {
  pp      <- bf/sum(bf)
  markers <- order(bf,decreasing = TRUE)
  n       <- min(which(cumsum(pp[markers]) > credint.prob))
  return(markers[1:n])
}
top.markers <- summary(fit21,nv = 20)$top.vars
top.markers <- subset(top.markers,prob > 0.95)$variable
m           <- length(top.markers)
R <- cor(X)

credint.prob <- 0.95
ld.threshold <- 0.2
cr           <- vector("list",m)
plots        <- vector("list",m)
names(cr)    <- top.markers
names(plots) <- top.markers
for (i in top.markers) {
  idx        <- which(colnames(X) == i)
  vars       <- which(R[,i]^2 >= ld.threshold)
  out        <- varbvs::varbvsproxybf(X,Z,y,fit21,idx,vars)
  cr[[i]]    <- vars[get.credible.set(out$BF[,1],credint.prob)]
  j          <- names(which.max(out$BF[,1]))
}

print(cr)

$chr1_171172098_C_T_b38
 chr1_171169344_G_A_b38  chr1_171168633_C_A_b38 chr1_171185504_AG_A_b38 
                   3824                    3821                    3903 
 chr1_171184438_A_G_b38  chr1_171172098_C_T_b38  chr1_171179725_T_A_b38 
                   3899                    3841                    3877 
 chr1_171176942_T_C_b38  chr1_171165549_G_C_b38 
                   3864                    3800 

$chr1_171133158_A_G_b38
 chr1_171115021_G_A_b38  chr1_171134102_T_C_b38  chr1_171133158_A_G_b38 
                   3576                    3650                    3646 
 chr1_171133705_G_A_b38  chr1_171132358_C_A_b38  chr1_171139697_C_T_b38 
                   3649                    3643                    3680 
 chr1_171114034_C_T_b38  chr1_171136655_G_A_b38  chr1_171135001_G_A_b38 
                   3568                    3664                    3655 
 chr1_171134913_G_A_b38  chr1_171127868_C_G_b38  chr1_171128489_C_A_b38 
                   3654                    3626                    3631 
chr1_171128484_A_AG_b38  chr1_171128486_A_G_b38 chr1_171116174_A_AT_b38 
                   3629                    3630                    3582 
 chr1_171135804_C_T_b38 
                   3659 

$chr1_171199984_T_C_b38
   chr1_171190872_G_A_b38    chr1_171164750_C_A_b38    chr1_171178705_A_G_b38 
                     3938                      3798                      3871 
   chr1_171198564_G_A_b38    chr1_171193299_C_T_b38 chr1_171198249_T_TACA_b38 
                     3998                      3963                      3994 
   chr1_171199984_T_C_b38   chr1_171196663_TG_T_b38    chr1_171193596_T_A_b38 
                     4012                      3987                      3967 
   chr1_171191559_T_C_b38   chr1_171190729_AT_A_b38    chr1_171189360_T_C_b38 
                     3946                      3937                      3930 
chr1_171193412_T_TGAC_b38    chr1_171192026_C_A_b38    chr1_171191344_A_T_b38 
                     3965                      3953                      3945 
   chr1_171190982_C_T_b38    chr1_171193444_T_C_b38    chr1_171174538_G_C_b38 
                     3940                      3966                      3852

No, it is not ...

Narrow X to 100 SNPs¶

Next I'll try to come up with a smaller set of SNPs, say 100, that includes top hits from both methods along with random SNPs being the rest. Will see if there is still difference in behavior.

X2 = X[,unique(c(mix.top[1:2], bvs.top[1:3], sample(colnames(X), 100)))]

dim(X2)

initial = univariate_regression(X2, y ,Z)
mixsd = autoselect.mixsd(initial$betahat, initial$sebetahat)
mu_zero = matrix(0, ncol = length(mixsd)+1, nrow = ncol(X2))
alpha_zero = matrix(1/ncol(X2), ncol = length(mixsd)+1,nrow = ncol(X2))
alpha_zero[,1] = 1 - length(mixsd) / ncol(X2)
index_order = lasso_reorder(X2, initial$residuals)
# bvsmix with lasso init vs random init
fit31 = varbvs::varbvsmix(X2[, index_order], 
                         Z, y, sa = c(0,mixsd^2),
                         mu = mu_zero,
                         alpha = alpha_zero,
                         verbose = F)
fit32 = varbvs::varbvsmix(X2, 
                         Z, y, sa = c(0,mixsd^2),
                         verbose = F)
# bvsmix with lasso init vs random init
fit41 = varbvs::varbvs(X2[, index_order], Z, y, verbose = F)
fit42 = varbvs::varbvs(X2, Z, y, verbose = F)

mix.top2 = names(sort(fit31$lfsr)[1:5])
cbind(1 - fit31$lfsr[mix.top2], fit41$alpha[,1][mix.top2])

bvs.top2 = names(sort(fit41$alpha[,1], decreasing=T)[1:5])
cbind(1 - fit31$lfsr[bvs.top2], fit41$alpha[,1][bvs.top2])

Compare ~7.5K vs ~100 SNPs¶

varbvs¶

With 7.5K SNPs it identifies:

chr1_171172098_C_T_b38
chr1_171133158_A_G_b38
chr1_171199984_T_C_b38

and it missed one from varbvsmix:

chr1_171122735_A_G_b38

With 100 SNPs it now picks up the one previously missed according to varbvsmix:

chr1_171172098_C_T_b38  
chr1_171199984_T_C_b38  
chr1_171122735_A_G_b38

But dropped

chr1_171133158_A_G_b38

varbvsmix¶

With 7.5K SNPs it identifies

chr1_171172098_C_T_b38
chr1_171122735_A_G_b38

which overlaps with varbvs on only chr1_171172098_C_T_b38.

With 100 SNPs it now picks up 4 SNPs

chr1_171172098_C_T_b38  
chr1_171199984_T_C_b38  
chr1_171122735_A_G_b38
chr1_171133158_A_G_b38

Which is in fact the union of hits identified by both varbvsmix and varbvs when input is 7.5K SNPs

The top 4¶

R[c('chr1_171172098_C_T_b38', 'chr1_171199984_T_C_b38', 'chr1_171122735_A_G_b38', 'chr1_171133158_A_G_b38'),c('chr1_171172098_C_T_b38', 'chr1_171199984_T_C_b38', 'chr1_171122735_A_G_b38', 'chr1_171133158_A_G_b38')]

# This is the top 4 markers identified with random varbvs init
markers3 <- c("chr1_171168633_C_A_b38",
              "chr1_171147265_C_A_b38",
              "chr1_171164750_C_A_b38",
              "chr1_171178589_C_T_b38")
R[markers3, markers3]

chr1_171172098_C_T_b38	1.0000000	1.000000e+00
chr1_171122735_A_G_b38	1.0000000	5.495719e-03
chr1_171190872_G_A_b38	0.4840977	1.914625e-05
chr1_171219393_T_A_b38	0.4827524	2.446432e-05
chr1_171178705_A_G_b38	0.4825416	2.025857e-05

chr1_171172098_C_T_b38	1.0000000	0.99999999
chr1_171133158_A_G_b38	0.4573966	0.99999951
chr1_171199984_T_C_b38	0.4605322	0.99843195
chr1_171252314_G_C_b38	0.4608848	0.03125060
chr1_171150061_G_C_b38	0.4147678	0.01523159

chr1_171172098_C_T_b38	1.0000000	1.00000000
chr1_171199984_T_C_b38	0.9999999	0.99999997
chr1_171122735_A_G_b38	0.9999943	1.00000000
chr1_171133158_A_G_b38	0.9951271	0.03176375
chr1_171397048_C_T_b38	0.7077927	0.07585271

chr1_171172098_C_T_b38	1.0000000	1.00000000
chr1_171122735_A_G_b38	0.9999943	1.00000000
chr1_171199984_T_C_b38	0.9999999	0.99999997
chr1_171397048_C_T_b38	0.7077927	0.07585271
chr1_171133158_A_G_b38	0.9951271	0.03176375

	chr1_171172098_C_T_b38	chr1_171199984_T_C_b38	chr1_171122735_A_G_b38	chr1_171133158_A_G_b38
chr1_171172098_C_T_b38	1.00000000	-0.10612586	0.08283952	-0.16129847
chr1_171199984_T_C_b38	-0.10612586	1.00000000	0.16520398	-0.04275903
chr1_171122735_A_G_b38	0.08283952	0.16520398	1.00000000	0.51658953
chr1_171133158_A_G_b38	-0.16129847	-0.04275903	0.51658953	1.00000000

	chr1_171168633_C_A_b38	chr1_171147265_C_A_b38	chr1_171164750_C_A_b38	chr1_171178589_C_T_b38
chr1_171168633_C_A_b38	1.00000000	0.08365715	-0.1100207	-0.09399136
chr1_171147265_C_A_b38	0.08365715	1.00000000	0.1357039	0.25503481
chr1_171164750_C_A_b38	-0.11002067	0.13570387	1.0000000	-0.11299131
chr1_171178589_C_T_b38	-0.09399136	0.25503481	-0.1129913	1.00000000

Analyzing a toy example with varbvs and mr-ash¶