Multivariate Bayesian variable selection regression

Benchmark with mthess and atlas on a small scale simulation

The benchmark

./fixed_mix.dsc --target mthess -o mthess_20200526 -s existing -e ignore -c 38 --n_dataset 200 &> mnm_20200526.log

This pipeline ran for 3 days with a couple of mthess instances dragging behind. I ended up having to terminate it and work with what we have.

In [ ]:
[global]
parameter: cwd = path('~/GIT/github/mvarbvs/dsc/mnm_prototype')
parameter: data_dir = path('mthess_20200526/')
parameter: date = '0526'
def fmtP(x):
    return str(x).replace(".", "p")

PIP extraction

I'll for now only extract file names and get the values as needed after.

sos run 20200530_mthess_Benchmark.ipynb pip:1
In [ ]:
# Extract meta info
[pip_1,roc_1,global_pip_1]
output: f'{cwd}/{data_dir}/PIP_comparison_{date}.pips_meta.rds'
R: expand = '${ }', workdir = cwd
    dat = dscrutils::dscquery(${data_dir:r}, target = c("small_data.dataset", "simulate", "method", "method.resid_method"), module.output.files=c("simulate", "method"), group = c("method: mnm_oracle mnm_naive mnm_identity mnm_shared mthess atlasqtl", "mnm:", "mnm_missing:", "simulate: artificial_mixture_small"), ignore.missing.file=T)
    print(dim(dat))
    # remove bad files
    bad_files = vector()
    for (f in dat$method.output.file) {
      if (!file.exists(paste0("${data_dir}/", f,'.rds'))) bad_files = append(bad_files, f)
    }
    dat = dat[which(!(dat$method.output.file %in% bad_files)),]
    print(dim(dat))
    dat$method_rename = NA
    dat$method_rename[which(!is.na(dat$method.resid_method))] = paste(dat$method[which(!is.na(dat$method.resid_method))], dat$method.resid_method[which(!is.na(dat$method.resid_method))], sep = '+')
    dat$method_rename[which(is.na(dat$method.resid_method))] = dat$method[which(is.na(dat$method.resid_method))]
    saveRDS(dat, ${_output:r})
In [1]:
%cd ~/GIT/github/mvarbvs/dsc/mnm_prototype/mthess_20200526
/home/gw/GIT/github/mvarbvs/dsc/mnm_prototype/mthess_20200526
In [3]:
dat = readRDS('PIP_comparison_0526.pips_meta.rds')
In [4]:
head(dat)
A data.frame: 6 × 8
DSCsmall_data.datasetsimulatemethodmethod.resid_methodsimulate.output.filemethod.output.filemethod_rename
<int><chr><chr><chr><chr><chr><chr><chr>
11~/Documents/GTExV8/Multi_Tissue_Toys/ENSG00000000003.14.Multi_Tissues.rdsartificial_mixture_smallmnm_oracleoracleartificial_mixture_small/small_data_1_artificial_mixture_small_1mnm_oracle/small_data_1_artificial_mixture_small_1_mnm_oracle_1mnm_oracle+oracle
21~/Documents/GTExV8/Multi_Tissue_Toys/ENSG00000000005.5.Multi_Tissues.rds artificial_mixture_smallmnm_oracleoracleartificial_mixture_small/small_data_2_artificial_mixture_small_1mnm_oracle/small_data_2_artificial_mixture_small_1_mnm_oracle_1mnm_oracle+oracle
31~/Documents/GTExV8/Multi_Tissue_Toys/ENSG00000000419.12.Multi_Tissues.rdsartificial_mixture_smallmnm_oracleoracleartificial_mixture_small/small_data_3_artificial_mixture_small_1mnm_oracle/small_data_3_artificial_mixture_small_1_mnm_oracle_1mnm_oracle+oracle
41~/Documents/GTExV8/Multi_Tissue_Toys/ENSG00000000457.13.Multi_Tissues.rdsartificial_mixture_smallmnm_oracleoracleartificial_mixture_small/small_data_4_artificial_mixture_small_1mnm_oracle/small_data_4_artificial_mixture_small_1_mnm_oracle_1mnm_oracle+oracle
51~/Documents/GTExV8/Multi_Tissue_Toys/ENSG00000000460.16.Multi_Tissues.rdsartificial_mixture_smallmnm_oracleoracleartificial_mixture_small/small_data_5_artificial_mixture_small_1mnm_oracle/small_data_5_artificial_mixture_small_1_mnm_oracle_1mnm_oracle+oracle
61~/Documents/GTExV8/Multi_Tissue_Toys/ENSG00000000938.12.Multi_Tissues.rdsartificial_mixture_smallmnm_oracleoracleartificial_mixture_small/small_data_6_artificial_mixture_small_1mnm_oracle/small_data_6_artificial_mixture_small_1_mnm_oracle_1mnm_oracle+oracle
In [5]:
unique(dat$simulate)
'artificial_mixture_small'
sos run 20200530_mthess_Benchmark.ipynb pip:1-2
sos run 20200530_mthess_Benchmark.ipynb global_pip:1-2
In [5]:
# Extract PIP
[pip_2,roc_2]
parameter: simulate_method = ['artificial_mixture_small']
parameter: subset = -1
input: for_each = 'simulate_method'
output: f'{cwd}/{data_dir}/PIP_comparison_{date}.{_simulate_method}{("." + str(subset)) if subset>0 else ""}.pips.rds'
R: expand = '${ }', workdir = f'{cwd}/{data_dir}'
    meta = readRDS(${_input:r})
    # apply some filters
    meta = meta[which(meta$simulate == "${_simulate_method}"),]
    if (${subset}<nrow(meta) && ${subset}>0) {
      set.seed(999)
      meta = meta[sample(1:nrow(meta))[1:${subset}],]
    }
    # now collect matrices for each method, of two columns: pip and true_coef
    res = list()
    for (i in 1:nrow(meta)) {
      true_coef = as.integer(readRDS(paste0(meta[i,]$simulate.output.file, '.rds'))$meta$true_coef != 0)
      # make it a vector
      true_coef = c(true_coef)
      method = meta[i,]$method_rename
      if (method %in% c("mthess", 'atlasqtl')) {
          if (method == 'atlasqtl') {
              pip = readRDS(paste0(meta[i,]$method.output.file, '.rds'))$result$gam_vb_completed
          } else {
              pip = readRDS(paste0(meta[i,]$method.output.file, '.rds'))$result$pip_conditions
          }
      } else {
          tmp = readRDS(paste0(meta[i,]$method.output.file, '.rds'))$result
          # approximate per condition PIP using condition specific 1 - lfsr
          pip = 1 - mvsusieR::mvsusie_get_lfsr(tmp)
      }
      # PIP is matrix of P (SNPs) by R (conditions); now make it a vector
      pip = c(pip)
      if (!(method %in% names(res))) {
        res[[method]] = list(pip = pip, truth = true_coef)
      } else {
        res[[method]]$pip = append(res[[method]]$pip, pip)
        res[[method]]$truth = append(res[[method]]$truth, true_coef)
      }
      if (i%%100==0) print(i)
    }
    for (method in unique(meta$method_rename)) {
      res[[method]] = do.call(cbind, res[[method]])
    }
    saveRDS(res, ${_output:r})
  
[global_pip_2]
parameter: simulate_method = ['artificial_mixture_small']
parameter: subset = -1
input: for_each = 'simulate_method'
output: f'{cwd}/{data_dir}/PIP_comparison_{date}.{_simulate_method}{("." + str(subset)) if subset>0 else ""}.global.pips.rds'
R: expand = '${ }', workdir = f'{cwd}/{data_dir}'
    meta = readRDS(${_input:r})
    # apply some filters
    meta = meta[which(meta$simulate == "${_simulate_method}"),]
    if (${subset}<nrow(meta) && ${subset}>0) {
      set.seed(999)
      meta = meta[sample(1:nrow(meta))[1:${subset}],]
    }
    # now collect matrices for each method, of two columns: pip and true_coef
    res = list()
    for (i in 1:nrow(meta)) {
      true_coef = as.integer(rowSums(readRDS(paste0(meta[i,]$simulate.output.file, '.rds'))$meta$true_coef) != 0)
      method = meta[i,]$method_rename
      if (method %in% c("mthess", 'atlasqtl')) {
          if (method == 'atlasqtl') {
              pip = apply(readRDS(paste0(meta[i,]$method.output.file, '.rds'))$result$gam_vb_completed, 1, sum)
          } else {
              pip = apply(readRDS(paste0(meta[i,]$method.output.file, '.rds'))$result$pip_conditions, 1, sum)
          }
      } else {
          pip = readRDS(paste0(meta[i,]$method.output.file, '.rds'))$result$pip
      }
      if (!(method %in% names(res))) {
        res[[method]] = list(pip = pip, truth = true_coef)
      } else {
        res[[method]]$pip = append(res[[method]]$pip, pip)
        res[[method]]$truth = append(res[[method]]$truth, true_coef)
      }
      if (i%%100==0) print(i)
    }
    for (method in unique(meta$method_rename)) {
      if (!is.null(res[[method]])) res[[method]] = do.call(cbind, res[[method]])
    }
    saveRDS(res, ${_output:r})

PIP calibration

sos run 20200530_mthess_Benchmark.ipynb pip:1-4
sos run 20200530_mthess_Benchmark.ipynb global_pip:1-4
In [15]:
# Calibration data
[pip_3,global_pip_3]
parameter: bin_size = 10
output: f'{_input:nn}.pip_evaluation.rds'
R: expand = '${ }', workdir = cwd
  dat = readRDS(${_input:r})
  bins = cbind(seq(1:${bin_size})/${bin_size}-1/${bin_size}, seq(1:${bin_size})/${bin_size})
  pip_cali = list()
  for (method in names(dat)) {
    pip_cali[[method]] = matrix(NA, nrow(bins), 3)
    for (i in 1:nrow(bins)) {
      data_in_bin = dat[[method]][which(dat[[method]][,1] > bins[i,1] & dat[[method]][,1] < bins[i,2]),]
      if(!is.null(dim(data_in_bin))) {
          pip_cali[[method]][i,1] = sum(data_in_bin[,1])
          pip_cali[[method]][i,2] = sum(data_in_bin[,2])
          pip_cali[[method]][i,3] = nrow(data_in_bin)
      } else {
        pip_cali[[method]][i,] = c(0,0,0) 
      }
    }
  }
  for (method in names(dat)) {
      pip_cali[[method]][,c(1,2)] = pip_cali[[method]][,c(1,2)] / pip_cali[[method]][,3]
  }
  saveRDS(pip_cali, ${_output:r})
In [16]:
# Calibration plot
[pip_4,global_pip_4]
depends: R_library('cowplot'), executable('convert')
output: f'{_input:n}.png'
R: expand = '${ }', workdir = cwd
    library(ggplot2)
    library(cowplot)
    rename = list('mnm_oracle+oracle' = 'Oracle prior and residual', 'mnm_oracle+flash' = 'Oracle prior', 'mnm_naive+oracle' = 'Default prior oracle residual', 
                  'mnm_naive+flash' = 'Default prior', 'mnm_ed+oracle' = 'EB prior oracle residual', 'mnm_ed+flash' = 'EB prior', 
                  'mnm_identity+oracle' = 'Random effects prior oracle residual', 'mnm_identity+flash' = 'Random effects prior',
                  'mnm_shared+oracle' = 'Fixed effect prior oracle residual', 'mnm_shared+flash' = 'Fixed effect prior', atlasqtl = 'atlasqtl', mthess = 'mthess')
    dot_plot = function(dataframe) {
        ggplot(dataframe, aes(x=mean_pip, y=observed_freq)) + 
          geom_errorbar(aes(ymin=observed_freq-se, ymax=observed_freq+se), colour="gray", size = 0.2, width=.01) +
          geom_point(size=1.5, shape=21, fill="#002b36") + # 21 is filled circle
          xlab("Mean PIP") +
          ylab("Observed frequency") +
          coord_cartesian(ylim=c(0,1), xlim=c(0,1)) +
          geom_abline(slope=1,intercept=0,colour='red', size=0.2) +
          ggtitle(rename[[name]]) +
          expand_limits(y=0) +                        # Expand y range
          theme_cowplot()
    }
    dat = readRDS(${_input:r})
    idx = 0
    for (name in names(dat)) {
      idx = idx + 1
      dat[[name]][,3] = sqrt(dat[[name]][,2] * (1 - dat[[name]][,2]) / dat[[name]][,3]) * 2
      dat[[name]] = as.data.frame(dat[[name]])
      colnames(dat[[name]]) = c("mean_pip", "observed_freq", "se")
      pdf(paste0(${_output:nr}, '_' , idx, '.pdf'), width=3, height=3, pointsize=16)
      print(dot_plot(dat[[name]]))
      dev.off()
      system(paste0("convert -density 120 ", ${_output:nr}, '_' , idx, '.pdf', " ", ${_output:nr}, '_' , idx, '.png'))
    }
    files = paste0(${_output:nr}, '_', seq(1:idx), '.png')
    cmd = paste('convert +append', paste(files, collapse=" "), ${_output:r})
    system(cmd)
    system(paste('rm -f', paste(files, collapse=" ")))

Effect specific PIP / LFSR calibration

In [8]:
%preview PIP_comparison_0526.artificial_mixture_small.pip_evaluation.png
> PIP_comparison_0526.artificial_mixture_small.pip_evaluation.png (64.8 KiB):

Global PIP calibration

In [2]:
%preview PIP_comparison_0526.artificial_mixture_small.global.pip_evaluation.png
> PIP_comparison_0526.artificial_mixture_small.global.pip_evaluation.png (65.7 KiB):

ROC / PR curves

sos run 20200530_mthess_Benchmark.ipynb roc -s build
sos run 20200530_mthess_Benchmark.ipynb roc --table roc --xlim 0.006 -s build
sos run 20200530_mthess_Benchmark.ipynb global_roc -s build
sos run 20200530_mthess_Benchmark.ipynb global_roc --table roc --xlim 0.006 -s build
In [21]:
# Data for ROC
[roc_3]
pip_cutoff = 0.05
output: f'{_input:nn}.roc.rds'
R: expand = '${ }', workdir = cwd
    roc_data = function(d1, cutoff = c(${pip_cutoff}, 0.999), connect_org = T) {
        grid = 1000
        ttv = seq(1:grid)/grid
        ttv = ttv[which(ttv>=cutoff[1] & ttv<=cutoff[2])]
        rst1 = t(sapply(ttv, function(x) c(sum(d1[,2][d1[,1]>=x]), length(d1[,2][d1[,1]>=x]), sum(d1[,2][d1[,1]>=x]==0))))
        rst1 = cbind(rst1, sum(d1[,2]), sum(1-d1[,2]))
        rst1 = as.data.frame(rst1)
        colnames(rst1) = c('true_positive', 'total_positive', 'false_positive', 'total_signal', 'total_null')
        rst2 = as.data.frame(cbind(rst1$true_positive / rst1$total_positive, rst1$true_positive / rst1$total_signal,  ttv))
        rst3 = as.data.frame(cbind(1 - rst1$false_positive / rst1$total_null, rst1$true_positive / rst1$total_signal,  ttv))
        if (connect_org) {
            # make a stair to origin
            rst2 = rbind(rst2, c(max(0.995, rst2[nrow(rst2),1]), max(rst2[nrow(rst2),2]-0.01, 0), rst2[nrow(rst2),3]))
            rst2 = rbind(rst2, c(1, 0, 1))
            rst3 = rbind(rst3, c(1, 0, 1))
        }
        colnames(rst2) = c('Precision', 'Recall', 'Threshold')
        colnames(rst3) = c('TN', 'TP', 'Threshold')
        return(list(counts = rst1, pr = rst2, roc = rst3))
    }
  
    print("Computing ROC data ...")
    roc = list()
    dat = readRDS(${_input:r})
    for (method in names(dat)) {
      print(method)
      roc[[method]] = roc_data(dat[[method]])
    }
    saveRDS(roc, ${_output:r})
In [27]:
dat = readRDS('PIP_comparison_0510.artificial_mixture.global.roc.rds')
names(dat)
  1. 'mnm_oracle+oracle'
  2. 'mnm_oracle+flash'
  3. 'mnm_naive+oracle'
  4. 'mnm_naive+flash'
  5. 'mnm_ed+oracle'
  6. 'mnm_ed+flash'
  7. 'mnm_identity+oracle'
  8. 'mnm_identity+flash'
  9. 'mnm_shared+oracle'
  10. 'mnm_shared+flash'
In [32]:
# Plot for ROC
[roc_4]
depends: R_library('scam')
parameter: chunks = 0
parameter: smooth = 'FALSE'
parameter: xlim = 0.8
parameter: ylim = 0.8
# Only plot for certain methods
parameter: filter_cond = "method %in% c('mnm_oracle+flash', 'mnm_naive+flash', 'mnm_ed+flash', 'mnm_identity+flash', 'mnm_shared+flash', 'atlasqtl', 'mthess')"
# "pr" or "roc" 
parameter: table = "pr" # or, `roc`
if table == "pr":
    main = "FDR vs Power"
    ylab = "power"
    xlab = "FDR"
else:
    main = "ROC curve"
    ylab = "True Positive"
    xlab = "False Positive"
opt = f"lwd = 2, xlim = c(0,{xlim}), ylim = c(0,{ylim})"
output: f'{_input:nn}.{table}.pdf'
R: expand = '${ }'
    colors = c('#A60628', '#7A68A6', '#348ABD', '#467821', '#FF0000', '#188487', '#E2A233',
                  '#A9A9A9', '#000000', '#FF00FF', '#FFD700', '#ADFF2F', '#00FFFF')
    dat = readRDS(${_input:r})
    library(scam)
    create_chunks = function(item, n) {
      splitted = suppressWarnings(split(item, 1:n))
      return(c(splitted[[1]], splitted[[length(splitted)]][length(splitted[[length(splitted)]])]))
    }
    make_smooth = function(x,y,subset=${chunks}, smooth = ${smooth}) {
      if (smooth) {
          if (subset < length(x) && subset > 0) {
              x = create_chunks(x, subset)
              y = create_chunks(y, subset)
          }
          dat = data.frame(cbind(x,y))
          colnames(dat) = c('x','y')
          y=predict(scam(y ~ s(x, bs = "mpi"), data = dat))
      }
      return(list(x=x,y=y))
    }
    add_text = function(thresholds, x, y, threshold, color, delta = -0.075) {
        idx = which(thresholds == threshold)
        text(x[idx] - delta * ${ylim}, y[idx], labels = threshold, col = color, cex=0.8)
        points(x[idx],y[idx])
    }
    rename = list('mnm_oracle+oracle' = 'Oracle prior and residual', 'mnm_oracle+flash' = 'Oracle prior', 'mnm_naive+oracle' = 'Default prior oracle residual', 
                  'mnm_naive+flash' = 'Default prior', 'mnm_ed+oracle' = 'EB prior oracle residual', 'mnm_ed+flash' = 'EB prior', 
                  'mnm_identity+oracle' = 'Random effects prior oracle residual', 'mnm_identity+flash' = 'Random effects prior',
                  'mnm_shared+oracle' = 'Fixed effect prior oracle residual', 'mnm_shared+flash' = 'Fixed effect prior', atlasqtl = 'atlasqtl', mthess = 'mthess')
    labels = vector()
    pdf(${_output:r}, width=10, height=10, pointsize=15)
    i = 1
    for (method in names(dat)) {
        if (${filter_cond}) {
            yy = make_smooth(1 - dat[[method]]$${table}[,1], dat[[method]]$${table}[,2])
            if (i == 1) {
                plot(yy$x, yy$y, t="l", col=colors[i], ylab = "${ylab}", xlab ="${xlab}", main = "${main}", bty='l', ${opt})
            } else {
                lines(yy$x, yy$y, col=colors[i], ${opt})
            }
            #add_text(dat[[method]]$${table}[,3], yy$x, yy$y, 0.9, colors[i])
            add_text(dat[[method]]$${table}[,3], yy$x, yy$y, 0.95, colors[i])
            labels[i] = rename[[method]]
            i = i + 1
      }
    }
    legend("bottomright", legend=labels, col=colors[1:i], lty=c(1,1,1), cex=0.8)
    dev.off()
[global_roc]
sos_run('global_pip:1+global_pip:2+roc:3+roc:4')

PR curve for condition specific lfsr

In [5]:
%preview PIP_comparison_0526.artificial_mixture_small.roc.pdf -s png
> PIP_comparison_0526.artificial_mixture_small.roc.pdf (14.6 KiB):
In [4]:
%preview PIP_comparison_0526.artificial_mixture_small.pr.pdf -s png
> PIP_comparison_0526.artificial_mixture_small.pr.pdf (15.7 KiB):
In [ ]:


Copyright © 2016-2020 Gao Wang et al at Stephens Lab, University of Chicago