Multivariate Bayesian variable selection regression

UKB Blood Cells Prepare Data

Approach

  1. For each data-set, take the strongest snp as the strong set
  2. Also select from each data-set 2 "null" snps.
  3. then try to run your estimate of Vhat to get Vhat first, and run Yunqi / Peter's ED

In UKB bloodcells, we have about 600 regions with number of SNPs between 1000 and 5000.

Workflow

In [ ]:
[global]
parameter: cwd = path('/project2/mstephens/yuxin/ukb-bloodcells')
parameter: name = 'ukbbloodcells_prepare'
parameter: mixture_components = ['flash', 'pca', 'canonical']
In [1]:
%cd /project2/mstephens/yuxin/ukb-bloodcells
/project2/mstephens/gaow/mvarbvs/dsc/mnm_prototype/mnm_sumstats

Get top SNP and random close to null SNP per region

In [31]:
# extract data from summary stats
[extract_effects_1]
parameter: datadir = path
parameter: seed = 999
parameter: n_null = 2
# Analysis units file. For RDS files it can be generated by `ls *.rds | sed 's/\.rds//g' > analysis_units.txt`
parameter: analysis_units = path
# handle N = per_chunk data-set in one job
parameter: per_chunk = 1000
regions = [x.strip().split() for x in open(analysis_units).readlines() if x.strip() and not x.strip().startswith('#')]
input: [f'{datadir}/{x[0]}.rds' for x in regions], group_by = per_chunk
output: f"{cwd}/{name}/cache/{name}_{_index+1}.rds"
task: trunk_workers = 1, walltime = '1h', trunk_size = 1, mem = '4G', cores = 1, tags = f'{_output:bn}'
R: expand = "${ }"
    set.seed(${seed})
    matxMax <- function(mtx) {
      return(arrayInd(which.max(mtx), dim(mtx)))
    }
    remove_rownames = function(x) {
        for (name in names(x)) rownames(x[[name]]) = NULL
        return(x)
    }
    handle_nan_etc = function(x) {
      x$zhat[which(is.nan(x$bhat))] = 0
      return(x)
    }
    extract_one_data = function(infile, n_null) {
        # If cannot read the input for some reason then we just skip it, assuming we have other enough data-sets to use.
        dat = tryCatch(readRDS(infile)$Z, error = function(e) return(NULL))
        if (is.null(dat)) return(NULL)
        dat = as.matrix(dat)
        z = as.matrix(abs(dat))
        max_idx = matxMax(z)
        # strong effect samples
        strong = list(zhat = dat[max_idx[1],,drop=F])
        # null samples defined as |z| < 2
        null.id = which(apply(abs(dat), 1, max) < 2)
        if (length(null.id) == 0) {
          warning(paste("Null data is empty for input file", infile))
          null = list()
        } else {
          null_idx = sample(null.id, min(n_null, length(null.id)), replace = F)
          null = list(zhat = dat[null_idx,,drop=F])
        }
        dat = (list(null = remove_rownames(null), strong = remove_rownames(strong)))
        dat$null = handle_nan_etc(dat$null)
        dat$strong = handle_nan_etc(dat$strong)
        return(dat)
    }
    reformat_data = function(dat) {
        # make output consistent in format with 
        # https://github.com/stephenslab/gtexresults/blob/master/workflows/mashr_flashr_workflow.ipynb      
        res = list(strong.z = dat$strong$zhat, null.z = dat$null$zhat)
      return(res)
    }
    merge_data = function(res, one_data) {
      if (length(res) == 0) {
          return(one_data)
      } else if (is.null(one_data)) {
          return(res)
      } else {
          for (d in names(one_data)) {
            if (is.null(one_data[[d]])) {
              next
            } else {
                res[[d]] = rbind(res[[d]], one_data[[d]])
            }
          }
          return(res)
      }
    }
    res = list()
    for (f in c(${_input:r,})) {
      res = merge_data(res, reformat_data(extract_one_data(f, ${n_null})))
    }
    saveRDS(res, ${_output:r})

[extract_effects_2]
input: group_by = "all"
output: f"{cwd}/{name}.z.rds"
task: trunk_workers = 1, walltime = '1h', trunk_size = 1, mem = '16G', cores = 1, tags = f'{_output:bn}'
R: expand = "${ }"
    merge_data = function(res, one_data) {
      if (length(res) == 0) {
          return(one_data)
      } else {
          for (d in names(one_data)) {
            res[[d]] = rbind(res[[d]], one_data[[d]])
          }
          return(res)
      }
    }
    dat = list()
    for (f in c(${_input:r,})) {
      dat = merge_data(dat, readRDS(f))
    }
    # compute null correlation matrix
    dat$null.cor = cor(dat$null.z)
    # compute empirical covariance XtX
    dat$XtX = t(as.matrix(dat$strong.z)) %*% as.matrix(dat$strong.z) / nrow(dat$strong.z)
    
                       
    saveRDS(dat, ${_output:r})

To run it:

m=/project2/mstephens/yuxin/ukb-bloodcells/zscores
cd $m && ls *.rds | sed 's/\.rds//g' > analysis_units.txt && cd -
sos run /project2/mstephens/yuxin/mvarbvs/analysis/multivariate/20201221_ukb_ED_prior.ipynb extract_effects \
        --analysis-units $m/analysis_units.txt \
        --datadir $m &> extract_effects.log

Residual Covariance from Y

In [ ]:
[Ycov]
parameter: Ydata = 'bloodcells.pheno.resid.txt'
input: f"{cwd}/{name}.z.rds"
output: f"{cwd}/{name}.rds"
R: expand = "${ }"
    library(data.table)
    traits = fread('/project2/mstephens/yuxin/ukb-bloodcells/bloodcells.pheno.resid.txt')
    Ycov = cov(traits[,3:18])
    dat = readRDS(${_input:r})
    dat$Ycov = Ycov
    saveRDS(dat, ${_output:r})

To run it:

sos run /project2/mstephens/yuxin/mvarbvs/analysis/multivariate/20201221_ukb_ED_prior.ipynb Ycov \
    &> Ycov.log

Run extreme deconvolution using udr or mashr

In [ ]:
# Installed commit d6d4c0e
[ud]
# Method is `ed` or `teem`
parameter: ud_method = "ed"
parameter: residcor = 'Y'
# A list of models where we only update the scales and not the matrices
# A typical choice is to estimate scales only for canonical components
parameter: scale_only = []
input: [f"{cwd}/{name}.rds"] + [f"{cwd}/{name}.{m}.rds" for m in mixture_components]
output: f'{cwd}/{name}.{residcor}.{ud_method}.{"_unconstrained" if len(scale_only) == 0 else ""}.rds'
task: trunk_workers = 1, walltime = '36h', trunk_size = 1, mem = '10G', cores = 4, tags = f'{_output:bn}'
R: expand = "${ }", stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
    scale_only = c(${paths(["%s/%s.%s.rds" % (cwd, name, m) for m in scale_only]):r,})
    rds_files = c(${_input:r,})
    dat = readRDS(rds_files[1])
    U = list(XtX = dat$XtX)
    U_scaled = list()
    for (f in rds_files[2:length(rds_files)]) {
        if (f %in% scale_only) {
          U_scaled = c(U_scaled, readRDS(f))
        } else {
          U = c(U, readRDS(f))
        }
    }
    # Fit mixture model using udr package
    library(udr)
    message(paste("Running ${ud_method.upper()} via udr package for", length(U), "mixture components"))
    if (${residcor} == 'Y'){
        V = cov2cor(dat$Ycov)
    }else if (${residcor} == 'znull'){
        V = dat$null.cor
    }
    f0 = ud_init(X = as.matrix(dat$strong.z), V = V, U_scaled = U_scaled, U_unconstrained = U, n_rank1=0)
    res = ud_fit(f0, control = list(unconstrained.update = "${ud_method}", resid.update = 'none', scaled.update = "em", maxiter=5000, tol = 1e-06), verbose=TRUE)
    # add back col and row names to U
    # https://github.com/stephenslab/udr/issues/9
    for (i in 1:length(res$U)) {
        colnames(res$U[[i]]) = rownames(res$U[[i]]) = colnames(dat$strong.z)
    }
    saveRDS(list(U=res$U, w=res$w, loglik=res$loglik), ${_output:r})
In [ ]:
[ed]
parameter: residcor = 'Y'
input: [f"{cwd}/{name}.rds"] + [f"{cwd}/{name}.{m}.rds" for m in mixture_components]
output: f"{cwd}/{name}.{residcor}.ed_bovy.rds"
task: trunk_workers = 1, walltime = '36h', trunk_size = 1, mem = '10G', cores = 4, tags = f'{_output:bn}'
R: expand = "${ }", stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
    rds_files = c(${_input:r,})
    dat = readRDS(rds_files[1])
    U = list(XtX = dat$XtX)
    for (f in rds_files[2:length(rds_files)]) U = c(U, readRDS(f))
    if (${residcor} == 'Y'){
        V = cov2cor(dat$Ycov)
    }else if (${residcor} == 'znull'){
        V = dat$null.cor
    }
    # Fit mixture model using ED code by J. Bovy
    mash_data = mashr::mash_set_data(dat$strong.z, V=V)
    message(paste("Running ED via J. Bovy's code for", length(U), "mixture components"))
    res = mashr:::bovy_wrapper(mash_data, U, logfile=${_output:nr}, tol = 1e-06)
    saveRDS(list(U=res$Ulist, w=res$pi, loglik=scan("${_output:n}_loglike.log")), ${_output:r})

Plot patterns of sharing

In [ ]:
[plot_U]
parameter: model_data = path
# number of components to show
parameter: max_comp = -1
# whether or not to convert to correlation
parameter: to_cor = False
parameter: tol = "1E-6"
parameter: remove_label = False
input: model_data
output: f'{cwd:a}/{_input:bn}{("_" + name.replace("$", "_")) if name != "" else ""}.pdf'
R: expand = "${ }"
    plot_sharing = function(X, to_cor=FALSE, title="", remove_names=F) {
        clrs <- colorRampPalette(rev(c("#D73027","#FC8D59","#FEE090","#FFFFBF",
                                       "#E0F3F8","#91BFDB","#4575B4")))(128)
        if (to_cor) lat <- cov2cor(X)
        else lat = X/max(diag(X))
        lat[lower.tri(lat)] <- NA
        n <- nrow(lat)
        if (remove_names) {
          colnames(lat) = NULL
          rownames(lat) = NULL
        }
        return(lattice::levelplot(lat[n:1,],col.regions = clrs,
                                xlab = "",ylab = "", main=title,
                                colorkey = TRUE,at = seq(-1,1,length.out = 128),
                                scales = list(cex = 0.6,x = list(rot = 45))))
    }
  
    dat = readRDS(${_input:r})
    name = "${name}"
    if (name != "") {
      if (is.null(dat[[name]])) stop("Cannot find data ${name} in ${_input}")
        dat = dat[[name]]
    }
    if (is.null(names(dat$U))) names(dat$U) = paste0("Comp_", 1:length(dat$U))
    meta = data.frame(names(dat$U), dat$w, stringsAsFactors=F)
    colnames(meta) = c("U", "w")
    tol = ${tol}
    n_comp = length(meta$U[which(dat$w>tol)])
    meta = head(meta[order(meta[,2], decreasing = T),], ${max_comp if max_comp > 1 else "nrow(meta)"})
    message(paste(n_comp, "components out of", length(dat$w), "total components have weight greater than", tol))
    res = list()
    for (i in 1:n_comp) {
        title = paste(meta$U[i], "w =", round(meta$w[i], 6))
        res[[i]] = plot_sharing(dat$U[[meta$U[i]]], to_cor = ${"T" if to_cor else "F"}, 
                                title=title, remove_names = ${"TRUE" if remove_label else "FALSE"})
    }
    unit = 4
    n_col = 5
    n_row = ceiling(n_comp / n_col)
    pdf(${_output:r}, width = unit * n_col, height = unit * n_row)
    do.call(gridExtra::grid.arrange, c(res, list(ncol = n_col, nrow = n_row, 
                                                 bottom = "Data source: readRDS(${_input:br})${('$'+name) if name else ''}")))
    dev.off()

Copyright © 2016-2020 Gao Wang et al at Stephens Lab, University of Chicago