UKB Blood Cells Prepare Data¶

Approach¶

For each data-set, take the strongest snp as the strong set
Also select from each data-set 2 "null" snps.
then try to run your estimate of Vhat to get Vhat first, and run Yunqi / Peter's ED

In UKB bloodcells, we have about 600 regions with number of SNPs between 1000 and 5000.

Workflow¶

[global]
parameter: cwd = path('/project2/mstephens/yuxin/ukb-bloodcells')
parameter: name = 'ukbbloodcells_prepare'
parameter: mixture_components = ['flash', 'pca', 'canonical']

%cd /project2/mstephens/yuxin/ukb-bloodcells

/project2/mstephens/gaow/mvarbvs/dsc/mnm_prototype/mnm_sumstats

Get top SNP and random close to null SNP per region¶

# extract data from summary stats
[extract_effects_1]
parameter: datadir = path
parameter: seed = 999
parameter: n_null = 2
# Analysis units file. For RDS files it can be generated by `ls *.rds | sed 's/\.rds//g' > analysis_units.txt`
parameter: analysis_units = path
# handle N = per_chunk data-set in one job
parameter: per_chunk = 1000
regions = [x.strip().split() for x in open(analysis_units).readlines() if x.strip() and not x.strip().startswith('#')]
input: [f'{datadir}/{x[0]}.rds' for x in regions], group_by = per_chunk
output: f"{cwd}/{name}/cache/{name}_{_index+1}.rds"
task: trunk_workers = 1, walltime = '1h', trunk_size = 1, mem = '4G', cores = 1, tags = f'{_output:bn}'
R: expand = "${ }"
    set.seed(${seed})
    matxMax <- function(mtx) {
      return(arrayInd(which.max(mtx), dim(mtx)))
    }
    remove_rownames = function(x) {
        for (name in names(x)) rownames(x[[name]]) = NULL
        return(x)
    }
    handle_nan_etc = function(x) {
      x$zhat[which(is.nan(x$bhat))] = 0
      return(x)
    }
    extract_one_data = function(infile, n_null) {
        # If cannot read the input for some reason then we just skip it, assuming we have other enough data-sets to use.
        dat = tryCatch(readRDS(infile)$Z, error = function(e) return(NULL))
        if (is.null(dat)) return(NULL)
        dat = as.matrix(dat)
        z = as.matrix(abs(dat))
        max_idx = matxMax(z)
        # strong effect samples
        strong = list(zhat = dat[max_idx[1],,drop=F])
        # null samples defined as |z| < 2
        null.id = which(apply(abs(dat), 1, max) < 2)
        if (length(null.id) == 0) {
          warning(paste("Null data is empty for input file", infile))
          null = list()
        } else {
          null_idx = sample(null.id, min(n_null, length(null.id)), replace = F)
          null = list(zhat = dat[null_idx,,drop=F])
        }
        dat = (list(null = remove_rownames(null), strong = remove_rownames(strong)))
        dat$null = handle_nan_etc(dat$null)
        dat$strong = handle_nan_etc(dat$strong)
        return(dat)
    }
    reformat_data = function(dat) {
        # make output consistent in format with 
        # https://github.com/stephenslab/gtexresults/blob/master/workflows/mashr_flashr_workflow.ipynb      
        res = list(strong.z = dat$strong$zhat, null.z = dat$null$zhat)
      return(res)
    }
    merge_data = function(res, one_data) {
      if (length(res) == 0) {
          return(one_data)
      } else if (is.null(one_data)) {
          return(res)
      } else {
          for (d in names(one_data)) {
            if (is.null(one_data[[d]])) {
              next
            } else {
                res[[d]] = rbind(res[[d]], one_data[[d]])
            }
          }
          return(res)
      }
    }
    res = list()
    for (f in c(${_input:r,})) {
      res = merge_data(res, reformat_data(extract_one_data(f, ${n_null})))
    }
    saveRDS(res, ${_output:r})

[extract_effects_2]
input: group_by = "all"
output: f"{cwd}/{name}.z.rds"
task: trunk_workers = 1, walltime = '1h', trunk_size = 1, mem = '16G', cores = 1, tags = f'{_output:bn}'
R: expand = "${ }"
    merge_data = function(res, one_data) {
      if (length(res) == 0) {
          return(one_data)
      } else {
          for (d in names(one_data)) {
            res[[d]] = rbind(res[[d]], one_data[[d]])
          }
          return(res)
      }
    }
    dat = list()
    for (f in c(${_input:r,})) {
      dat = merge_data(dat, readRDS(f))
    }
    # compute null correlation matrix
    dat$null.cor = cor(dat$null.z)
    # compute empirical covariance XtX
    dat$XtX = t(as.matrix(dat$strong.z)) %*% as.matrix(dat$strong.z) / nrow(dat$strong.z)
    
                       
    saveRDS(dat, ${_output:r})

To run it:

m=/project2/mstephens/yuxin/ukb-bloodcells/zscores
cd $m && ls *.rds | sed 's/\.rds//g' > analysis_units.txt && cd -
sos run /project2/mstephens/yuxin/mvarbvs/analysis/multivariate/20201221_ukb_ED_prior.ipynb extract_effects \
        --analysis-units $m/analysis_units.txt \
        --datadir $m &> extract_effects.log

Residual Covariance from Y¶

[Ycov]
parameter: Ydata = 'bloodcells.pheno.resid.txt'
input: f"{cwd}/{name}.z.rds"
output: f"{cwd}/{name}.rds"
R: expand = "${ }"
    library(data.table)
    traits = fread('/project2/mstephens/yuxin/ukb-bloodcells/bloodcells.pheno.resid.txt')
    Ycov = cov(traits[,3:18])
    dat = readRDS(${_input:r})
    dat$Ycov = Ycov
    saveRDS(dat, ${_output:r})

To run it:

sos run /project2/mstephens/yuxin/mvarbvs/analysis/multivariate/20201221_ukb_ED_prior.ipynb Ycov \
    &> Ycov.log

Run extreme deconvolution using `udr` or `mashr`¶

# Installed commit d6d4c0e
[ud]
# Method is `ed` or `teem`
parameter: ud_method = "ed"
parameter: residcor = 'Y'
# A list of models where we only update the scales and not the matrices
# A typical choice is to estimate scales only for canonical components
parameter: scale_only = []
input: [f"{cwd}/{name}.rds"] + [f"{cwd}/{name}.{m}.rds" for m in mixture_components]
output: f'{cwd}/{name}.{residcor}.{ud_method}.{"_unconstrained" if len(scale_only) == 0 else ""}.rds'
task: trunk_workers = 1, walltime = '36h', trunk_size = 1, mem = '10G', cores = 4, tags = f'{_output:bn}'
R: expand = "${ }", stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
    scale_only = c(${paths(["%s/%s.%s.rds" % (cwd, name, m) for m in scale_only]):r,})
    rds_files = c(${_input:r,})
    dat = readRDS(rds_files[1])
    U = list(XtX = dat$XtX)
    U_scaled = list()
    for (f in rds_files[2:length(rds_files)]) {
        if (f %in% scale_only) {
          U_scaled = c(U_scaled, readRDS(f))
        } else {
          U = c(U, readRDS(f))
        }
    }
    # Fit mixture model using udr package
    library(udr)
    message(paste("Running ${ud_method.upper()} via udr package for", length(U), "mixture components"))
    if (${residcor} == 'Y'){
        V = cov2cor(dat$Ycov)
    }else if (${residcor} == 'znull'){
        V = dat$null.cor
    }
    f0 = ud_init(X = as.matrix(dat$strong.z), V = V, U_scaled = U_scaled, U_unconstrained = U, n_rank1=0)
    res = ud_fit(f0, control = list(unconstrained.update = "${ud_method}", resid.update = 'none', scaled.update = "em", maxiter=5000, tol = 1e-06), verbose=TRUE)
    # add back col and row names to U
    # https://github.com/stephenslab/udr/issues/9
    for (i in 1:length(res$U)) {
        colnames(res$U[[i]]) = rownames(res$U[[i]]) = colnames(dat$strong.z)
    }
    saveRDS(list(U=res$U, w=res$w, loglik=res$loglik), ${_output:r})

[ed]
parameter: residcor = 'Y'
input: [f"{cwd}/{name}.rds"] + [f"{cwd}/{name}.{m}.rds" for m in mixture_components]
output: f"{cwd}/{name}.{residcor}.ed_bovy.rds"
task: trunk_workers = 1, walltime = '36h', trunk_size = 1, mem = '10G', cores = 4, tags = f'{_output:bn}'
R: expand = "${ }", stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
    rds_files = c(${_input:r,})
    dat = readRDS(rds_files[1])
    U = list(XtX = dat$XtX)
    for (f in rds_files[2:length(rds_files)]) U = c(U, readRDS(f))
    if (${residcor} == 'Y'){
        V = cov2cor(dat$Ycov)
    }else if (${residcor} == 'znull'){
        V = dat$null.cor
    }
    # Fit mixture model using ED code by J. Bovy
    mash_data = mashr::mash_set_data(dat$strong.z, V=V)
    message(paste("Running ED via J. Bovy's code for", length(U), "mixture components"))
    res = mashr:::bovy_wrapper(mash_data, U, logfile=${_output:nr}, tol = 1e-06)
    saveRDS(list(U=res$Ulist, w=res$pi, loglik=scan("${_output:n}_loglike.log")), ${_output:r})

Plot patterns of sharing¶

[plot_U]
parameter: model_data = path
# number of components to show
parameter: max_comp = -1
# whether or not to convert to correlation
parameter: to_cor = False
parameter: tol = "1E-6"
parameter: remove_label = False
input: model_data
output: f'{cwd:a}/{_input:bn}{("_" + name.replace("$", "_")) if name != "" else ""}.pdf'
R: expand = "${ }"
    plot_sharing = function(X, to_cor=FALSE, title="", remove_names=F) {
        clrs <- colorRampPalette(rev(c("#D73027","#FC8D59","#FEE090","#FFFFBF",
                                       "#E0F3F8","#91BFDB","#4575B4")))(128)
        if (to_cor) lat <- cov2cor(X)
        else lat = X/max(diag(X))
        lat[lower.tri(lat)] <- NA
        n <- nrow(lat)
        if (remove_names) {
          colnames(lat) = NULL
          rownames(lat) = NULL
        }
        return(lattice::levelplot(lat[n:1,],col.regions = clrs,
                                xlab = "",ylab = "", main=title,
                                colorkey = TRUE,at = seq(-1,1,length.out = 128),
                                scales = list(cex = 0.6,x = list(rot = 45))))
    }
  
    dat = readRDS(${_input:r})
    name = "${name}"
    if (name != "") {
      if (is.null(dat[[name]])) stop("Cannot find data ${name} in ${_input}")
        dat = dat[[name]]
    }
    if (is.null(names(dat$U))) names(dat$U) = paste0("Comp_", 1:length(dat$U))
    meta = data.frame(names(dat$U), dat$w, stringsAsFactors=F)
    colnames(meta) = c("U", "w")
    tol = ${tol}
    n_comp = length(meta$U[which(dat$w>tol)])
    meta = head(meta[order(meta[,2], decreasing = T),], ${max_comp if max_comp > 1 else "nrow(meta)"})
    message(paste(n_comp, "components out of", length(dat$w), "total components have weight greater than", tol))
    res = list()
    for (i in 1:n_comp) {
        title = paste(meta$U[i], "w =", round(meta$w[i], 6))
        res[[i]] = plot_sharing(dat$U[[meta$U[i]]], to_cor = ${"T" if to_cor else "F"}, 
                                title=title, remove_names = ${"TRUE" if remove_label else "FALSE"})
    }
    unit = 4
    n_col = 5
    n_row = ceiling(n_comp / n_col)
    pdf(${_output:r}, width = unit * n_col, height = unit * n_row)
    do.call(gridExtra::grid.arrange, c(res, list(ncol = n_col, nrow = n_row, 
                                                 bottom = "Data source: readRDS(${_input:br})${('$'+name) if name else ''}")))
    dev.off()

UKB Blood Cells Prepare Data¶

Approach¶

Workflow¶

Get top SNP and random close to null SNP per region¶

Residual Covariance from Y¶

Run extreme deconvolution using udr or mashr¶

Plot patterns of sharing¶

Run extreme deconvolution using `udr` or `mashr`¶