In UKB bloodcells, we have about 600 regions with number of SNPs between 1000 and 5000.
[global]
parameter: cwd = path('/project2/mstephens/yuxin/ukb-bloodcells')
parameter: name = 'ukbbloodcells_prepare'
parameter: mixture_components = ['flash', 'pca', 'canonical']
%cd /project2/mstephens/yuxin/ukb-bloodcells
# extract data from summary stats
[extract_effects_1]
parameter: datadir = path
parameter: seed = 999
parameter: n_null = 2
# Analysis units file. For RDS files it can be generated by `ls *.rds | sed 's/\.rds//g' > analysis_units.txt`
parameter: analysis_units = path
# handle N = per_chunk data-set in one job
parameter: per_chunk = 1000
regions = [x.strip().split() for x in open(analysis_units).readlines() if x.strip() and not x.strip().startswith('#')]
input: [f'{datadir}/{x[0]}.rds' for x in regions], group_by = per_chunk
output: f"{cwd}/{name}/cache/{name}_{_index+1}.rds"
task: trunk_workers = 1, walltime = '1h', trunk_size = 1, mem = '4G', cores = 1, tags = f'{_output:bn}'
R: expand = "${ }"
set.seed(${seed})
matxMax <- function(mtx) {
return(arrayInd(which.max(mtx), dim(mtx)))
}
remove_rownames = function(x) {
for (name in names(x)) rownames(x[[name]]) = NULL
return(x)
}
handle_nan_etc = function(x) {
x$zhat[which(is.nan(x$bhat))] = 0
return(x)
}
extract_one_data = function(infile, n_null) {
# If cannot read the input for some reason then we just skip it, assuming we have other enough data-sets to use.
dat = tryCatch(readRDS(infile)$Z, error = function(e) return(NULL))
if (is.null(dat)) return(NULL)
dat = as.matrix(dat)
z = as.matrix(abs(dat))
max_idx = matxMax(z)
# strong effect samples
strong = list(zhat = dat[max_idx[1],,drop=F])
# null samples defined as |z| < 2
null.id = which(apply(abs(dat), 1, max) < 2)
if (length(null.id) == 0) {
warning(paste("Null data is empty for input file", infile))
null = list()
} else {
null_idx = sample(null.id, min(n_null, length(null.id)), replace = F)
null = list(zhat = dat[null_idx,,drop=F])
}
dat = (list(null = remove_rownames(null), strong = remove_rownames(strong)))
dat$null = handle_nan_etc(dat$null)
dat$strong = handle_nan_etc(dat$strong)
return(dat)
}
reformat_data = function(dat) {
# make output consistent in format with
# https://github.com/stephenslab/gtexresults/blob/master/workflows/mashr_flashr_workflow.ipynb
res = list(strong.z = dat$strong$zhat, null.z = dat$null$zhat)
return(res)
}
merge_data = function(res, one_data) {
if (length(res) == 0) {
return(one_data)
} else if (is.null(one_data)) {
return(res)
} else {
for (d in names(one_data)) {
if (is.null(one_data[[d]])) {
next
} else {
res[[d]] = rbind(res[[d]], one_data[[d]])
}
}
return(res)
}
}
res = list()
for (f in c(${_input:r,})) {
res = merge_data(res, reformat_data(extract_one_data(f, ${n_null})))
}
saveRDS(res, ${_output:r})
[extract_effects_2]
input: group_by = "all"
output: f"{cwd}/{name}.z.rds"
task: trunk_workers = 1, walltime = '1h', trunk_size = 1, mem = '16G', cores = 1, tags = f'{_output:bn}'
R: expand = "${ }"
merge_data = function(res, one_data) {
if (length(res) == 0) {
return(one_data)
} else {
for (d in names(one_data)) {
res[[d]] = rbind(res[[d]], one_data[[d]])
}
return(res)
}
}
dat = list()
for (f in c(${_input:r,})) {
dat = merge_data(dat, readRDS(f))
}
# compute null correlation matrix
dat$null.cor = cor(dat$null.z)
# compute empirical covariance XtX
dat$XtX = t(as.matrix(dat$strong.z)) %*% as.matrix(dat$strong.z) / nrow(dat$strong.z)
saveRDS(dat, ${_output:r})
To run it:
m=/project2/mstephens/yuxin/ukb-bloodcells/zscores
cd $m && ls *.rds | sed 's/\.rds//g' > analysis_units.txt && cd -
sos run /project2/mstephens/yuxin/mvarbvs/analysis/multivariate/20201221_ukb_ED_prior.ipynb extract_effects \
--analysis-units $m/analysis_units.txt \
--datadir $m &> extract_effects.log
[Ycov]
parameter: Ydata = 'bloodcells.pheno.resid.txt'
input: f"{cwd}/{name}.z.rds"
output: f"{cwd}/{name}.rds"
R: expand = "${ }"
library(data.table)
traits = fread('/project2/mstephens/yuxin/ukb-bloodcells/bloodcells.pheno.resid.txt')
Ycov = cov(traits[,3:18])
dat = readRDS(${_input:r})
dat$Ycov = Ycov
saveRDS(dat, ${_output:r})
To run it:
sos run /project2/mstephens/yuxin/mvarbvs/analysis/multivariate/20201221_ukb_ED_prior.ipynb Ycov \
&> Ycov.log
udr
or mashr
¶# Installed commit d6d4c0e
[ud]
# Method is `ed` or `teem`
parameter: ud_method = "ed"
parameter: residcor = 'Y'
# A list of models where we only update the scales and not the matrices
# A typical choice is to estimate scales only for canonical components
parameter: scale_only = []
input: [f"{cwd}/{name}.rds"] + [f"{cwd}/{name}.{m}.rds" for m in mixture_components]
output: f'{cwd}/{name}.{residcor}.{ud_method}.{"_unconstrained" if len(scale_only) == 0 else ""}.rds'
task: trunk_workers = 1, walltime = '36h', trunk_size = 1, mem = '10G', cores = 4, tags = f'{_output:bn}'
R: expand = "${ }", stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
scale_only = c(${paths(["%s/%s.%s.rds" % (cwd, name, m) for m in scale_only]):r,})
rds_files = c(${_input:r,})
dat = readRDS(rds_files[1])
U = list(XtX = dat$XtX)
U_scaled = list()
for (f in rds_files[2:length(rds_files)]) {
if (f %in% scale_only) {
U_scaled = c(U_scaled, readRDS(f))
} else {
U = c(U, readRDS(f))
}
}
# Fit mixture model using udr package
library(udr)
message(paste("Running ${ud_method.upper()} via udr package for", length(U), "mixture components"))
if (${residcor} == 'Y'){
V = cov2cor(dat$Ycov)
}else if (${residcor} == 'znull'){
V = dat$null.cor
}
f0 = ud_init(X = as.matrix(dat$strong.z), V = V, U_scaled = U_scaled, U_unconstrained = U, n_rank1=0)
res = ud_fit(f0, control = list(unconstrained.update = "${ud_method}", resid.update = 'none', scaled.update = "em", maxiter=5000, tol = 1e-06), verbose=TRUE)
# add back col and row names to U
# https://github.com/stephenslab/udr/issues/9
for (i in 1:length(res$U)) {
colnames(res$U[[i]]) = rownames(res$U[[i]]) = colnames(dat$strong.z)
}
saveRDS(list(U=res$U, w=res$w, loglik=res$loglik), ${_output:r})
[ed]
parameter: residcor = 'Y'
input: [f"{cwd}/{name}.rds"] + [f"{cwd}/{name}.{m}.rds" for m in mixture_components]
output: f"{cwd}/{name}.{residcor}.ed_bovy.rds"
task: trunk_workers = 1, walltime = '36h', trunk_size = 1, mem = '10G', cores = 4, tags = f'{_output:bn}'
R: expand = "${ }", stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
rds_files = c(${_input:r,})
dat = readRDS(rds_files[1])
U = list(XtX = dat$XtX)
for (f in rds_files[2:length(rds_files)]) U = c(U, readRDS(f))
if (${residcor} == 'Y'){
V = cov2cor(dat$Ycov)
}else if (${residcor} == 'znull'){
V = dat$null.cor
}
# Fit mixture model using ED code by J. Bovy
mash_data = mashr::mash_set_data(dat$strong.z, V=V)
message(paste("Running ED via J. Bovy's code for", length(U), "mixture components"))
res = mashr:::bovy_wrapper(mash_data, U, logfile=${_output:nr}, tol = 1e-06)
saveRDS(list(U=res$Ulist, w=res$pi, loglik=scan("${_output:n}_loglike.log")), ${_output:r})
[plot_U]
parameter: model_data = path
# number of components to show
parameter: max_comp = -1
# whether or not to convert to correlation
parameter: to_cor = False
parameter: tol = "1E-6"
parameter: remove_label = False
input: model_data
output: f'{cwd:a}/{_input:bn}{("_" + name.replace("$", "_")) if name != "" else ""}.pdf'
R: expand = "${ }"
plot_sharing = function(X, to_cor=FALSE, title="", remove_names=F) {
clrs <- colorRampPalette(rev(c("#D73027","#FC8D59","#FEE090","#FFFFBF",
"#E0F3F8","#91BFDB","#4575B4")))(128)
if (to_cor) lat <- cov2cor(X)
else lat = X/max(diag(X))
lat[lower.tri(lat)] <- NA
n <- nrow(lat)
if (remove_names) {
colnames(lat) = NULL
rownames(lat) = NULL
}
return(lattice::levelplot(lat[n:1,],col.regions = clrs,
xlab = "",ylab = "", main=title,
colorkey = TRUE,at = seq(-1,1,length.out = 128),
scales = list(cex = 0.6,x = list(rot = 45))))
}
dat = readRDS(${_input:r})
name = "${name}"
if (name != "") {
if (is.null(dat[[name]])) stop("Cannot find data ${name} in ${_input}")
dat = dat[[name]]
}
if (is.null(names(dat$U))) names(dat$U) = paste0("Comp_", 1:length(dat$U))
meta = data.frame(names(dat$U), dat$w, stringsAsFactors=F)
colnames(meta) = c("U", "w")
tol = ${tol}
n_comp = length(meta$U[which(dat$w>tol)])
meta = head(meta[order(meta[,2], decreasing = T),], ${max_comp if max_comp > 1 else "nrow(meta)"})
message(paste(n_comp, "components out of", length(dat$w), "total components have weight greater than", tol))
res = list()
for (i in 1:n_comp) {
title = paste(meta$U[i], "w =", round(meta$w[i], 6))
res[[i]] = plot_sharing(dat$U[[meta$U[i]]], to_cor = ${"T" if to_cor else "F"},
title=title, remove_names = ${"TRUE" if remove_label else "FALSE"})
}
unit = 4
n_col = 5
n_row = ceiling(n_comp / n_col)
pdf(${_output:r}, width = unit * n_col, height = unit * n_row)
do.call(gridExtra::grid.arrange, c(res, list(ncol = n_col, nrow = n_row,
bottom = "Data source: readRDS(${_input:br})${('$'+name) if name else ''}")))
dev.off()