A first (comprehensive) set of simulations to learn properties of the new fine-mapping method.
%revisions -s -n 10
As a first pass, we run susie
on simulations of 100 genes, with
We characterize 95% confident sets (CS) it produces in terms of
and, to compare with other methods
We run Susie with
estimate_residual_variance
: TRUE and FALSEprior_variance
: 5-40%We should expect that results will be reasonably robust to these choices.
%cd ~/GIT/github/mvarbvs/dsc
dsc susie.dsc --target run_susie -c 20
get_combined = function(sub, dirname, ld_col, ld_cutoff = 0.2, lfsr_cutoff = 0.05) {
out_files = sub[,c("fit_susie.output.file", "plot_susie.output.file")]
combined = list(purity = NULL, lfsr = NULL, size = NULL,
captures = NULL, total_captures = NULL, pip = NULL)
for (i in 1:nrow(out_files)) {
fit = readRDS(paste0(dirname, out_files[i,1], '.rds'))$posterior
purity = readRDS(paste0(dirname, out_files[i,2], '.rds'))
#
if (is.null(combined$purity)) combined$purity = purity$purity$V1[,ld_col]
else combined$purity = cbind(combined$purity, purity$purity$V1[,ld_col])
#
if (is.null(combined$size)) combined$size = fit$n_in_CI[,1]
else combined$size = cbind(combined$size, fit$n_in_CI[,1])
#
if (is.null(combined$lfsr)) combined$lfsr = fit$set_lfsr[,1]
else combined$lfsr = cbind(combined$lfsr, fit$set_lfsr[,1])
#
alpha = fit$alpha[which(purity$purity$V1[,ld_col] > ld_cutoff),,drop=FALSE]
pip = t(1 - apply(1 - alpha, 2, prod))
if (is.null(combined$pip)) combined$pip = pip
else combined$pip = cbind(combined$pip, pip)
#
if (is.null(combined$captures)) combined$captures = rowSums(purity$signal$V1)
else combined$captures = cbind(combined$captures, rowSums(purity$signal$V1))
#
detected = apply(t(purity$signal$V1[which(fit$set_lfsr[,1] < lfsr_cutoff),,drop=FALSE]), 1, sum)
if (is.null(combined$total_captures)) combined$total_captures = detected
else combined$total_captures = combined$total_captures + detected
}
return(combined)
}
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='ticks', context='talk')
COLORS = ['#348ABD', '#7A68A6', '#A60628', '#467821', '#FF0000', '#188487', '#E2A233',
'#A9A9A9', '#000000', '#FF00FF', '#FFD700', '#ADFF2F', '#00FFFF']
color_mapper = np.vectorize(lambda x: dict([(i,j) for i,j in enumerate(COLORS)]).get(x))
def plot_purity(data, output, lfsr_cutoff = 0.05):
purity = np.array(data['purity'])
lfsr = np.array(data['lfsr'])
size = np.array(data['size'])
capture = np.array(data['captures'])
capture_summary = [f"Signal {idx+1} captured {item}/{purity.shape[1]} times" for idx, item in enumerate(data['total_captures'])]
idx = 0
plt.figure(figsize=(12, 8))
L = purity.shape[0]
cols = 3
rows = L // cols + L % cols
position = range(1,L + 1)
for x, y, z, c in zip(size, purity, lfsr, capture):
z_sig = [i for i, zz in enumerate(z) if zz <= lfsr_cutoff]
z_nsig = [i for i, zz in enumerate(z) if zz > lfsr_cutoff]
colors = [4 if i == 0 else 0 for i in c]
plt.subplot(rows,cols,position[idx])
idx += 1
if len(z_sig):
label = f'{idx}: lfsr<={lfsr_cutoff}'
plt.scatter(np.take(x, z_sig),
np.take(y, z_sig),
c = color_mapper(np.take(colors, z_sig)),
label = label, marker = '*')
if len(z_nsig):
label = f'{idx}: lfsr>{lfsr_cutoff}'
plt.scatter(np.take(x, z_nsig),
np.take(y, z_nsig),
c = color_mapper(np.take(colors, z_nsig)),
label = label, marker = 'x')
plt.legend(bbox_to_anchor=(0,1.02,1,0.2), loc="lower left",
mode="expand", borderaxespad=0, ncol=2, handletextpad=0.1)
plt.subplots_adjust(hspace=0.3, wspace = 0.3)
plt.suptitle(f"95% CI set sizes vs min(abs(LD))\n{'; '.join(capture_summary)}")
plt.savefig(output, dpi=500, bbox_inches='tight')
plt.gca()
out = dscrutils::dscquery('benchmark',
target = "liter_data.dataset simple_lm.pve simple_lm.n_signal fit_susie.estimate_residual_variance fit_susie.prior_var fit_susie plot_susie")
head(out)
dim(out)
It is a lot of results to look at. Here we focus on PVE 20% and 40%, having 2, 3, 5 signals, with prior set to 0.2
. We focus on measuring "purity" by min(abs(LD)).
est_residual=FALSE
¶pve = 0.2
n = 2
est_res = FALSE
prior = 0.2
ld_col = 1 # LD_Min
lfsr_cutoff = 0.05
dirname = 'benchmark/'
sub = out[which(out$simple_lm.pve == pve & out$simple_lm.n_signal == n & out$fit_susie.estimate_residual_variance == est_res & out$fit_susie.prior_var == prior),]
combined = get_combined(sub, dirname, ld_col)
combined
saveRDS(combined, '../data/20180516_susie_bm.rds')
dsc-io ../data/20180516_susie_bm.rds ../data/20180516_susie_bm.pkl
import pickle
data = pickle.load(open('../data/20180516_susie_bm.pkl', 'rb'))
plot_purity(data, '2_p2pve_false.png')