DAP and CAVIAR is performed on some of the same input data and is compared with susie
L = 1.
This notebook was copied from this one but changed it to using the susie
L = 1 flavor, the fit_susie01
module.
For the case of N = 1, using L = 1 increased correlation between CAVIAR from 97% to 99%. But the 3 ourlier cases are still there.
FIXME another thing we found is that for case of L = 1, the FDR rate is mostly the same as before, though of course power is lower. This makes sense but needs more thought to make sure I really understand what is going on.
%revisions -s
Here I consider the following scenarios:
For CAVIAR I only try and report 1~3 causal scenarios.
The plan is to get the PIP for those in susieR mappable CS (purity > 0.2 or 0.25), and compare these PIP to what DAP and CAVIAR reports. For susie flavors:
This setting of susie should reflect its best performance. Additionally I check both the PIP computed before purity filter, and that after purity filter.
Previously I've ran this specific DSC using:
dsc susie.dsc --target run_comparison -o susie_comparison
So here I query from that result.
[global]
cwd = path('~/GIT/github/mvarbvs/dsc')
dirname = path(f'{cwd:a}/susie_comparison/')
ld_col = 1
susie_prior = 0.1
#susie_prior = 0.05
[pip_1, fdr_1]
output: f'{dirname}/PIP_comparison_0530.rds'
R: expand = '${ }', workdir = cwd
dap_out = dscrutils::dscquery(${dirname:br},
target = "liter_data.dataset lm_less lm_less.pve lm_less.n_signal fit_dap plot_dap",
load.pkl = F)
susie_out = dscrutils::dscquery(${dirname:br},
target = "liter_data.dataset lm_less lm_less.pve lm_less.n_signal fit_susie01.prior_var fit_susie01.estimate_residual_variance fit_susie01 plot_susie",
load.pkl = F)
caviar_out = dscrutils::dscquery(${dirname:br},
target = "liter_data.dataset lm_less lm_less.pve lm_less.n_signal fit_caviar.args fit_caviar plot_caviar",
load.pkl = F)
saveRDS(list(dap=dap_out, susie=susie_out, caviar=caviar_out), ${_output:r})
[pip_2]
pip_after_filter = ['FALSE', 'TRUE']
ld_cutoff = [0,0.2]
input: for_each = ['pip_after_filter', 'ld_cutoff'], concurrent = True
output: paths([f'{_input:n}.{x}_filter_{_pip_after_filter.lower()}_{str(_ld_cutoff).replace(".", "p")}.png' for x in ['susie_dap', 'susie_caviar', 'dap_caviar']])
R: stdout = f'{_output[0]:n}.log', expand = '${ }', workdir = cwd
ld_col = ${ld_col}
ld_cutoff = ${_ld_cutoff}
pip_cutoff = 0
dat = readRDS(${_input:r})
dap_out = dat$dap
caviar_out = dat$caviar
susie_out = dat$susie
# favorit susie flavor
susie_out = susie_out[which(susie_out$fit_susie01.prior_var == ${susie_prior} & susie_out$fit_susie01.estimate_residual_variance == FALSE), ]
susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie01.prior_var, fit_susie01.estimate_residual_variance))
data_sets = unique(susie_out$liter_data.dataset)
signals = unique(susie_out$lm_less.n_signal)
result = list()
for (s in signals) {
result[[as.character(s)]] = NULL
if (s > 3) {
has_caviar = FALSE
} else {
has_caviar = TRUE
}
print(paste('==============', s, '=============='))
for (d in data_sets) {
out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d),c("fit_susie01.output.file", "plot_susie.output.file", "lm_less.output.file")]
fit = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
purity = readRDS(paste0(${dirname:r}, '/', out_files[1,2], '.rds'))
truth = readRDS(paste0(${dirname:r}, '/', out_files[1,3], '.rds'))$data$true_coef
signals = which(truth[,1]!=0)
if (${_pip_after_filter}) {
alpha = fit$alpha[[1]][which(purity$purity$V1[,ld_col] > ld_cutoff),,drop=FALSE]
} else {
alpha = fit$alpha[[1]]
}
pip = t(1 - apply(1 - alpha, 2, prod))
in_CI_raw = fit$in_CI[[1]]
in_CI_raw = in_CI_raw[which(purity$purity$V1[,ld_col] > ld_cutoff),,drop=FALSE]
in_CI = which(colSums(in_CI_raw) > 0)
pip = pip[in_CI]
out_files = dap_out[which(dap_out$lm_less.n_signal == s & dap_out$liter_data.dataset == d),c("fit_dap.output.file"),drop=FALSE]
dap = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
dap = dap$V0$snp
#print(head(dap, length(pip)))
dap = dap[which(dap$snp %in% as.character(in_CI)),]
dap = dap[match(in_CI, dap$snp),]
#print(dap)
#print(pip)
#print(in_CI)
if (has_caviar) {
out_files = caviar_out[which(caviar_out$lm_less.n_signal == s & caviar_out$liter_data.dataset == d & caviar_out$fit_caviar.args == paste('-c', s)),c("fit_caviar.output.file"),drop=FALSE]
caviar = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
caviar = caviar[[1]]$snp
caviar = caviar[which(caviar$snp %in% as.character(in_CI)),]
caviar = caviar[match(in_CI, caviar$snp),]
pip = cbind(pip, as.vector(dap$snp_prob), as.vector(caviar$snp_prob), in_CI %in% signals)
} else {
pip = cbind(pip, as.vector(dap$snp_prob), in_CI %in% signals)
}
## BEGIN debug
outlier = pip[which(pip[,1] < 0.2 & pip[,2]>0.9), ,drop=F]
if (nrow(outlier)>0 && s == 1) {
print("DAP outlier")
print(d)
}
if (has_caviar && s == 1) {
conflict = pip[which(pip[,1] < 0.95 & pip[,3] > 0.95), ,drop=F]
if (nrow(conflict) > 0) {
print("CAVIAR-susie conflict")
print(d)
print("CAVIAR")
print(caviar[which(caviar$snp_prob>0.95),])
print("susie")
print(purity$purity$V1[,ld_col])
print(rowSums(in_CI_raw))
}
}
## END debug
if (is.null(result[[as.character(s)]])) {
result[[as.character(s)]] = pip
} else {
result[[as.character(s)]] = rbind(result[[as.character(s)]], pip)
}
}
result[[as.character(s)]] = data.frame(result[[as.character(s)]])
if (has_caviar) {
colnames(result[[as.character(s)]]) = c('susie', 'dap', 'caviar', 'is_signal')
} else {
colnames(result[[as.character(s)]]) = c('susie', 'dap', 'is_signal')
}
}
# susie vs dap
png(${_output[0]:r}, 600, 800)
#par(mar=c(.5,.5,.5,.5))
par(mfrow=c(3, 2))
for (i in 1:5) {
i = as.character(i)
x = result[[i]][result[[i]]$susie > pip_cutoff & result[[i]]$dap > pip_cutoff,]
colors = sapply(1:length(x$is_signal), function(i) ifelse(x$is_signal[i],'#800000','#002b36'))
plot(x$susie, x$dap, xlab = paste('PIP susie >', pip_cutoff), ylab = paste('PIP DAP >', pip_cutoff),
main = paste('num. causal:', i, '\ncor:', round(cor(x)[1,2],2)),
col = colors, pch = 20, cex = 1.5)
abline(0,1,col=2)
abline(h=0.95, col='gray')
abline(v=0.95, col='gray')
}
dev.off()
# susie vs caviar
png(${_output[1]:r}, 600, 800)
#par(mar=c(.5,.5,.5,.5))
par(mfrow=c(2, 2))
for (i in 1:3) {
i = as.character(i)
x = result[[i]][result[[i]]$susie > pip_cutoff & result[[i]]$caviar > pip_cutoff,]
colors = sapply(1:length(x$is_signal), function(i) ifelse(x$is_signal[i],'#800000','#002b36'))
plot(x$susie, x$caviar, xlab = paste('PIP susie >', pip_cutoff), ylab = paste('PIP CAVIAR >', pip_cutoff),
main = paste('num. causal:', i, '\ncor:', round(cor(x)[1,2],2)),
col = colors, pch = 20, cex = 1.5)
abline(0,1,col=2)
abline(h=0.95, col='gray')
abline(v=0.95, col='gray')
}
dev.off()
# dap vs caviar
png(${_output[2]:r}, 600, 800)
#par(mar=c(.5,.5,.5,.5))
par(mfrow=c(2, 2))
for (i in 1:3) {
i = as.character(i)
x = result[[i]][result[[i]]$dap > pip_cutoff & result[[i]]$caviar > pip_cutoff,]
colors = sapply(1:length(x$is_signal), function(i) ifelse(x$is_signal[i],'#800000','#002b36'))
plot(x$dap, x$caviar, xlab = paste('PIP DAP >', pip_cutoff), ylab = paste('PIP CAVIAR >', pip_cutoff),
main = paste('num. causal:', i, '\ncor:', round(cor(x)[1,2],2)),
col = colors, pch = 20, cex = 1.5)
abline(0,1,col=2)
abline(h=0.95, col='gray')
abline(v=0.95, col='gray')
}
dev.off()
[fdr_2]
# computer fdr
# to match with DAP -ld_control 0.25
ld_cutoff = 0.25
# to match with susie 95% mappable CS, we set dap cutoff to 0.95 also
dap_cluster_cutoff = [('cluster_prob', 0.95), ('cluster_avg_r2', 0.25)]
input: for_each = 'dap_cluster_cutoff', group_by = 1, concurrent = True
output: f'{dirname}/FDR_comparison_0530_{_dap_cluster_cutoff[0]}.rds'
R: stdout = f'{_output:n}.log', expand = '${ }', workdir = cwd
ld_col = ${ld_col}
ld_cutoff = ${ld_cutoff}
dat = readRDS(${_input:r})
dap_out = dat$dap
susie_out = dat$susie
# favorit susie flavor
susie_out = susie_out[which(susie_out$fit_susie01.prior_var == ${susie_prior} & susie_out$fit_susie01.estimate_residual_variance == FALSE), ]
susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie01.prior_var, fit_susie01.estimate_residual_variance))
data_sets = unique(susie_out$liter_data.dataset)
signals = unique(susie_out$lm_less.n_signal)
result = NULL
for (s in signals) {
susie_tdc = 0
dap_tdc = 0
susie_dc = 0
dap_dc = 0
susie_tc = 0
dap_tc = 0
for (d in data_sets) {
out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d), c("fit_susie01.output.file", "plot_susie.output.file", "lm_less.output.file")]
fit = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
purity = readRDS(paste0(${dirname:r}, '/', out_files[1,2], '.rds'))
truth = readRDS(paste0(${dirname:r}, '/', out_files[1,3], '.rds'))$data$true_coef
signals = which(truth[,1]!=0)
# susie in CS
susie_cs = fit$in_CI[[1]]
susie_cs_raw = susie_cs[which(purity$purity$V1[,ld_col] > ld_cutoff),,drop=FALSE]
susie_cs = list()
if (nrow(susie_cs_raw) > 0) {
for (i in 1:nrow(susie_cs_raw)) {
susie_cs[[i]] = which(susie_cs_raw[i,] > 0)
if (length(susie_cs[[i]]) == 0) {
susie_tc = susie_tc - 1
next
}
if (any(signals %in% susie_cs[[i]])) {
susie_tdc = susie_tdc + 1
susie_dc = susie_dc + 1
}
}
}
print(paste('==============', s, '=============='))
print(susie_cs)
susie_tc = susie_tc + length(susie_cs)
# DAP in cluster
out_files = dap_out[which(dap_out$lm_less.n_signal == s & dap_out$liter_data.dataset == d),c("fit_dap.output.file"),drop=FALSE]
dap = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
dap_cluster_raw = dap$V0$set[which(dap$V0$set$${_dap_cluster_cutoff[0]} > ${_dap_cluster_cutoff[1]}), ]$snp
dap_cluster = list()
if (length(dap_cluster_raw) > 0) {
for (i in 1:length(dap_cluster_raw)) {
dap_cluster[[i]] = as.integer(unlist(strsplit(dap_cluster_raw[i], ",")))
if (any(signals %in% dap_cluster[[i]])) {
dap_tdc = dap_tdc + 1
dap_dc = dap_dc + 1
}
}
}
print(dap_cluster)
dap_tc = dap_tc + length(dap_cluster)
## BEGIN debug
## susie made more true discovery than DAP
if (susie_dc > dap_dc) {
print('DAP miss')
print(dap$V0$set)
print(d)
}
## DAP made some (false) discovery, susie did not
## under n = 1
if (length(dap_cluster) > dap_dc && s == 1) {
print('DAP false discovery')
print(dap$V0$set)
print(d)
}
## END debug
susie_dc = 0
dap_dc = 0
}
rates = c(s, s* 50, susie_tc, dap_tc, susie_tdc/susie_tc, dap_tdc/dap_tc, 1 - (susie_tdc/susie_tc), 1 - (dap_tdc/dap_tc))
if (is.null(result)) {
result = rates
} else {
result = rbind(result, rates)
}
}
colnames(result) = c('n_signal', 'expected_discoveries', 'susie_discoveries', 'dap_discoveries', 'susie_tdr', 'dap_tdr', 'susie_fdr', 'dap_fdr')
rownames(result) = as.character(result[,1])
saveRDS(data.frame(result), ${_output:r})
%preview ~/GIT/github/mvarbvs/dsc/susie_comparison/PIP_comparison_0530.susie_dap_filter_true_0.png
%preview ~/GIT/github/mvarbvs/dsc/susie_comparison/PIP_comparison_0530.susie_dap_filter_false_0.png
%preview ~/GIT/github/mvarbvs/dsc/susie_comparison/PIP_comparison_0530.susie_caviar_filter_true_0.png
%preview ~/GIT/github/mvarbvs/dsc/susie_comparison/PIP_comparison_0530.dap_caviar_filter_true_0.png
Same situation as before.
DAP cluster is filtered by 95% cluster_prob
%cd ~/GIT/github/mvarbvs/dsc
readRDS('susie_comparison/FDR_comparison_0530_cluster_prob.rds')
Okay then we know what happens when L is too smaller ... The tdr rate did not change much, though. This is interesting.