This benchmark uses the latest GTEx V8 genotype data and evaluated the pipeline in the presence of missing data.
flashier::flash
methods and simple diagonal methods were used to compute covariance of response to use as residual covariance.mvsusieR
. The benchmark is now under dsc_mnm
, running on UChicago RCC midway
./finemap.dsc --host mnm_dsc.yaml
This executes the default
pipeline in finemap.dsc
file, as of today (2019.11.08).
%cd ~/GIT/mvarbvs/dsc_mnm
start_time <- Sys.time()
out = dscrutils::dscquery('finemap_output', targets = c('simulate', 'mnm.resid_method', 'mnm.missing_Y', 'susie_scores.total', 'susie_scores.valid', 'susie_scores.size', 'susie_scores.purity', 'susie_scores.top', 'susie_scores.n_causal', 'susie_scores.included_causal', 'susie_scores.overlap', 'susie_scores.false_pos_cond_discoveries', 'susie_scores.false_neg_cond_discoveries', 'susie_scores.true_cond_discoveries'), verbose = F)
end_time <- Sys.time()
end_time - start_time
head(out)
dim(out)
saveRDS(out, '../data/finemap_output.20191108.rds')
res = out[,-c(1,2)]
colnames(res) = c('resid_method', 'missing', 'total', 'valid', 'size', 'purity', 'top_hit', 'total_true', 'total_true_included', 'overlap', 'false_positive_cross_cond', 'false_negative_cross_cond', 'true_positive_cross_cond')
Yes purity is higher with missing data --- but because many of those CS are false positives! (see below)
purity = aggregate(purity~resid_method + missing, res, mean)
purity
size = aggregate(size~resid_method+missing, res, median)
size
Notice here that many CS overlap -- this is not what was observed with $R=5$.
total_true_included = aggregate(total_true_included ~ resid_method + missing, res, sum)
total_true = aggregate(total_true ~ resid_method + missing, res, sum)
overlap = aggregate(overlap ~ resid_method + missing, res, mean)
power = merge(total_true_included, total_true, by = c("resid_method", "missing"))
power = merge(power, overlap, by = c("resid_method", "missing"))
power$power = power$total_true_included/power$total_true
power = power[order(power$missing),]
power
The high FDR explains the seemingly high power, and is consistent with the observations that CS are "purer".
valid = aggregate(valid ~ resid_method + missing, res, sum)
total = aggregate(total ~ resid_method + missing, res, sum)
fdr = merge(valid, total, by = c("resid_method", "missing"))
fdr$fdr = (fdr$total - fdr$valid)/fdr$total
fdr = fdr[order(fdr$missing),]
fdr
We compute lfsr on per signal per condition basis. We call it a signal in the condition if lfsr is smaller than 0.05.
tp = aggregate(true_positive_cross_cond ~ resid_method + missing, res, sum)
fn = aggregate(false_negative_cross_cond ~ resid_method + missing, res, sum)
power = merge(tp, fn, by = c("resid_method", "missing"))
power$power = power$true_positive_cross_cond/(power$true_positive_cross_cond + power$false_negative_cross_cond)
power = power[order(power$missing),]
power
tp = aggregate(true_positive_cross_cond ~ resid_method + missing, res, sum)
fp = aggregate(false_positive_cross_cond ~ resid_method + missing, res, sum)
fdr = merge(tp, fp, by = c("resid_method", "missing"))
fdr$fdr = fdr$false_positive_cross_cond/(fdr$true_positive_cross_cond + fdr$false_positive_cross_cond)
fdr = fdr[order(fdr$missing),]
fdr