M&M benchmark VIII¶

This benchmark uses the latest GTEx V8 genotype data and evaluates the pipeline in the presence of missing data.

The number of conditions is increased to $R=45$.
Missing data in expression are simulated according to the missingness pattern in the actual expression data across tissues. Both the flashier::flash method and a simple diagonal method were used to compute the covariance of the response, which is then used as the residual covariance.

Conclusion¶

Our pipeline with missing data has high false positive rates even though the simulated residual correlation is diagonal.
When the underlying pattern of residual covariance is diagonal, the FLASH-based method suffers from quite a bit of power loss, as shown in the simulations without missing data.

Next steps for this investigation¶

Figure out the problem (hopefully a bug) with missing data handling in mvsusieR.
- An obvious thing to do is to add more unit tests for missing data although we already have a couple of unit tests for it. But hopefully more tests can catch something obvious.
Add a diagnostic function to compute the correlation between CS.

The benchmark is now under dsc_mnm, running on UChicago RCC midway

./finemap.dsc --host mnm_dsc.yaml

This executes the default pipeline in finemap.dsc file, as of today (2019.11.08).

%cd ~/GIT/mvarbvs/dsc_mnm

/project2/mstephens/gaow/mvarbvs/dsc_mnm

# Query the DSC results stored under `finemap_output` and time the query.
# The targets are the simulation module, the two mnm settings
# (`resid_method`, `missing_Y`) and the susie_scores summaries used in the
# analysis that follows.
start_time <- Sys.time()
out <- dscrutils::dscquery(
  "finemap_output",
  targets = c(
    "simulate",
    "mnm.resid_method",
    "mnm.missing_Y",
    "susie_scores.total",
    "susie_scores.valid",
    "susie_scores.size",
    "susie_scores.purity",
    "susie_scores.top",
    "susie_scores.n_causal",
    "susie_scores.included_causal",
    "susie_scores.overlap",
    "susie_scores.false_pos_cond_discoveries",
    "susie_scores.false_neg_cond_discoveries",
    "susie_scores.true_cond_discoveries"
  ),
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  verbose = FALSE
)
end_time <- Sys.time()

# Elapsed wall-clock time of the query.
end_time - start_time

Time difference of 14.60613 secs

# Preview the first rows of the query result.
head(out, n = 6L)

# Number of rows and columns in the query result.
dim(out)

# Save the raw query result to disk as an RDS file.
saveRDS(out, file = "../data/finemap_output.20191108.rds")

# Drop the first two columns of the query result and give the remaining
# thirteen columns the short names used by the summaries below.
res <- setNames(
  out[, -(1:2)],
  c(
    "resid_method", "missing", "total", "valid", "size", "purity",
    "top_hit", "total_true", "total_true_included", "overlap",
    "false_positive_cross_cond", "false_negative_cross_cond",
    "true_positive_cross_cond"
  )
)

Purity of CS¶

Yes, purity is higher with missing data for the flash method (it is slightly lower for diag) --- but that is because many of those CS are false positives! (see below)

# Mean CS purity for each combination of residual method and missingness.
purity <- aggregate(
  purity ~ resid_method + missing,
  data = res,
  FUN = mean
)
purity

Size of CS¶

# Median CS size for each combination of residual method and missingness.
size <- aggregate(
  size ~ resid_method + missing,
  data = res,
  FUN = median
)
size

Power of CS¶

Notice here that many CS overlap -- this is not what was observed with $R=5$.

# Per (resid_method, missing) group: sum the causal signals captured by a CS
# and the total causal signals, and average the overlap score.
total_true_included <- aggregate(
  total_true_included ~ resid_method + missing,
  data = res,
  FUN = sum
)
total_true <- aggregate(
  total_true ~ resid_method + missing,
  data = res,
  FUN = sum
)
overlap <- aggregate(
  overlap ~ resid_method + missing,
  data = res,
  FUN = mean
)

# Join the three summaries on the grouping columns, left to right.
power <- Reduce(
  function(left, right) merge(left, right, by = c("resid_method", "missing")),
  list(total_true_included, total_true, overlap)
)

# Power = captured causal signals / all causal signals.
power$power <- with(power, total_true_included / total_true)
power <- power[order(power$missing), ]
power

FDR of CS¶

The high FDR explains the seemingly high power, and is consistent with the observations that CS are "purer".

# Per (resid_method, missing) group: sum the valid CS and the total CS.
valid <- aggregate(
  valid ~ resid_method + missing,
  data = res,
  FUN = sum
)
total <- aggregate(
  total ~ resid_method + missing,
  data = res,
  FUN = sum
)
fdr <- merge(valid, total, by = c("resid_method", "missing"))

# FDR = share of reported CS that are not valid.
fdr$fdr <- with(fdr, (total - valid) / total)
fdr <- fdr[order(fdr$missing), ]
fdr

Power for per signal per condition estimates¶

We compute lfsr on a per-signal, per-condition basis. We call it a signal in a condition if its lfsr is smaller than 0.05.

# Per (resid_method, missing) group: sum the per-condition true positives
# and false negatives.
tp <- aggregate(
  true_positive_cross_cond ~ resid_method + missing,
  data = res,
  FUN = sum
)
fn <- aggregate(
  false_negative_cross_cond ~ resid_method + missing,
  data = res,
  FUN = sum
)
power <- merge(tp, fn, by = c("resid_method", "missing"))

# Power = TP / (TP + FN).
power$power <- with(
  power,
  true_positive_cross_cond /
    (true_positive_cross_cond + false_negative_cross_cond)
)
power <- power[order(power$missing), ]
power

FDR for per signal per condition estimates¶

# Per (resid_method, missing) group: sum the per-condition true positives
# and false positives.
tp <- aggregate(
  true_positive_cross_cond ~ resid_method + missing,
  data = res,
  FUN = sum
)
fp <- aggregate(
  false_positive_cross_cond ~ resid_method + missing,
  data = res,
  FUN = sum
)
fdr <- merge(tp, fp, by = c("resid_method", "missing"))

# FDR = FP / (TP + FP).
fdr$fdr <- with(
  fdr,
  false_positive_cross_cond /
    (true_positive_cross_cond + false_positive_cross_cond)
)
fdr <- fdr[order(fdr$missing), ]
fdr

DSC	simulate	mnm.resid_method	mnm.missing_Y	susie_scores.total	susie_scores.valid	susie_scores.size	susie_scores.purity	susie_scores.top	susie_scores.n_causal	susie_scores.included_causal	susie_scores.overlap	susie_scores.false_pos_cond_discoveries	susie_scores.false_neg_cond_discoveries	susie_scores.true_cond_discoveries
1	mid_het	flash	TRUE	2	2	23.5	0.8482883	2	1	1	23	0	4	86
1	mid_het	flash	TRUE	1	1	13.0	0.9527061	1	3	2	0	0	8	37
1	mid_het	flash	TRUE	6	1	2.0	0.9974367	0	1	1	0	16	216	38
1	mid_het	flash	TRUE	5	5	37.0	0.9691339	0	2	2	125	0	57	168
1	mid_het	flash	TRUE	1	1	20.0	0.9178380	0	2	1	0	0	1	44
1	mid_het	flash	TRUE	1	1	12.0	0.9302628	0	3	1	0	0	6	39

resid_method	missing	purity
diag	FALSE	0.9670738
flash	FALSE	0.6914748
diag	TRUE	0.9349031
flash	TRUE	0.8479686

resid_method	missing	size
diag	FALSE	6.00
flash	FALSE	7.00
diag	TRUE	7.75
flash	TRUE	7.25

	resid_method	missing	total_true_included	total_true	overlap	power
1	diag	FALSE	158	162	194.40	0.9753086
3	flash	FALSE	116	162	103.48	0.7160494
2	diag	TRUE	140	162	242.23	0.8641975
4	flash	TRUE	112	162	119.73	0.6913580

	resid_method	missing	valid	total	fdr
1	diag	FALSE	320	320	0.000000000
3	flash	FALSE	169	170	0.005882353
2	diag	TRUE	281	369	0.238482385
4	flash	TRUE	190	284	0.330985915

	resid_method	missing	true_positive_cross_cond	false_negative_cross_cond	power
1	diag	FALSE	8871	5529	0.6160417
3	flash	FALSE	4467	3176	0.5844564
2	diag	TRUE	4311	10076	0.2996455
4	flash	TRUE	3613	7168	0.3351266