This notebook performs some diagnosis for problems identified from this benchmark.
%cd ~/GIT/mvarbvs/dsc_mnm
Here I load previous benchmark results and identify some scenarios where there is a problem,
out = readRDS('../data/finemap_output.20191116.rds')
res = out[,-1]
colnames(res) = c('n_traits', 'resid_method', 'missing', 'EZ_model', 'L', 'total', 'valid', 'size', 'purity', 'top_hit', 'total_true', 'total_true_included', 'overlap_var', 'overlap_cs', 'false_positive_cross_cond', 'false_negative_cross_cond', 'true_positive_cross_cond', 'elbo_converged', 'filename')
bad = res[which(res$total-res$valid>3),]
bad[which(bad$missing==TRUE & bad$EZ_model==1 & bad$n_traits==5),]
Load the first line of dataset identified,
ex = readRDS('mnm_20191116/mnm_high_het/full_data_340_high_het_1_oracle_generator_1_mnm_high_het_6.rds')
# cat(ex$DSC_DEBUG$script)
DSC_19D98699 <- dscrutils::load_inputs(c('mnm_20191116/oracle_generator/oracle_generator_1.pkl','mnm_20191116/full_data/full_data_340.rds','mnm_20191116/high_het/full_data_340_high_het_1.pkl'), dscrutils::read_dsc)
meta <- DSC_19D98699$meta
The original result from DSC is presented in a plot of PIP and CS, where true signal is in red:
true_pos = as.integer(apply(meta$true_coef, 1, sum) != 0)
susieR::susie_plot(ex$result,y='PIP', main = 'Default SuSiE plot for cross-condition PIP', xlab = 'SNP positions', add_legend = T, b=true_pos)
It seems it completely missed out the causal variant.
When analyzed with non-missing data input the result looks good (analysis changing missing_Y
to FALSE
is not shown here).
Below I use simplified code from DSC script, and use L=1
and max_iter=1
this time to make it easier to manage.
Additionally I temporarily changed the source code to:
It is verified that setting precompute_covariances
to T
or F
does not impact the result. For when there is missing data and residual variance is not diagonal, mvsusieR
forces precomputing covariances to deal with missing data specifically for off-diag elements of various quantities. But since here the residual variance is diagonal matrix the off-diag elements will all be zero anyways -- that is, missing data will not impact off-diag elements in this case.
diff --git a/R/mash_multiple_regression.R b/R/mash_multiple_regression.R
index bf50368..f012173 100644
--- a/R/mash_multiple_regression.R
+++ b/R/mash_multiple_regression.R
@@ -31,6 +31,9 @@ MashMultipleRegression <- R6Class("MashMultipleRegression",
else XtY = d$XtY
# OLS estimates
# bhat is J by R
+ print('---XtY---')
+ rownames(XtY) = NULL
+ print(head(XtY))
bhat = XtY / d$d
if (!is.null(private$precomputed_cov_matrices)) {
sbhat = private$precomputed_cov_matrices$sbhat
@@ -55,6 +58,11 @@ MashMultipleRegression <- R6Class("MashMultipleRegression",
s_alpha = matrix(0,0,0)
}
bhat[which(is.nan(bhat))] = 0
+ rownames(bhat) = NULL
+ print('---bhat---')
+ print(head(bhat))
+ print('---sbhat---')
+ print(head(sbhat))
# Fit MASH model
if (!is.null(private$precomputed_cov_matrices)) is_common_cov = private$precomputed_cov_matrices$common_sbhat
else is_common_cov = is_mat_common(sbhat)
diff --git a/R/regression_data.R b/R/regression_data.R
index b19db2e..583b1eb 100644
--- a/R/regression_data.R
+++ b/R/regression_data.R
@@ -15,6 +15,7 @@ DenseData <- R6Class("DenseData",
Y_missing = is.na(Y)
private$Y_non_missing = !Y_missing
private$.Y_has_missing = any(Y_missing)
+ private$.Y_has_missing = TRUE
private$standardize(center,scale)
private$residual = private$.Y
},
(Use git apply filename.diff
to apply the patch.)
DSC_REPLICATE <- DSC_19D98699$DSC_DEBUG$replicate
X <- DSC_19D98699$X
Y <- DSC_19D98699$Y
cfg <- DSC_19D98699$configurations
prior = cfg[[as.character(ncol(Y))]][['high_het']]
set.seed(DSC_REPLICATE)
compute_cov_diag <- function(Y){
covar <- diag(apply(Y, 2, var, na.rm=T))
return(covar)
}
create_missing <- function(Y1, Y0) {
if (ncol(Y0) < ncol(Y1)) {
for (i in 1:(ncol(Y1) - ncol(Y0))) {
Y0 = cbind(Y0, sample(Y0[, sample.int(ncol(Y0), size=1)]))
}
}
if (ncol(Y0) > ncol(Y1)) Y0 = Y0[,sample(1:ncol(Y1))]
res = Y1
res[which(is.na(Y0))] = NA
na_rows = which(apply(res, 1, function(x) all(is.na(x))))
for (i in na_rows) {
non_na = sample.int(ncol(res), size=1)
res[i,non_na] = Y1[i,non_na]
}
return(res)
}
Y_miss = create_missing(Y, meta$original_Y)
resid_Y <- compute_cov_diag(Y)
This data-set has also been saved to here. If you want to reproduce my results you can download the data-set from the link above, use attach(readRDS(...))
to attach the data to the R workspace and continue with commands below.
Use EE model and set $L=1$,
alpha = 0
L = 1
Then fit with complete data, and with missing data:
m_init = mvsusieR:::MashInitializer$new(NULL, NULL, xUlist=append(list(matrix(0,ncol(Y),ncol(Y))), prior$xUlist), prior_weights=prior$pi, null_weight=prior$null_weight, alpha=alpha, top_mixtures=-1)
r1 = mvsusieR::mvsusie(X, Y, L=L, prior_variance=m_init, residual_variance=resid_Y, compute_objective=F, estimate_residual_variance=F, estimate_prior_variance=F, precompute_covariance = F, standardize=T, max_iter=1)
m_init = mvsusieR:::MashInitializer$new(NULL, NULL, xUlist=append(list(matrix(0,ncol(Y),ncol(Y))), prior$xUlist), prior_weights=prior$pi, null_weight=prior$null_weight, alpha=alpha, top_mixtures=-1)
r2 = mvsusieR::mvsusie(X, Y_miss, L=L, prior_variance=m_init, residual_variance=resid_Y, compute_objective=F, estimate_residual_variance=F, estimate_prior_variance=F, precompute_covariance = F, standardize=T, max_iter=1)
To plot and compare results,
susieR::susie_plot(r1, y='PIP', main = 'No missing data', xlab = 'SNP positions', add_legend = T, b=true_pos)
susieR::susie_plot(r2,y='PIP', main = 'With missing data', xlab = 'SNP positions', add_legend = T, b=true_pos)
Problem reproduced with L=1
. I would expect for such case after randomly removing what happens to be important information in data, there should be a power loss and we should not detect anything. But here, rather, it reports a false signal that we don't like.
For complete data,
m_init = mvsusieR:::MashInitializer$new(NULL, NULL, xUlist=append(list(matrix(0,ncol(Y),ncol(Y))), prior$xUlist), prior_weights=prior$pi, null_weight=prior$null_weight, alpha=alpha, top_mixtures=-1)
m1 = mvsusieR:::MashMultipleRegression$new(ncol(X), resid_Y, m_init)
m1$fit(mvsusieR:::DenseData$new(X, Y), save_summary_stats=TRUE)
max(abs(m1$bhat/m1$sbhat))
which.max(apply(abs(m1$bhat/m1$sbhat), 1, max))
For missing data,
m_init = mvsusieR:::MashInitializer$new(NULL, NULL, xUlist=append(list(matrix(0,ncol(Y),ncol(Y))), prior$xUlist), prior_weights=prior$pi, null_weight=prior$null_weight, alpha=alpha, top_mixtures=-1)
m2 = mvsusieR:::MashMultipleRegression$new(ncol(X), resid_Y, m_init)
m2$fit(mvsusieR:::DenseData$new(X, Y_miss), save_summary_stats=TRUE)
max(abs(m2$bhat/m2$sbhat))
The max z-score is 253 -- very problematic.
head(m2$bhat/m2$sbhat, 10)
The 4th column (4th condition) has large z-scores. This is consistent with observed difference in $X^TY$ between missing and non-missing scenarios as shown in the debug log above. Removing the 4th condition:
max(abs(m2$bhat/m2$sbhat)[,-4])
To find which row it is in:
which.max(apply(abs(m2$bhat/m2$sbhat)[,-4], 1, max))
This is indeed the true causal SNP.
Now checkout the logBF:
plot(m1$lbf)
plot(m2$lbf)
The lbf here are not under the null. They are insanely large! In particular the Log BF for the false positive variable in the presense of missing data is a lot higher than where the simulated signal lies, thus the false positive result.
This must be a very obvious bug because the marginal statistics is even wrong here with missing data. It is easy to show that with susieR::univariate_regression
function the result for missing data is very different from what I got here above.
Missingness across samples:
summary(apply(Y_miss,1,function(i) sum(is.na(i)))/ ncol(Y_miss))
Missingness across conditions:
apply(Y_miss,2,function(i) sum(is.na(i))) / nrow(Y_miss)
Precomputed quantities with this data-set, compared with complete data:
m1 = mvsusieR:::MashInitializer$new(NULL, NULL, xUlist=append(list(matrix(0,ncol(Y),ncol(Y))), prior$xUlist), prior_weights=prior$pi, null_weight=prior$null_weight, alpha=alpha, top_mixtures=-1)
m1$precompute_cov_matrices(mvsusieR:::DenseData$new(X, Y_miss), resid_Y)
m2 = mvsusieR:::MashInitializer$new(NULL, NULL, xUlist=append(list(matrix(0,ncol(Y),ncol(Y))), prior$xUlist), prior_weights=prior$pi, null_weight=prior$null_weight, alpha=alpha, top_mixtures=-1)
m2$precompute_cov_matrices(mvsusieR:::DenseData$new(X, Y), resid_Y)
matrix(m1$precomputed$Vinv, ncol(Y), ncol(Y))
matrix(m2$precomputed$Vinv, ncol(Y), ncol(Y))
matrix(m1$precomputed$U0, ncol(Y), 2*ncol(Y))
matrix(m2$precomputed$U0, ncol(Y), 2*ncol(Y))
matrix(m1$precomputed$sigma_rooti, ncol(Y), 2*ncol(Y))
matrix(m2$precomputed$sigma_rooti, ncol(Y), 2*ncol(Y))
head(m1$precomputed$sbhat)
head(m2$precomputed$sbhat)
The outcome are largely comparable.