%revisions -s -n 10
%cd ~/GIT/github/mvarbvs/dsc/susie_comparison/fit_susie
[global]
cwd = path('~/GIT/github/mvarbvs/dsc')
dirname = path(f'{cwd:a}/susie_comparison/')
parameter: name = '20180710'
Here I'd like to pick up ideally examples that has enough power for susie to detect 3 simulated signals, at non-trivial yet reasonable number of iterations, size of sets and purity levels, for illustrating how susie works. In other words this is meant to pick up good susie show cases, not edge cases.
[A]
num_causal = 3
input: glob.glob(f'liter_data_*_summarize_ld_1_lm_less_{num_causal}_fit_susie_*.rds')[:200], group_by = 'single'
R: expand = '${ }'
res = readRDS(${_input:r})
for (r in 1:2) {
cs = susieR::susie_get_CS(res$posterior$alpha[[r]])$cs[1:${num_causal}]
if (length(cs[[${num_causal}]]) < 20) {
print(${_input:r})
print(r)
print(res$posterior$niter[r])
print(cs)
print("========")
}
}
I ended up with liter_data_65_summarize_ld_1_lm_less_3_fit_susie_7.rds
the first data-set, which seems interesting:
[[1]]
[1] 773 777
[[2]]
[1] 360 361 362 365 368 372 373 374 379 381 383 384 386 387 388 389 391 392 396
[20] 397 398 399 400 401 403 404 405 407 408 415
[[3]]
[1] 653
takes 6 iterations to complete.
dat = readRDS('liter_data_65_summarize_ld_1_lm_less_3_fit_susie_7.rds')
names(dat)
From dat$DSC_DEBUG$script
I load the dataset of interest:
input <- dscrutils::load_inputs(c('../lm_less/liter_data_65_summarize_ld_1_lm_less_3.pkl'), dscrutils::read_dsc)
names(input)
Now I compute summary stats and save it.
library(abind)
mm_regression = function(X, Y, Z=NULL) {
if (!is.null(Z)) {
Z = as.matrix(Z)
}
reg = lapply(seq_len(ncol(Y)), function (i) simplify2array(susieR:::univariate_regression(X, Y[,i], Z)))
reg = do.call(abind, c(reg, list(along=0)))
# return array: out[1,,] is betahat, out[2,,] is shat
return(aperm(reg, c(3,2,1)))
}
sumstats = mm_regression(as.matrix(input$data$X), as.matrix(input$data$Y))
Since the data is private I may have to remove column and row names from data matrices:
names(input$data)
Okay after checking the data details there is nothing confidential to hide. We should be good.
saveRDS(list(data=input$data, sumstats = sumstats), '~/GIT/software/susieR/inst/data/N3finemapping.rds')
We hope to see and show that SuSiE can deal with this reasonably difficult case where in the 2 CS setting the top z-score is result of contribution from both CS -- that is, a SNP is in weak LD between both of 2 CS, thus showing strongest z-score in univariate analysis but weak PIP and not in any CS. Need information on:
[B_1]
target_susie = "liter_data.dataset lm_less.pve lm_less.n_signal fit_susie.estimate_residual_variance fit_susie.prior_var fit_susie"
target_sumstats = "liter_data.dataset lm_less.pve lm_less.n_signal get_sumstats lm_less"
output: f'{dirname}/tutorial_{name}/result.RDS'
R: expand = '${ }'
susie_out = dscrutils::dscquery(${dirname:r}, target = "${target_susie}", load.pkl = TRUE)
sumstats_out = dscrutils::dscquery(${dirname:r}, target = "${target_sumstats}", load.pkl = FALSE)
saveRDS(list(susie_out=susie_out, sumstats_out=sumstats_out), ${_output:r})
[B_2]
pve = [0.2]
n = [2,3]
est_res = ['TRUE']
prior = [0.2] # 0.2 / (1-0.2)
combos = len(pve) * len(n) * len(est_res) * len(prior)
input: for_each = ['pve', 'n', 'est_res', 'prior'], concurrent = True
output: dynamic(glob.glob(f'{_input:d}/*.rds'))
R: expand = '${ }'
options(width=999)
out = readRDS(${_input:r})
susie_out = out$susie_out
sumstats_out = out$sumstats_out
# favorit susie flavor
susie_out = susie_out[which(susie_out$fit_susie.prior_var == ${_prior} & susie_out$fit_susie.estimate_residual_variance == ${_est_res} & susie_out$lm_less.n_signal == ${_n} & susie_out$lm_less.pve == ${_pve}), ]
sumstats_out = sumstats_out[which(sumstats_out$lm_less.n_signal == ${_n} & sumstats_out$lm_less.pve == ${_pve}), ]
data_sets = unique(susie_out$liter_data.dataset)
counter = 0
for (d in data_sets) {
susie_files = susie_out[susie_out$liter_data.dataset == d, c("fit_susie.output.file"), drop=F]
fit = readRDS(paste0(${dirname:r}, '/', susie_files[1,1], '.rds'))$posterior
sumstats_files = sumstats_out[sumstats_out$liter_data.dataset == d, c("get_sumstats.output.file", "lm_less.output.file")]
z_score = readRDS(paste0(${dirname:r}, '/', sumstats_files[1,1], '.rds'))$sumstats
z_score = z_score[1,,]/z_score[2,,]
dat = readRDS(paste0(${dirname:r}, '/', sumstats_files[1,2], '.rds'))$data
truth = dat$true_coef
X = dat$X
for (r in c(1,2)) {
z = z_score[,r]
signals = which(truth[,r]!=0)
sets = susieR::susie_get_CS(fit$alpha[[r]], X=X)
pip = susieR::susie_get_PIP(fit$alpha[[r]], sets$cs_index)
cs = lapply(1:length(sets$cs), function(i) min(sets$cs[[i]]):max(sets$cs[[i]]))
top_z = which.max(abs(z))
if (! (top_z %in% unlist(cs)) && any(signals %in% unlist(sets$cs)) && pip[top_z] < 0.1 && length(sets$cs) == ${_n}) {
counter = counter + 1
print(c('=====', counter, '======='))
print(top_z)
print(sets)
print(signals)
print(fit$niter[[r]])
print(r)
saveRDS(dat, paste0(${_input:dr}, '/', counter, '_', r, '.rds'))
}
}
}
%cd /home/gaow/GIT/github/mvarbvs/dsc/susie_comparison/tutorial_20180710
!ls *.rds
I think this case below looks good. I'll make a separate notebook for it to demonstrate.
dat = readRDS('2_1.rds')
r = 1
#names(dat)
pve = 0.2 # we know from simulation parameters though not saved to this data object ...
library(susieR)
set.seed(1)
b = dat$true_coef[,r]
b[which(b!=0)] = 1
z_score = susieR:::calc_z(dat$X, dat$Y[,r])
susie_pplot(z_score, dtype='z', b=b)
fitted = susie(dat$X, dat$Y[,r], L=5,
estimate_residual_variance=TRUE,
prior_variance=pve/(1-pve),
tol=1e-3, track_fit=TRUE)
sets = susie_get_CS(fitted,
coverage = 0.95,
X = dat$X,
min_abs_corr = 0.4)
pip = susie_get_PIP(fitted, sets$cs_index)
susie_pplot(pip, fitted = fitted, CS = sets, max_cs = 0.4, dtype='PIP',b=b)