%cd ~/GIT/github/mvarbvs
From my log-file records,
[1] "============== 1 =============="
[1] "DAP outlier"
[[1]]
[1] 1
[[2]]
[1] "~/Documents/GTExV8/Toys/Thyroid.ENSG00000105427.RDS"
$lm_less.output.file
[1] "lm_less/liter_data_9_summarize_ld_1_lm_less_1"
[1] "DAP outlier"
[[1]]
[1] 1
[[2]]
[1] "~/Documents/GTExV8/Toys/Thyroid.ENSG00000251484.RDS"
$lm_less.output.file
[1] "lm_less/liter_data_125_summarize_ld_1_lm_less_1"
[1] "============== 2 =============="
[1] "DAP outlier"
[[1]]
[1] 1
[[2]]
[1] "~/Documents/GTExV8/Toys/Thyroid.ENSG00000140990.RDS"
$lm_less.output.file
[1] "lm_less/liter_data_35_summarize_ld_1_lm_less_2"
[1] "DAP outlier"
[[1]]
[1] 1
[[2]]
[1] "~/Documents/GTExV8/Toys/Thyroid.ENSG00000188229.RDS"
$lm_less.output.file
[1] "lm_less/liter_data_91_summarize_ld_1_lm_less_2"
[1] "DAP outlier"
[[1]]
[1] 1
[[2]]
[1] "~/Documents/GTExV8/Toys/Thyroid.ENSG00000227527.RDS"
$lm_less.output.file
[1] "lm_less/liter_data_106_summarize_ld_1_lm_less_2"
[1] "DAP outlier"
[[1]]
[1] 1
[[2]]
[1] "~/Documents/GTExV8/Toys/Thyroid.ENSG00000276644.RDS"
$lm_less.output.file
[1] "lm_less/liter_data_148_summarize_ld_1_lm_less_2"
[global]
parameter: data_file = path('dsc/susie_comparison/lm_less/liter_data_91_summarize_ld_1_lm_less_2.pkl')
parameter: r = 1
parameter: cwd = path('~/tmp')
out_prefix = path(f'{cwd}/{data_file:bn}_dap_outlier')
[dap_1]
args = '-ld_control 0.20 --all'
input: data_file
output: f'{out_prefix}.dap.pkl'
python: expand = '${ }'
import sys
import subprocess
import pandas as pd
import numpy as np
def write_dap_full(x,y,prefix,r):
names = np.array([('geno', i+1, f'group{r}') for i in range(x.shape[1])])
with open(f'{prefix}.data', 'w') as f:
print(*(['pheno', 'pheno', f'group{r}'] + list(np.array(y).ravel())), file=f)
np.savetxt(f, np.hstack((names, x.T)), fmt = '%s', delimiter = ' ')
def run_dap_full(prefix, args):
cmd = ['dap-g', '-d', f'{prefix}.data', '-o', f'{prefix}.result', '--output_all'] + ' '.join(args).split()
subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
def write_dap_ss(z,prefix):
'''z-score vesion of dap input is the same as FINEMAP'''
ids = np.array([str(i+1) for i in range(z.shape[0])])
with open(f'{prefix}.z', 'w') as f:
np.savetxt(f, np.vstack((ids, z)).T, fmt = '%s', delimiter = ' ')
def run_dap_z(ld, prefix, args):
cmd = ['dap-g', '-d_z', f'{prefix}.z', '-d_ld', ld, '-o', f'{prefix}.result', '--all'] + ' '.join(args).split()
subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
def extract_dap_output(prefix):
out = [x.strip().split() for x in open(f'{prefix}.result').readlines()]
pips = []
clusters = []
still_pip = True
for line in out:
if len(line) == 0:
continue
if len(line) > 2 and line[2] == 'cluster_pip':
still_pip = False
continue
if still_pip and (not line[0].startswith('((')):
continue
if still_pip:
pips.append([line[1], float(line[2]), float(line[3]), int(line[4])])
else:
clusters.append([len(clusters) + 1, float(line[2]), float(line[3])])
pips = pd.DataFrame(pips, columns = ['snp', 'snp_prob', 'snp_log10bf', 'cluster'])
clusters = pd.DataFrame(clusters, columns = ['cluster', 'cluster_prob', 'cluster_avg_r2'])
clusters = pd.merge(clusters, pips.groupby(['cluster'])['snp'].apply(','.join).reset_index(), on = 'cluster')
return {'snp': pips, 'set': clusters}
def dap_single(x, y, prefix, r, args):
write_dap_full(x,y,prefix,r)
run_dap_full(prefix,args)
return extract_dap_output(prefix)
def dap_single_z(z, ld, prefix, args):
write_dap_ss(z,prefix)
run_dap_z(ld,prefix,args)
return extract_dap_output(prefix)
def dap_batch(X, Y, prefix, *args):
return dict([(f'V{r+1}', dap_single(X, Y[:,r], f'{prefix}_condition_{r+1}', r+1, args)) for r in range(Y.shape[1])])
def dap_batch_z(z, ld, prefix, *args):
return dict([(f'V{r+1}', dap_single_z(z[:,r], ld, f'{prefix}_condition_{r+1}', args)) for r in range(z.shape[1])])
import os
import pickle
import tempfile
import warnings
if not sys.warnoptions:
warnings.simplefilter("ignore")
input_file = ${_input:ar}
output_file = ${_output:anr}
args = '${args}'
data = pickle.load(open(input_file, 'rb'))['data']
# cache = tempfile.NamedTemporaryFile(suffix = '.dap').name
name = ${out_prefix:r} + '.dap'
posterior = dap_batch(data['X'], data['Y'], name, args)
pickle.dump(posterior, open(output_file + '.pkl', 'wb'))
[dap_2]
output: f'{_input:n}.png'
R: expand = True
library(susieR)
dap = dscrutils::read_dsc({_input:r})
b = dscrutils::read_dsc({data_file:r})$data$true_coef[,{r}]
snp = dap[[{r}]]$snp
pip = snp[order(as.numeric(snp$snp)),]$snp_prob
png({_output:r})
susie_plot(pip, y='PIP', b=b, main = 'DAP-G')
dev.off()
[susie_1]
input: data_file
output: f'{out_prefix}.susie.rds', f'{out_prefix}.susie_pip.png', f'{out_prefix}.susie_z.png'
R: expand = True
library(susieR)
d = dscrutils::read_dsc({_input:r})$data
b = d$true_coef[,{r}]
fitted = susie(d$X, d$Y[,{r}], estimate_prior_variance = TRUE,
estimate_residual_variance=TRUE,
compute_univariate_z = TRUE,
tol=1e-3, track_fit=TRUE)
saveRDS(fitted, {_output[0]:r})
png({_output[1]:r})
susie_plot(fitted,y="PIP",b=b)
dev.off()
png({_output[2]:r})
susie_plot(fitted,y='z',b=b)
dev.off()
d='dsc/susie_comparison/lm_less/liter_data_91_summarize_ld_1_lm_less_2.pkl'
sos run analysis/20180602_DAP_Outlier_Exam.ipynb dap --r 1 --data-file $d \
-b ~/GIT/github/mvarbvs/dsc/modules/linux
sos run analysis/20180602_DAP_Outlier_Exam.ipynb susie --r 1 --data-file $d
DAP output,
%preview /home/gaow/tmp/liter_data_91_summarize_ld_1_lm_less_2_dap_outlier.dap.png
SuSiE output,
%preview /home/gaow/tmp/liter_data_91_summarize_ld_1_lm_less_2_dap_outlier.susie_pip.png
%preview /home/gaow/tmp/liter_data_91_summarize_ld_1_lm_less_2_dap_outlier.susie_z.png
This is a somewhat understandable mistake.
d='dsc/susie_comparison/lm_less/liter_data_148_summarize_ld_1_lm_less_2.pkl'
sos run analysis/20180602_DAP_Outlier_Exam.ipynb dap --r 1 --data-file $d -b ~/GIT/github/mvarbvs/dsc/modules/linux
sos run analysis/20180602_DAP_Outlier_Exam.ipynb susie --r 1 --data-file $d
%preview /home/gaow/tmp/liter_data_148_summarize_ld_1_lm_less_2_dap_outlier.dap.png
%preview /home/gaow/tmp/liter_data_148_summarize_ld_1_lm_less_2_dap_outlier.susie_pip.png
%preview /home/gaow/tmp/liter_data_148_summarize_ld_1_lm_less_2_dap_outlier.susie_z.png
This is a less tolerable mistake.
d='dsc/susie_comparison/lm_less/liter_data_106_summarize_ld_1_lm_less_2.pkl'
sos run analysis/20180602_DAP_Outlier_Exam.ipynb dap --r 1 --data-file $d -b ~/GIT/github/mvarbvs/dsc/modules/linux
sos run analysis/20180602_DAP_Outlier_Exam.ipynb susie --r 1 --data-file $d
This is a case where signals are weak. SuSiE does not report anything usable (huge and messy CS). Although not completely unreasonable, DAP is a bit agressive here and ended up reporting a wrong top variable.
%preview /home/gaow/tmp/liter_data_106_summarize_ld_1_lm_less_2_dap_outlier.dap.png
%preview /home/gaow/tmp/liter_data_106_summarize_ld_1_lm_less_2_dap_outlier.susie_pip.png
%preview /home/gaow/tmp/liter_data_106_summarize_ld_1_lm_less_2_dap_outlier.susie_z.png