Multivariate Bayesian variable selection regression

Workflow to extract PIP and set information for different methods

In [3]:
%revisions -s -n 10
Revision Author Date Message
27f917a Gao Wang 2018-08-16 Plot enrichment results
476d4ee Gao Wang 2018-08-13 Fix coverage pipeline
ded3316 Gao Wang 2018-08-13 Finalize hard case config & fix plot scripts
dc9652f Gao Wang 2018-08-06 Update purity plot workflows
16bc911 Gao Wang 2018-08-06 Update documentation
24b7bf3 Gao Wang 2018-07-13 Update documentation
e624986 Gao Wang 2018-06-29 Use average abs cor insteand of squared
62926ca Gao Wang 2018-06-25 Update coverage results
f96a5ab Gao Wang 2018-06-23 Update power table
b2e364b Gao Wang 2018-06-23 Adjust PIP cutoff for PRC

Previously I've ran this specific DSC using:

dsc susie.dsc --target run_comparison -o susie_comparison

Here I query from the result.

In [2]:
[global]
parameter: cwd = path('~/GIT/github/mvarbvs/dsc')
parameter: date = '1008'
# set to 0 to use estimated prior, or another number to use specified prior
# as configured in the DSC
parameter: susie_prior = [0.1, 0.0]
# null weight options to evaluate
parameter: null_weight = [0.0, 0.5, 0.9, 0.95]
dirname = path(f'{cwd:a}/susie_comparison/')

def fmtP(x):
    return str(x).replace(".", "p")

Get data

In [3]:
[pip_1, power_1, cali_pip_1, coverage_1, roc_1]
output: f'{dirname}/PIP_comparison_{date}.rds'
R: expand = '${ }', workdir = cwd
    dap_out = dscrutils::dscquery(${dirname:br}, 
                        target = "liter_data.dataset liter_data lm_less lm_less.pve lm_less.n_signal fit_dap")
    susie_out = dscrutils::dscquery(${dirname:br}, 
                        target = "liter_data.dataset liter_data lm_less lm_less.pve lm_less.n_signal fit_susie.prior_var fit_susie.null_weight fit_susie")
    caviar_out = dscrutils::dscquery(${dirname:br}, 
                        target = "liter_data.dataset liter_data lm_less lm_less.pve lm_less.n_signal fit_caviar.args fit_caviar")
    finemap_out = dscrutils::dscquery(${dirname:br}, 
                        target = "liter_data.dataset liter_data lm_less lm_less.pve lm_less.n_signal fit_finemap.args fit_finemap")
    saveRDS(list(dap=dap_out, susie=susie_out, caviar=caviar_out, finemap=finemap_out), ${_output:r})

PIP comparison

In [4]:
[pip_2]
input: for_each = ['null_weight', 'susie_prior'], concurrent = True
output: f'{_input:n}_prior_{fmtP(_susie_prior)}_null_{fmtP(_null_weight)}.rds'
R: stdout = f'{_output:n}.log', expand = '${ }', workdir = cwd
    dat = readRDS(${_input:r})
    dap_out = dat$dap
    caviar_out = dat$caviar
    susie_out = dat$susie
    finemap_out = dat$finemap
    # favorit susie flavor
    susie_out = susie_out[which(susie_out$fit_susie.prior_var == ${_susie_prior} & susie_out$fit_susie.null_weight == ${_null_weight}), ]
    susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.null_weight))

    data_sets = unique(susie_out$liter_data.dataset)
    n_signals = unique(susie_out$lm_less.n_signal)

    result = list()
    for (s in n_signals) {
        result[[as.character(s)]] = NULL
        if (s > 3) {
            has_caviar = FALSE
        } else {
            has_caviar = TRUE
        }
        print(paste('==============', s, '=============='))
        for (d in data_sets) {
            in_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d),c("lm_less.output.file"),drop=FALSE]
            truth = dscrutils::read_dsc(paste0(${dirname:r}, '/', in_files[1,1]))$data$true_coef
            for (r in 1:1) {
                signals = which(truth[,r]!=0)
                out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d),c("fit_susie.output.file"),drop=FALSE]
                susie = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$posterior
                susie = susie[[r]]$vars
                # option 1, only compare the mappable set
                # in_CI = susie$variable[which(susie$cs>0)]
                # pip = susie$variable_prob[which(susie$cs>0)]
                # or option 2 compare everything
                in_CI = susie$variable
                pip = susie$variable_prob
                #
                out_files = dap_out[which(dap_out$lm_less.n_signal == s & dap_out$liter_data.dataset == d),c("fit_dap.output.file"),drop=FALSE]
                dap = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$posterior
                dap = dap[[as.character(r-1)]]$snp
                #print(head(dap, length(pip)))
                dap = dap[which(dap$snp %in% as.character(in_CI)),]
                dap = dap[match(in_CI, dap$snp),]
                #print(dap)
                #print(pip)        
                #print(in_CI)
                if (has_caviar) {
                    out_files = caviar_out[which(caviar_out$lm_less.n_signal == s & caviar_out$liter_data.dataset == d & caviar_out$fit_caviar.args == paste('-g 0.001 -c', s)),c("fit_caviar.output.file"),drop=FALSE]
                    caviar = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$posterior
                    caviar = caviar[[r]]$snp
                    caviar = caviar[which(caviar$snp %in% as.character(in_CI)),]
                    caviar = caviar[match(in_CI, caviar$snp),]
                    out_files = finemap_out[which(finemap_out$lm_less.n_signal == s & finemap_out$liter_data.dataset == d & finemap_out$fit_finemap.args == paste('--n-causal-max', s)),c("fit_finemap.output.file"),drop=FALSE]
                    finemap = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$posterior
                    finemap = finemap[[r]]$snp
                    finemap = finemap[which(finemap$snp %in% as.character(in_CI)),]
                    finemap = finemap[match(in_CI, finemap$snp),]
                    pip = cbind(pip, as.vector(dap$snp_prob), as.vector(caviar$snp_prob), as.vector(finemap$snp_prob), in_CI %in% signals)
                } else {
                    pip = cbind(pip, as.vector(dap$snp_prob), in_CI %in% signals)
                }
                # remove all zero PIP / table
                pip = pip[rowSums(pip) > 0, ]
                ## BEGIN debug
                outlier = pip[which(pip[,1] < 0.1 & pip[,2]>0.6), ,drop=F]
                if (nrow(outlier)>0 && s <= 2) {
                  print("DAP outlier")
                  print(c(r,d,in_files))
                }
                if (has_caviar && s == 1) {
                  conflict = pip[which(pip[,1] < 0.95 & pip[,3] > 0.95), ,drop=F]
                  if (nrow(conflict) > 0) {
                      print("CAVIAR-susie conflict")
                      print(d)
                      print(pip[conflict,])
                      print("CAVIAR")
                      print(caviar[which(caviar$snp_prob>0.95),])
                      print("susie")
                      print(susie[which(susie$variable_prob>0.95),])
                  }
                }
                ## END debug  
                if (is.null(result[[as.character(s)]])) {
                    result[[as.character(s)]] = pip
                } else {
                    result[[as.character(s)]] = rbind(result[[as.character(s)]], pip)
                }
            }
        }
        result[[as.character(s)]] = data.frame(result[[as.character(s)]])
        if (has_caviar) {
            colnames(result[[as.character(s)]]) = c('susie', 'dap', 'caviar', 'finemap', 'is_signal')
        } else {
            colnames(result[[as.character(s)]]) = c('susie', 'dap', 'is_signal')
        }
    }
    saveRDS(result, ${_output:r})
In [4]:
[pip_3]
comparisons = ['susie_vs_dap', 'susie_vs_caviar', 'susie_vs_finemap', 'dap_vs_caviar', 'dap_vs_finemap', 'caviar_vs_finemap']
output: paths([f'{_input:n}.{x}.png' for x in comparisons])
R: expand = '${ }'
    result = readRDS(${_input:r})
    merge_img = function(prefix, n) {
        files = paste0(prefix, '_', seq(1:n), '.png')
        cmd = paste('convert +append', paste(files, collapse=" "), paste0(prefix, '.png'))
        system(cmd)
        system(paste('rm -f', paste(files, collapse=" ")))
        files = paste0(prefix, '_', seq(1:n), '.md')
        cmd = paste('cat', paste(files, collapse=" "), '>', paste0(prefix, '.md'))
        system(cmd)
        system(paste('rm -f', paste(files, collapse=" ")))
    }

    merge_lists = function(lists) {
        lists = do.call(rbind,lapply(cbind(lists), unlist))
        names = colnames(lists)
        lists = lapply(1:ncol(lists), function(i) lists[,i])
        names(lists) = names
        return(lists)
    }

    plot_pip = function(x, n_causal, s, output_prefix, xname, yname, xlab, ylab, pip_cutoff = -1, pip_diff_categories = c(0.1, 0.15, 0.2)) {
      x = x[x[[xname]] > pip_cutoff & x[[yname]] > pip_cutoff,]
      if (pip_cutoff >=0) {
          xlab = paste0(xlab, ">", pip_cutoff)
          ylab = paste0(ylab, ">", pip_cutoff)
      }
      colors = sapply(1:length(x$is_signal), function(i) ifelse(x$is_signal[i],'#800000','#d6d6ce'))
      # compute difference in pip and proportions
      pip_diff = abs(x[[xname]] - x[[yname]])
      pip_diff_cnts = sapply(pip_diff_categories, function(x) length(which(pip_diff >= x)))
      pip_diff_text = vector()
      for (i in 1:length(pip_diff_categories)) {
          tmp = paste0('- ', pip_diff_cnts[i], '/', length(pip_diff), ' (', formatC(pip_diff_cnts[i] / length(pip_diff) * 100, format = "e", digits = 2), '%) differ by ', pip_diff_categories[i])
          pip_diff_text = c(pip_diff_text, tmp)
      }
      pip_diff_text = paste(pip_diff_text, collapse = '\n')
      corr_text = paste('- correlation', round(cor(x)[1,2],2))
      header = paste('#', xname, 'vs', yname, s, n_causal, 'causal\n')
      pdf(paste0(output_prefix, '_', n_causal, '.pdf'), width=5, height=5, pointsize=14)
      plot(x[[xname]][which(x$is_signal==0)], x[[yname]][which(x$is_signal==0)], xlab = xlab, ylab = ylab, xlim = c(0,1), ylim = c(0,1),
         main = paste0(n_causal, s , ifelse(n_causal>1, ' effect variables', ' effect variable')),
         col = '#d6d6ce', pch = 20, cex = 1.4, bty='l')
      points(x[[xname]][which(x$is_signal==1)], x[[yname]][which(x$is_signal==1)], col='#800000', pch=20, cex=1.4)
      abline(0,1,col=2)
      #abline(h=0.95, col='gray')
      #abline(v=0.95, col='gray')
      dev.off()
      system(paste0("convert -flatten -density 120 ", paste0(output_prefix, '_', n_causal, '.pdf'), " ", paste0(output_prefix, '_', n_causal, '.png')))
      write(paste(header, corr_text, pip_diff_text, '\n', sep = '\n'), paste0(output_prefix, '_', n_causal, '.md'))
    }
    result_1 = result[[1]]
    result_2 = result[[2]]
    result_3 = result[[3]]
    result_5 = do.call(rbind, lapply(3:5, function(i) result[[i]][,c('susie','dap', 'is_signal')]))
    # susie vs dap
    plot_pip(result_1, 1, '', ${_output[0]:nr}, 'susie', 'dap', 'PIP SuSiE', 'PIP DAP-G')
    plot_pip(result_2, 2, '', ${_output[0]:nr}, 'susie', 'dap', 'PIP SuSiE', 'PIP DAP-G')
    plot_pip(result_5, 3, ' ~ 5', ${_output[0]:nr}, 'susie', 'dap', 'PIP SuSiE', 'PIP DAP-G')
    merge_img(${_output[0]:nr}, 3)
    # susie vs caviar
    plot_pip(result_1, 1, '', ${_output[1]:nr}, 'susie', 'caviar', 'PIP SuSiE', 'PIP CAVIAR')
    plot_pip(result_2, 2, '', ${_output[1]:nr}, 'susie', 'caviar', 'PIP SuSiE', 'PIP CAVIAR')
    plot_pip(result_3, 3, '', ${_output[1]:nr}, 'susie', 'caviar', 'PIP SuSiE', 'PIP CAVIAR')
    merge_img(${_output[1]:nr}, 3)
    # susie vs finemap
    plot_pip(result_1, 1, '', ${_output[2]:nr}, 'susie', 'finemap', 'PIP SuSiE', 'PIP FINEMAP')
    plot_pip(result_2, 2, '', ${_output[2]:nr}, 'susie', 'finemap', 'PIP SuSiE', 'PIP FINEMAP')
    plot_pip(result_3, 3, '', ${_output[2]:nr}, 'susie', 'finemap', 'PIP SuSiE', 'PIP FINEMAP')
    merge_img(${_output[2]:nr}, 3)
    # dap vs caviar
    plot_pip(result_1, 1, '', ${_output[3]:nr}, 'dap', 'caviar', 'PIP DAP-G', 'PIP CAVIAR')
    plot_pip(result_2, 2, '', ${_output[3]:nr}, 'dap', 'caviar', 'PIP DAP-G', 'PIP CAVIAR')
    plot_pip(result_3, 3, '', ${_output[3]:nr}, 'dap', 'caviar', 'PIP DAP-G', 'PIP CAVIAR')
    merge_img(${_output[3]:nr}, 3)
    # dap vs finemap
    plot_pip(result_1, 1, '', ${_output[4]:nr}, 'dap', 'finemap', 'PIP DAP-G', 'PIP FINEMAP')
    plot_pip(result_2, 2, '', ${_output[4]:nr}, 'dap', 'finemap', 'PIP DAP-G', 'PIP FINEMAP')
    plot_pip(result_3, 3, '', ${_output[4]:nr}, 'dap', 'finemap', 'PIP DAP-G', 'PIP FINEMAP')
    merge_img(${_output[4]:nr}, 3)
    # caviar vs finemap
    plot_pip(result_1, 1, '', ${_output[5]:nr}, 'caviar', 'finemap', 'PIP CAVIAR', 'PIP FINEMAP')
    plot_pip(result_2, 2, '', ${_output[5]:nr}, 'caviar', 'finemap', 'PIP CAVIAR', 'PIP FINEMAP')
    plot_pip(result_3, 3, '', ${_output[5]:nr}, 'caviar', 'finemap', 'PIP CAVIAR', 'PIP FINEMAP')
    merge_img(${_output[5]:nr}, 3)

Summary of discovery

In [5]:
[power_2]
# Power analysis
# to match with DAP -ld_control 0.25
# to match with susie 95% mappable CS, we set dap cutoff to 0.95 also
dap_cluster_cutoff = [('cluster_prob', 0.95), ('cluster_avg_r2', 0.25)]
input: for_each = ['dap_cluster_cutoff', 'null_weight', 'susie_prior'], group_by = 1, concurrent = True
output: f'{dirname}/Power_comparison_{date}_{_dap_cluster_cutoff[0]}_prior_{fmtP(_susie_prior)}_null_{fmtP(_null_weight)}.rds'
R: stdout = f'{_output:n}.log', expand = '${ }', workdir = cwd
    dat = readRDS(${_input:r})
    dap_out = dat$dap
    susie_out = dat$susie
    # favorite susie flavor
    susie_out = susie_out[which(susie_out$fit_susie.prior_var == ${_susie_prior} & susie_out$fit_susie.null_weight == ${_null_weight}), ]
    susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.null_weight))
    data_sets = unique(susie_out$liter_data.dataset)
    n_signals = unique(susie_out$lm_less.n_signal)
    n_r = 2
    n_experiments = n_r * length(data_sets)
    result = NULL
    overlap_cs = list()
    for (s in n_signals) {
        susie_signals = 0
        dap_signals = 0
        susie_size = 0
        dap_size = 0
        # I cannot find a good median tracker so do it stupid way: save all and take median later
        susie_sizes = vector()
        dap_sizes = vector()
        # do the same for mean ...
        susie_avg_ld = vector()
        dap_avg_ld = vector()
        susie_tdc = 0
        dap_tdc = 0
        susie_dc = 0
        dap_dc = 0
        susie_tc = 0
        dap_tc = 0
        overlap_cs[[as.character(s)]] = NULL
        for (d in data_sets) {
            out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d), c("lm_less.output.file"),drop=FALSE]
            truth = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$data$true_coef
            dap_files = dap_out[which(dap_out$lm_less.n_signal == s & dap_out$liter_data.dataset == d),c("fit_dap.output.file"),drop=FALSE]
            dap = dscrutils::read_dsc(paste0(${dirname:r}, '/', dap_files[1,1]))$posterior
            susie_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d),c("fit_susie.output.file"),drop=FALSE]
            susie = dscrutils::read_dsc(paste0(${dirname:r}, '/', susie_files[1,1]))$posterior
            for (r in 1:n_r) {
                signals = which(truth[,r]!=0)
                # SuSiE in CS
                susie_cs = list()
                if (!is.null(susie[[r]]$cs)) {
                  for (i in 1:nrow(susie[[r]]$cs)) {
                      susie_cs[[i]] = as.integer(unlist(strsplit(susie[[r]]$cs$variable[i], ",")))
                      if (any(signals %in% susie_cs[[i]])) {
                          susie_size = susie_size + length(susie_cs[[i]])
                          susie_sizes = c(susie_sizes, length(susie_cs[[i]]))
                          susie_avg_ld = c(susie_avg_ld, sqrt(susie[[r]]$cs$cs_avg_r2[i]))
                          susie_tdc = susie_tdc + 1
                          susie_dc = susie_dc + 1
                      }
                    }
                    susie_signals = susie_signals + sum(signals %in% unique(unlist(susie_cs)))
                }
                # check susie CS overlapping status
                if (length(susie_cs) > 0) {
                for (i in 1:length(susie_cs)) {
                  for (j in 1:i) {
                      if (i == j) next
                      status = intersect(susie_cs[[i]], susie_cs[[j]])
                      if (length(status)>0) {
                          if (is.null(overlap_cs[[as.character(s)]])) overlap_cs[[as.character(s)]] = c(d, r, length(susie_cs[[i]]), length(susie_cs[[j]]), length(status))
                          else overlap_cs[[as.character(s)]] = rbind(overlap_cs[[as.character(s)]], c(d, r, length(susie_cs[[i]]), length(susie_cs[[j]]), length(status)))
                      }
                    }
                  }
                }
                print(paste('==============', s, '=============='))
                print(susie_cs)
                susie_tc = susie_tc + length(susie_cs)
                # DAP in cluster
                dap_cluster_raw = dap[[as.character(r-1)]]$set[which(dap[[as.character(r-1)]]$set$${_dap_cluster_cutoff[0]} > ${_dap_cluster_cutoff[1]}), ]
                dap_cluster_ld = dap_cluster_raw$cluster_avg_r2
                dap_cluster_raw = dap_cluster_raw$snp
                dap_cluster = list()
                if (length(dap_cluster_raw) > 0) {
                    for (i in 1:length(dap_cluster_raw)) {
                      dap_cluster[[i]] = as.integer(unlist(strsplit(dap_cluster_raw[i], ",")))
                      if (any(signals %in% dap_cluster[[i]])) {
                          dap_size = dap_size + length(dap_cluster[[i]])
                          dap_sizes = c(dap_sizes, length(dap_cluster[[i]]))
                          dap_avg_ld = c(dap_avg_ld, sqrt(dap_cluster_ld[i]))
                          dap_tdc = dap_tdc + 1
                          dap_dc = dap_dc + 1
                      }
                    }
                    dap_signals = dap_signals + sum(signals %in% unique(unlist(dap_cluster)))
                }
                print(dap_cluster)
                dap_tc = dap_tc + length(dap_cluster)
                ## BEGIN debug
                ## susie made more true discovery than DAP
                if (susie_dc > dap_dc) {
                  print('DAP miss')
                  print(dap[[as.character(r-1)]]$set)
                  print(d)
                  print(r)
                  print(dap_files)
                }
                ## DAP made false discovery, susie did not
                ## under n = 1
                if (length(dap_cluster) > dap_dc && s == 1) {
                  print('DAP false discovery')
                  print(dap[[as.character(r-1)]]$set)
                  print(d)
                  print(r)
                  print(dap_files)
                }
                ## SuSiE made false discovery
                ## under n = 3
                if (length(susie_cs) > susie_dc && s == 3) {
                  print('SuSiE false discovery in data ...')
                  print(d)
                  print(r)
                  print(length(susie_cs))
                  print(susie_dc)
                  print(susie_files)
                }  
                ## END debug
                susie_dc = 0
                dap_dc = 0
            }
        }
        rates = c(s, s*n_experiments, susie_tc, dap_tc, susie_signals/s/n_experiments, dap_signals/s/n_experiments, susie_tdc/susie_tc, dap_tdc/dap_tc, susie_size / susie_tdc, dap_size / dap_tdc, median(susie_sizes), median(dap_sizes), mean(susie_avg_ld,na.rm=T), mean(dap_avg_ld))
        if (is.null(result)) {
          result = rates
        } else {
          result = rbind(result, rates)
        }
    }
    colnames(result) = c('n_signal', 'expected_discoveries', 'susie_discoveries', 'dap_discoveries', 'susie_power', 'dap_power', 'susie_coverage', 'dap_coverage', 'susie_avg_size', 'dap_avg_size', 'susie_median_size', 'dap_median_size', 'susie_avg_ld', 'dap_avg_ld')
    rownames(result) = as.character(result[,1])
    saveRDS(data.frame(result), ${_output:r})
    saveRDS(overlap_cs, "${_output:n}.overlap_status.rds")

PIP calibration

In [ ]:
[cali_pip_2]
bin_size = 20
input: for_each = ['null_weight', 'susie_prior'], concurrent = True
output: f'{_input:n}_calibrated_prior_{fmtP(_susie_prior)}_null_{fmtP(_null_weight)}.rds'
R: stdout = f'{_output:n}.log', expand = '${ }', workdir = cwd
    pip_cutoff = 0
    dat = readRDS(${_input:r})
    dap_out = dat$dap
    caviar_out = dat$caviar
    finemap_out = dat$finemap
    susie_out = dat$susie
    # favorite susie flavor
    susie_out = susie_out[which(susie_out$fit_susie.prior_var == ${_susie_prior} & susie_out$fit_susie.null_weight == ${_null_weight}), ]
    susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.null_weight))

    data_sets = unique(susie_out$liter_data.dataset)
    n_signals = unique(susie_out$lm_less.n_signal)
  
    result = list()
    pip_cali = list()
    for (s in n_signals) {
        result[[as.character(s)]] = NULL
        pip_cali[[as.character(s)]] = list(susie = c(0,0,0), dap = c(0,0,0), caviar = c(0,0,0))
        if (s > 3) {
            has_caviar = FALSE
        } else {
            has_caviar = TRUE
        }
        for (d in data_sets) {
            out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d),c("lm_less.output.file"),drop=FALSE]
            truth = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$data$true_coef
            for (r in c(1,2)) {
                signals = truth[,r]
                signals[which(signals!=0)] = 1
                out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d),c("fit_susie.output.file"),drop=FALSE]
                susie = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$posterior
                snp = susie[[r]]$vars
                # do not restrict PIP
                in_CI = 1:nrow(snp)
                snp = snp[which(snp$variable %in% in_CI),]
                snp = snp[match(in_CI, snp$variable),]
                #print(head(snp))
                susie = as.vector(snp$variable_prob)
                dap = as.vector(snp$snp_prob)
                out_files = dap_out[which(dap_out$lm_less.n_signal == s & dap_out$liter_data.dataset == d),c("fit_dap.output.file"),drop=FALSE]
                dap = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$posterior
                snp = dap[[as.character(r-1)]]$snp
                snp = snp[which(snp$snp %in% as.character(in_CI)),]
                snp = snp[match(in_CI, snp$snp),]
                #print(head(snp))
                dap = as.vector(snp$snp_prob)
                if (has_caviar) {
                    # caviar
                    out_files = caviar_out[which(caviar_out$lm_less.n_signal == s & caviar_out$liter_data.dataset == d & caviar_out$fit_caviar.args == paste('-g 0.001 -c', s)),c("fit_caviar.output.file"),drop=FALSE]
                    caviar = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$posterior
                    snp = caviar[[r]]$snp
                    snp = snp[which(snp$snp %in% as.character(in_CI)),]
                    snp = snp[match(in_CI, snp$snp),]
                    #print(head(snp))
                    caviar = as.vector(snp$snp_prob)
                    # finemap
                    out_files = finemap_out[which(finemap_out$lm_less.n_signal == s & finemap_out$liter_data.dataset == d & finemap_out$fit_finemap.args == paste('--n-causal-max', s)),c("fit_finemap.output.file"),drop=FALSE]
                    finemap = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$posterior
                    snp = finemap[[r]]$snp
                    snp = snp[which(snp$snp %in% as.character(in_CI)),]
                    snp = snp[match(in_CI, snp$snp),]
                    #print(head(snp))
                    finemap = as.vector(snp$snp_prob)  
                    pip = cbind(susie, dap, caviar, finemap, signals[in_CI])
                } else {
                    pip = cbind(susie, dap, signals[in_CI])
                }
                if (is.null(result[[as.character(s)]])) {
                    result[[as.character(s)]] = pip
                } else {
                    result[[as.character(s)]] = rbind(result[[as.character(s)]], pip)
                }
            }
        }
        # make data frame
        res = data.frame(result[[as.character(s)]])
        if (has_caviar) {
            colnames(res) = c('susie', 'dap', 'caviar', 'finemap', 'is_signal')
            names = c('susie', 'dap', 'caviar', 'finemap')
        } else {
            colnames(res) = c('susie', 'dap', 'is_signal')
            names = c('susie', 'dap')
        }
        # make bins
        bins = cbind(seq(1:${bin_size})/${bin_size}-1/${bin_size}, seq(1:${bin_size})/${bin_size})
        for (name in names) {
            for (i in 1:nrow(bins)) {
              tmp = res[which(res[[name]] > bins[i,1] & res[[name]] < bins[i,2]),]
              pip_cali[[as.character(s)]][[name]] = rbind(pip_cali[[as.character(s)]][[name]], c(sum(tmp[[name]]), sum(tmp$is_signal), length(tmp$is_signal)))
          }
        pip_cali[[as.character(s)]][[name]][which(is.na(pip_cali[[as.character(s)]][[name]]))] = 0
        }
    }
    susie = pip_cali[[as.character(1)]]$susie
    for (i in 2:5) susie = susie + pip_cali[[as.character(i)]]$susie
    dap = pip_cali[[as.character(1)]]$dap
    for (i in 2:5) dap = dap + pip_cali[[as.character(i)]]$dap
    cav = pip_cali[[as.character(1)]]$cav
    for (i in 2:3) cav = cav + pip_cali[[as.character(i)]]$cav
    finemap = pip_cali[[as.character(1)]]$finemap
    for (i in 2:3) finemap = finemap + pip_cali[[as.character(i)]]$finemap
    susie[,c(1,2)] = susie[,c(1,2)] / susie[,3]
    dap[,c(1,2)] = dap[,c(1,2)] / dap[,3]
    cav[,c(1,2)] = cav[,c(1,2)] / cav[,3]
    finemap[,c(1,2)] = finemap[,c(1,2)] / finemap[,3]
    saveRDS(list("SuSiE"=susie[-1,], "DAP-G"=dap[-1,], "CAVIAR"=cav[-1,], "FINEMAP"=finemap[-1,]), ${_output:r})
In [ ]:
[cali_pip_3]
depends: executable('convert')
input: group_by = 1, concurrent = True
output: f'{_input:n}.png'
R: expand = '${ }'
    library(ggplot2)
    library(cowplot)
    dot_plot = function(dataframe) {
        ggplot(dataframe, aes(x=mean_pip, y=observed_freq)) + 
          geom_errorbar(aes(ymin=observed_freq-se, ymax=observed_freq+se), colour="gray", size = 0.2, width=.01) +
          geom_point(size=1.5, shape=21, fill="#002b36") + # 21 is filled circle
          xlab("Mean PIP") +
          ylab("Observed frequency") +
          coord_cartesian(ylim=c(0,1), xlim=c(0,1)) +
          geom_abline(slope=1,intercept=0,colour='red', size=0.2) +
          ggtitle(name) +
          expand_limits(y=0) +                        # Expand y range
          theme_cowplot()
    }

    dat = readRDS(${_input:r})
    idx = 0
    for (name in names(dat)) {
      idx = idx + 1
      dat[[name]][,3] = sqrt(dat[[name]][,2] * (1 - dat[[name]][,2]) / dat[[name]][,3]) * 2
      dat[[name]] = as.data.frame(dat[[name]])
      colnames(dat[[name]]) = c("mean_pip", "observed_freq", "se")
      pdf(paste0(${_output:nr}, '_' , idx, '.pdf'), width=3, height=3, pointsize=16)
      print(dot_plot(dat[[name]]))
      dev.off()
      system(paste0("convert -density 120 ", ${_output:nr}, '_' , idx, '.pdf', " ", ${_output:nr}, '_' , idx, '.png'))
    }
    files = paste0(${_output:nr}, '_', seq(1:idx), '.png')
    cmd = paste('convert +append', paste(files, collapse=" "), ${_output:r})
    system(cmd)
    system(paste('rm -f', paste(files, collapse=" ")))

Coverage

In [ ]:
[coverage_2]
ld_cutoff = 0.5
parameter: n_signals = 5
input: for_each = ['susie_prior'], concurrent = True
output: f'{dirname}/Coverage_{date}_prior_{fmtP(_susie_prior)}_null_{fmtP(null_weight[0])}_signals_{n_signals}.rds'

R: stdout = f'{_output:n}.log', expand = '${ }', workdir = cwd
    ld_cutoff = ${ld_cutoff}
    dat = readRDS(${_input:r})
    susie_out = dat$susie
    # favorit susie flavor
    susie_out = susie_out[which(susie_out$fit_susie.prior_var == ${_susie_prior} & susie_out$fit_susie.null_weight == ${null_weight[0]}), ]
    susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.null_weight))

    data_sets = unique(susie_out$liter_data.dataset)
    if (is.null(${n_signals})) {
        n_signals = unique(susie_out$lm_less.n_signal)
    } else {
        n_signals = 1:${n_signals}
    }
    positives = list()
    for (s in n_signals) {
        positives[[as.character(s)]] = list()
        print(paste('========', s, '========'))
        for (d in data_sets) {
            out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d),c("fit_susie.output.file", "liter_data.output.file", "lm_less.output.file")]
            fit = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$fit
            X = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,2]))$data$X
            truth = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,3]))$data$true_coef
            for (r in c(1,2)) {
                signals = which(truth[,r]!=0)
                susie_cs_all = list()
                for (level in c(0.01, 0.05, 0.1, 0.15, 0.2, 0.25)) {
                  susie_cs_all[[as.character(level*100)]] = susieR::susie_get_CS(fit[[r]], coverage=1-level, X = X, min_abs_corr = ld_cutoff)$cs
                  if (! as.character(level*100) %in% names(positives[[as.character(s)]])) {
                      positives[[as.character(s)]][[as.character(level*100)]] = c(0,0)
                  }
                  if (length(susie_cs_all[[as.character(level*100)]]) > 0) {
                      for (i in 1:length(susie_cs_all[[as.character(level*100)]])) {
                          susie_cs = susie_cs_all[[as.character(level*100)]][[i]]
                          if (length(susie_cs) == 0) {
                              next
                          }
                          if (any(signals %in% susie_cs)) {
                              positives[[as.character(s)]][[as.character(level*100)]][1] = positives[[as.character(s)]][[as.character(level*100)]][1] + 1
                          } else {
                              print(paste(d, r, level, i))
                              print(susie_cs)
                              print(signals)
                              positives[[as.character(s)]][[as.character(level*100)]][2] = positives[[as.character(s)]][[as.character(level*100)]][2] + 1
                          }
                      }            
                  }
                }
            }
        }
        print(positives[[as.character(s)]])
    }
    saveRDS(positives, ${_output:r})
In [ ]:
[coverage_3]
parameter: n_signals = 5
output: f'{dirname}/Coverage_{date}_null_weight_{fmtP(null_weight[0])}_signals_{n_signals}.png'
R: expand = '${ }'
    get_summary = function(dat, names) {
        res = c(0,0)
        for (level in c(0.01, 0.05, 0.1, 0.15, 0.2, 0.25)) {
            s = as.character(level*100)
            d = do.call(rbind, lapply(1:length(dat), function(i) dat[[as.character(i)]][[s]]))
            d = colSums(d)
            res = rbind(res, c(level, d[2]/(d[1]+d[2])))
        }
        res = data.frame(res[-1,])
        colnames(res) = names
        res
    }
    dat1 = get_summary(readRDS(${_input[0]:r}), c('expected', 'fdp_fixed_prior'))
    dat2 = get_summary(readRDS(${_input[-1]:r}), c('expected', 'fdp_estimated_prior'))
    dat = merge(dat1,dat2,by='expected')
    lower = min(1 - dat$fdp_estimated_prior, 1 - dat$fdp_fixed_prior, 1 - dat$expected)
    upper = max(1 - dat$fdp_estimated_prior, 1 - dat$fdp_fixed_prior, 1 - dat$expected)
    pdf("${_output:n}.pdf", width=5, height=5, pointsize=15)
    plot(1 - dat$expected, 1 - dat$fdp_fixed_prior, t="l", xlim = c(lower,upper), ylim = c(lower,upper),
         col='#A60628', ylab = "observed coverage", xlab ="expected coverage", main = paste("1 ~", ${n_signals}, "effect variables"), bty='l')
    lines(1 - dat$expected, 1 - dat$fdp_estimated_prior, xlim = c(lower,upper), ylim = c(lower,upper), col='#348ABD')
    legend("bottomright", legend=c("SuSiE (fixed prior)", "SuSiE (estimated prior)"),
           col=c('#A60628', '#348ABD'), lty=c(1,1), cex=0.8)
    abline(0,1,col='gray')
    dev.off()
    system(paste0("convert -density 120 ", ${_output:nr}, '.pdf', " ", ${_output:r}))

ROC

In [48]:
[roc_2]
pip_cutoff = 0.3
dap_cluster_cutoff = ('cluster_prob', 0.95)
input: for_each = 'null_weight', concurrent = True
output: f'{dirname}/ROC_{date}_prior_{fmtP(susie_prior[0])}_null_{fmtP(_null_weight)}_two.rds', f'{dirname}/ROC_{date}_prior_{fmtP(susie_prior[0])}_null_{fmtP(_null_weight)}_all.rds'
R: stdout = f'{_output[0]:n}.log', expand = '${ }', workdir = cwd
    dat = readRDS(${_input:r})
    dap_out = dat$dap
    susie_out = dat$susie
    caviar_out = dat$caviar
    finemap_out = dat$finemap
    # favorit susie flavor
    susie_out = susie_out[which(susie_out$fit_susie.prior_var == ${susie_prior[0]} & susie_out$fit_susie.null_weight == ${_null_weight}), ]
    susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.null_weight))

    data_sets = unique(susie_out$liter_data.dataset)
    n_signals = unique(susie_out$lm_less.n_signal)

    result = list()
    result_all = list()
    for (s in n_signals) {
        result[[as.character(s)]] = list(susie=NULL, dap=NULL)
        result_all[[as.character(s)]] = list(susie=NULL, dap=NULL, caviar=NULL, finemap=NULL)
        for (d in data_sets) {
            out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d),c("lm_less.output.file"),drop=FALSE]
            truth = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$data$true_coef
            for (r in 1:2) {
                signals = truth[,r]
                signals[which(signals!=0)] = 1
                # susie
                out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d),c("fit_susie.output.file"),drop=FALSE]
                susie = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$posterior
                pip = susie[[r]]$vars
                pip = pip[order(as.numeric(pip$variable)),]$variable_prob                
                if (is.null(result[[as.character(s)]]$susie)) {
                    result[[as.character(s)]]$susie = cbind(pip, signals)
                    if (s <= 3) result_all[[as.character(s)]]$susie = cbind(pip, signals)
                } else {
                    result[[as.character(s)]]$susie = rbind(result[[as.character(s)]]$susie, cbind(pip, signals))
                    if (s <= 3) result_all[[as.character(s)]]$susie = rbind(result_all[[as.character(s)]]$susie, cbind(pip, signals))
                }
                # dap
                out_files = dap_out[which(dap_out$lm_less.n_signal == s & dap_out$liter_data.dataset == d),c("fit_dap.output.file"),drop=FALSE]
                dap = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$posterior
                pip = dap[[as.character(r-1)]]$snp
                pip = pip[order(as.numeric(pip$snp)),]$snp_prob
                if (is.null(result[[as.character(s)]]$dap)) {
                    result[[as.character(s)]]$dap = cbind(pip, signals)
                    if (s <= 3) result_all[[as.character(s)]]$dap = cbind(pip, signals)
                } else {
                    result[[as.character(s)]]$dap = rbind(result[[as.character(s)]]$dap, cbind(pip, signals))
                    if (s <= 3) result_all[[as.character(s)]]$dap = rbind(result_all[[as.character(s)]]$dap, cbind(pip, signals))
                }
                if (s <= 3) {
                  # CAVIAR
                  out_files = caviar_out[which(caviar_out$lm_less.n_signal == s & caviar_out$liter_data.dataset == d & caviar_out$fit_caviar.args == paste('-g 0.001 -c', s)),c("fit_caviar.output.file"),drop=FALSE]
                  caviar = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$posterior
                  pip = caviar[[r]]$snp
                  pip = pip[order(as.numeric(pip$snp)),]$snp_prob
                  if (is.null(result_all[[as.character(s)]]$caviar)) {
                    result_all[[as.character(s)]]$caviar = cbind(pip, signals)
                  } else {
                    result_all[[as.character(s)]]$caviar = rbind(result_all[[as.character(s)]]$caviar, cbind(pip, signals))
                  }
                  # FINEMAP
                  out_files = finemap_out[which(finemap_out$lm_less.n_signal == s & finemap_out$liter_data.dataset == d & finemap_out$fit_finemap.args == paste('--n-causal-max', s)),c("fit_finemap.output.file"),drop=FALSE]
                  finemap = dscrutils::read_dsc(paste0(${dirname:r}, '/', out_files[1,1]))$posterior
                  pip = finemap[[r]]$snp
                  pip = pip[order(as.numeric(pip$snp)),]$snp_prob
                  if (is.null(result_all[[as.character(s)]]$finemap)) {
                    result_all[[as.character(s)]]$finemap = cbind(pip, signals)
                  } else {
                    result_all[[as.character(s)]]$finemap = rbind(result_all[[as.character(s)]]$finemap, cbind(pip, signals))
                  }  
                }
            }
        }
    }
  
    roc_data = function(d1, cutoff = c(${pip_cutoff}, 1), connect_org = TRUE) {
        grid = 500
        ttv = seq(1:grid)/grid
        ttv = ttv[which(ttv>=cutoff[1] & ttv<=cutoff[2])]
        # see SuSiE-Manuscript issue 2
        d1 = d1[order(d1[,1]), ]
        end = tail(d1[which(d1[,2] == 0),][,1],1)
        ttv = c(ttv[-length(ttv)], min(ttv[length(ttv)], end))
        # end of issue 2
        rst1 = t(sapply(ttv, function(x) c(sum(d1[,2][d1[,1]>=x]), length(d1[,2][d1[,1]>=x]))))
        rst1 = cbind(rst1, sum(d1[,2]))
        if (connect_org) {
            # connect to origin
            last_row = tail(rst1, 1)
            rst1 = rbind(rst1, c(last_row[1], last_row[2]-1, last_row[3]), c(0.001,0.001,last_row[3]))
        }
        rst1 = as.data.frame(rst1)
        colnames(rst1) = c('true_positive', 'total_positive', 'total_signal')
        if (connect_org) {
            rst2 = as.data.frame(cbind(rst1$true_positive / rst1$total_positive, rst1$true_positive / rst1$total_signal,  c(ttv, ttv[length(ttv)], 1)))
        } else {
            rst2 = as.data.frame(cbind(rst1$true_positive / rst1$total_positive, rst1$true_positive / rst1$total_signal,  ttv))
        }
        colnames(rst2) = c('Precision', 'Recall', 'Threshold')
        return(list(counts = rst1, rates = rst2))
    }
    saveRDS(result, "${_output[0]:n}.data.rds")
    saveRDS(result_all, "${_output[1]:n}.data.rds")
    print("Computing ROC data ...")
    susie = roc_data(do.call(rbind, lapply(1:length(result), function(i) result[[i]]$susie)))
    dap = roc_data(do.call(rbind, lapply(1:length(result), function(i) result[[i]]$dap)), cutoff = c(${pip_cutoff}, 0.9999), connect_org = FALSE) # DAP output format has numerical artifect
    saveRDS(list(data = result, susie = susie, dap = dap), ${_output[0]:r})
    #
    susie = roc_data(do.call(rbind, lapply(1:length(result_all), function(i) result_all[[i]]$susie)))
    dap = roc_data(do.call(rbind, lapply(1:length(result_all), function(i) result_all[[i]]$dap)), cutoff = c(${pip_cutoff}, 0.9999), connect_org = FALSE)
    caviar = roc_data(do.call(rbind, lapply(1:length(result_all), function(i) result_all[[i]]$caviar)))
    finemap = roc_data(do.call(rbind, lapply(1:length(result_all), function(i) result_all[[i]]$finemap)))
    saveRDS(list(data = result_all, susie = susie, dap = dap, finemap=finemap, caviar=caviar), ${_output[1]:r})
In [ ]:
[roc_3]
parameter: chunks = 0
parameter: smooth = 'FALSE'
parameter: opt1 = "lwd = 2, xlim = c(0,0.2), ylim = c(0,0.2)"
parameter: opt2 = "lwd = 2, xlim = c(0,0.3), ylim = c(0,0.3)"
output: f'{dirname}/ROC_{date}_prior_{fmtP(susie_prior[0])}.png'
R: expand = '${ }'
    colors = c('#A60628', '#7A68A6', '#348ABD', '#467821', '#FF0000', '#188487', '#E2A233',
                  '#A9A9A9', '#000000', '#FF00FF', '#FFD700', '#ADFF2F', '#00FFFF')
    d2 = readRDS(${_input[2*len(null_weight)-2]:r})
    d1 = readRDS(${_input[0]:r})
    d4 = readRDS(${_input[2*len(null_weight)-1]:r})
    d3 = readRDS(${_input[1]:r})
    library(scam)
    create_chunks = function(item, n) {
      splitted = suppressWarnings(split(item, 1:n))
      return(c(splitted[[1]], splitted[[length(splitted)]][length(splitted[[length(splitted)]])]))
    }
    make_smooth = function(x,y,subset=${chunks}, smooth = ${smooth}) {
      if (smooth) {
          if (subset < length(x) && subset > 0) {
              x = create_chunks(x, subset)
              y = create_chunks(y, subset)
          }
          dat = data.frame(cbind(x,y))
          colnames(dat) = c('x','y')
          y=predict(scam(y ~ s(x, bs = "mpi"), data = dat))
      }
      return(list(x=x,y=y))
    }
    add_text = function(thresholds, x, y, threshold, color, delta = 0.015) {
        idx = which(thresholds == threshold)
        text(x[idx] - delta, y[idx], labels = threshold, col = color)
        points(x[idx],y[idx])
    }
    pdf("${_output:n}.part1.pdf", width=5, height=5, pointsize=15)
    yy = make_smooth(1 - d1$susie$rates$Precision, d1$susie$rates$Recall)
    plot(yy$x, yy$y, t="l", col=colors[1], ylab = "power", xlab ="FDR", main = "1 ~ 5 effect variables", bty='l', ${opt1})
    add_text(d1$susie$rates$Threshold, yy$x, yy$y, 0.9, colors[1])
    add_text(d1$susie$rates$Threshold, yy$x, yy$y, 0.95, colors[1])
    # lines(1 - d2$susie$rates$Precision, d2$susie$rates$Recall, col=colors[2])
    yy = make_smooth(1 - d1$dap$rates$Precision, d1$dap$rates$Recall)
    lines(yy$x, yy$y, col=colors[3], ${opt1})
    add_text(d1$dap$rates$Threshold, yy$x, yy$y, 0.9, colors[3], -0.015)
    add_text(d1$dap$rates$Threshold, yy$x, yy$y, 0.95, colors[3], -0.015)
    # legend("bottomright", legend=c("SuSiE", "SuSiE (penalized)", "DAP-G"),
    legend("bottomright", legend=c("SuSiE", "DAP-G"),
       col=colors[c(1,3)], lty=c(1,1,1), cex=0.8)
    dev.off()
    system(paste0("convert -density 120 ", ${_output:nr}, '.part1.pdf', " ", ${_output:nr}, '.part1.png'))
    pdf("${_output:n}.part2.pdf", width=5, height=5, pointsize=15)
    yy = make_smooth(1 - d3$susie$rates$Precision, d3$susie$rates$Recall)
    plot(yy$x, yy$y, t="l", 
     col=colors[1], ylab = "power", xlab ="FDR", 
     main = "1 ~ 3 effect variables", bty='l', ${opt2})
    yy = make_smooth(1 - d3$dap$rates$Precision, d3$dap$rates$Recall)
    lines(yy$x, yy$y,col=colors[3], ${opt2})
    yy = make_smooth(1 - d3$caviar$rates$Precision, d3$caviar$rates$Recall)
    lines(yy$x, yy$y,col=colors[4], ${opt2})
    yy = make_smooth(1 - d3$finemap$rates$Precision, d3$finemap$rates$Recall)
    lines(yy$x, yy$y,col=colors[7], ${opt2})
    legend("bottomright", legend=c("SuSiE", "DAP-G", "CAVIAR", "FINEMAP"),
       col=c(colors[1], colors[3], colors[4], colors[7]), lty=c(1,1,1), cex=0.8)
    dev.off()
    system(paste0("convert -density 120 ", ${_output:nr}, '.part2.pdf', " ", ${_output:nr}, '.part2.png'))
  
bash: expand = True
  convert +append {_output:n}.part1.png {_output:n}.part2.png {_output:n}.png

Copyright © 2016-2020 Gao Wang et al at Stephens Lab, University of Chicago