Multivariate Bayesian variable selection regression

Computing vectorized OLS

Implementing univariate simple OLS for multiple Y and multiple X without loops, using Einstein summation (numpy.einsum) in Python.

The idea is to vectorize the calculation as much as possible. For OLS with $N$ samples, looping over $J$ effects and $R$ conditions computes:

$$\hat{\beta}_{jr} = (x_j^Tx_j)^{-1}x_j^Ty_r$$

$$s^2(\hat{\beta}_{jr}) = \frac{(x_j^Tx_j)^{-1} (y_r - x_j\hat{\beta}_{jr})^T(y_r - x_j\hat{\beta}_{jr})}{N-2}$$

The computation can be vectorized, because:

  • $x_j^Ty_r$ is element $(j,r)$ of the matrix $X^TY_{J\times R}$, which can be computed up-front
  • The loop over $j$ for $x_j^Tx_j$ can be replaced by Einstein summation notation in numpy
  • $(y_r - x_j\hat{\beta}_{jr})$ is an $N$-vector; the loop over $r$ can again be replaced by Einstein summation
  • This calculation still has to be carried out for every $j$; via Einstein summation it expands into a 3D array, again without an explicit loop (see the sketch below)
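
To make the einsum building blocks concrete, here is a minimal self-contained sketch (with its own toy data, independent of the cells below) showing that np.einsum('ji,ji->i', X, X) returns every $x_j^Tx_j$ at once, i.e. the diagonal of $X^TX$:

In [ ]:
import numpy as np
X = np.random.rand(10, 3)   # N=10 samples, J=3 effects
Y = np.random.rand(10, 2)   # R=2 conditions
# Entry (j, r) of X^T Y is x_j^T y_r, all computed at once
XtY = X.T @ Y
# 'ji,ji->i' sums over samples within each column: x_j^T x_j = diag(X^T X)
assert np.allclose(np.einsum('ji,ji->i', X, X), np.diag(X.T @ X))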

Implementation

In [13]:
import numpy as np
N = 10   # number of samples
R = 2    # number of conditions
J = 3    # number of effects
X = np.random.rand(N, J)
Y = np.random.rand(N, R)

Expected results

In [14]:
from scipy.stats import linregress
from sklearn.linear_model import LinearRegression

def univariate_simple_regression(X, y, Z=None):
    # Optionally residualize y on covariates Z first
    if Z is not None:
        model = LinearRegression()
        model.fit(Z, y)
        y = y - model.predict(Z)
    # linregress returns (slope, intercept, rvalue, pvalue, stderr);
    # keep slope, intercept and stderr
    return np.vstack([linregress(x, y) for x in X.T])[:,[0,1,4]]

def get_summary_stats(X, Y):
    B = np.zeros((X.shape[1], Y.shape[1]))
    S = np.zeros((X.shape[1], Y.shape[1]))
    for r, y in enumerate(Y.T):
        B[:,r], S[:,r] = univariate_simple_regression(X, y)[:,[0,2]].T
    return B, S

res = get_summary_stats(X, Y)

Compute $\hat{\beta}$

In [3]:
X -= np.mean(X, axis=0, keepdims=True)   # center X so no intercept is needed
Y -= np.mean(Y, axis=0, keepdims=True)   # center Y likewise
XtY = X.T @ Y                            # all x_j^T y_r, shape (J, R)
XtX_vec = np.einsum('ji,ji->i', X, X)    # all x_j^T x_j, shape (J,)
Bhat = XtY / XtX_vec[:,np.newaxis]
In [4]:
Bhat
Out[4]:
array([[ 0.26552071, -0.46715414],
       [-0.24884064, -0.10294156],
       [ 0.0422749 , -0.05770318]])
In [5]:
res[0]
Out[5]:
array([[ 0.26552071, -0.46715414],
       [-0.24884064, -0.10294156],
       [ 0.0422749 , -0.05770318]])
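
The visual comparison above can also be made programmatic (an added check, not one of the original cells):

In [ ]:
# Vectorized estimates agree with the loop-based reference
assert np.allclose(Bhat, res[0])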

Compute $s(\hat{\beta})$

In [9]:
# Xr[j] = Y - outer(x_j, Bhat[j]): residuals for effect j, shape (J, N, R)
Xr = Y - np.einsum('ij,jk->jik', X, Bhat)
# Residual sums of squares, summing over samples: shape (J, R)
Re = np.einsum('ijk,ijk->ik', Xr, Xr)
S = np.sqrt(Re / XtX_vec[:,np.newaxis] / (X.shape[0] - 2))
In [10]:
S
Out[10]:
array([[ 0.48999972,  0.41774897],
       [ 0.29072673,  0.27105792],
       [ 0.39868505,  0.35864399]])
In [11]:
res[1]
Out[11]:
array([[ 0.48999972,  0.41774897],
       [ 0.29072673,  0.27105792],
       [ 0.39868505,  0.35864399]])
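
The 3D einsum can be spot-checked against an explicit loop (a sketch added for clarity; each slice Xr[j] should be the residual matrix for effect $j$):

In [ ]:
# Each slice Xr[j] equals Y minus the fitted values for effect j
for j in range(X.shape[1]):
    assert np.allclose(Xr[j], Y - np.outer(X[:, j], Bhat[j]))
# The vectorized standard errors also agree with the reference
assert np.allclose(S, res[1])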

Putting it all together

In [17]:
from sklearn.linear_model import LinearRegression

def get_summary_stats(X, Y, Z=None):
    # Remove covariates Z from each column of Y
    if Z is not None:
        model = LinearRegression()
        for j in range(Y.shape[1]):
            model.fit(Z, Y[:,j])
            Y[:,j] = Y[:,j] - model.predict(Z)
    # Compute Bhat
    X -= np.mean(X, axis=0, keepdims=True)
    Y -= np.mean(Y, axis=0, keepdims=True)
    XtY = X.T @ Y
    XtX_vec = np.einsum('ji,ji->i', X, X)
    Bhat = XtY / XtX_vec[:,np.newaxis]
    # Compute standard errors
    Xr = Y - np.einsum('ij,jk->jik', X, Bhat)
    Re = np.einsum('ijk,ijk->ik', Xr, Xr)
    S = np.sqrt(Re / XtX_vec[:,np.newaxis] / (X.shape[0] - 2))
    return Bhat, S
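
As a quick check of the combined function (an added cell; copies are passed because the function centers its arguments in place):

In [ ]:
Bhat2, S2 = get_summary_stats(X.copy(), Y.copy())
assert np.allclose(Bhat2, res[0]) and np.allclose(S2, res[1])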

Remove covariate

In [23]:
N = 10
R = 2
J = 3
Z = np.random.rand(N, J)
Y = np.random.rand(N, R)
In [24]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
res = np.zeros(Y.shape)
# Residualize each column of Y on the covariates Z
for j in range(Y.shape[1]):
    model.fit(Z, Y[:,j])
    res[:,j] = Y[:,j] - model.predict(Z)
In [25]:
res
Out[25]:
array([[ 0.33024609,  0.28229446],
       [ 0.1279729 , -0.01895704],
       [-0.29096583, -0.41889803],
       [ 0.14170275,  0.08245545],
       [ 0.02337857, -0.37046396],
       [-0.25297135,  0.01756612],
       [-0.17662396,  0.19416819],
       [ 0.04944676,  0.34255115],
       [ 0.21680997, -0.18438358],
       [-0.16899589,  0.07366723]])
The same residuals can be obtained in one matrix operation via the projection $Y - Z(Z^TZ)^{-1}Z^TY$; centering $Y$ and $Z$ first absorbs the intercept that LinearRegression fits:

In [29]:
Y -= np.mean(Y, axis=0, keepdims=True)
Z -= np.mean(Z, axis=0, keepdims=True)
Y - Z @ (np.linalg.inv(Z.T @ Z) @ Z.T @ Y)
Out[29]:
array([[ 0.33024609,  0.28229446],
       [ 0.1279729 , -0.01895704],
       [-0.29096583, -0.41889803],
       [ 0.14170275,  0.08245545],
       [ 0.02337857, -0.37046396],
       [-0.25297135,  0.01756612],
       [-0.17662396,  0.19416819],
       [ 0.04944676,  0.34255115],
       [ 0.21680997, -0.18438358],
       [-0.16899589,  0.07366723]])
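
The agreement can also be confirmed programmatically (an added check; res holds the sklearn residuals from the loop above):

In [ ]:
assert np.allclose(res, Y - Z @ (np.linalg.inv(Z.T @ Z) @ Z.T @ Y))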
In [27]:
Z
Out[27]:
array([[-0.18256478, -0.26087364,  0.39878093],
       [-0.1790025 ,  0.30161771, -0.50320066],
       [ 0.06599051, -0.20726922,  0.37074187],
       [ 0.11979348,  0.19780406, -0.52429572],
       [ 0.00290959,  0.23833352,  0.11410838],
       [-0.07929431, -0.33833572, -0.46023757],
       [ 0.23755258,  0.14376681, -0.0072283 ],
       [ 0.2278818 , -0.07632593,  0.35800175],
       [ 0.15537815, -0.26583764,  0.05081791],
       [-0.36864452,  0.26712006,  0.2025114 ]])
In [28]:
Y
Out[28]:
array([[ 0.44137823,  0.27394396],
       [ 0.11838591, -0.11779373],
       [-0.28199186, -0.36668192],
       [ 0.16912463,  0.08251116],
       [-0.26735562, -0.4059477 ],
       [ 0.32251763,  0.04559095],
       [-0.37493192,  0.23784026],
       [-0.10546356,  0.41939242],
       [ 0.41515563, -0.09988743],
       [-0.43681908, -0.06896796]])
In [ ]:
import numpy as np
from scipy.stats import linregress
from sklearn.linear_model import LinearRegression

class RegressionData:
    def __init__(self, X = None, Y = None, Z = None, B = None, S = None):
        self.X = X
        self.Y = Y
        self.Z = Z
        self.B = B
        self.S = S
        self.lik = None
        # Compute summary statistics on construction if only data are given
        if (self.X is not None and self.Y is not None) and (self.B is None and self.S is None):
            self.get_summary_stats()

    def get_summary_stats(self):
        '''
        perform univariate regression
        FIXME: it is slower than lapply + .lm.fit in R
        FIXME: this faster implementation is on my watch list:
        https://github.com/ajferraro/fastreg
        '''
        self.B = np.zeros((self.X.shape[1], self.Y.shape[1]))
        self.S = np.zeros((self.X.shape[1], self.Y.shape[1]))
        for r, y in enumerate(self.Y.T):
            self.B[:,r], self.S[:,r] = self.univariate_simple_regression(self.X, y)[:,[0,2]].T

    @staticmethod
    def univariate_simple_regression(X, y, Z=None):
        # Optionally residualize y on covariates Z first
        if Z is not None:
            model = LinearRegression()
            model.fit(Z, y)
            y = y - model.predict(Z)
        # linregress returns (slope, intercept, rvalue, pvalue, stderr)
        return np.vstack([linregress(x, y) for x in X.T])[:,[0,1,4]]

    def __str__(self):
        from pprint import pformat
        return pformat(self.__dict__, indent = 4)
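
A usage sketch (hypothetical, reusing the toy X and Y from the cells above):

In [ ]:
data = RegressionData(X=X, Y=Y)   # triggers get_summary_stats()
print(data.B)                     # univariate effect estimates, shape (J, R)
print(data.S)                     # corresponding standard errors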

Copyright © 2016-2020 Gao Wang et al at Stephens Lab, University of Chicago