Implementing univariate simple OLS for multiple Y and multiple X without an explicit double loop, using Einstein summation (numpy.einsum) in Python instead.
The idea is to vectorize the calculation as much as possible. For OLS with $N$ samples, looping over $J$ effects and $R$ conditions would compute, for each pair $(j, r)$:

$$\hat{\beta}_{jr} = (x_j^Tx_j)^{-1}x_j^Ty_r$$

$$s^2(\hat{\beta}_{jr}) = \frac{(x_j^Tx_j)^{-1}(y_r - x_j\hat{\beta}_{jr})^T(y_r - x_j\hat{\beta}_{jr})}{N-2}$$

The computation can be vectorized because, once the columns of $X$ and $Y$ are centered, every numerator $x_j^Ty_r$ is an entry of the single matrix product $X^TY$, and every denominator $x_j^Tx_j$ is a column-wise sum of squares of $X$.
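In matrix form (just restating the two formulas above for all pairs at once, with $D = \operatorname{diag}(x_1^Tx_1, \dots, x_J^Tx_J)$ and column-centered $X$, $Y$):

$$\hat{B} = D^{-1}X^TY \in \mathbb{R}^{J\times R}, \qquad s^2(\hat{B}_{jr}) = \frac{\|y_r - x_j\hat{B}_{jr}\|^2}{(N-2)\, x_j^Tx_j}$$

A single matrix product plus element-wise operations therefore replaces the double loop; this is what the einsum code below implements.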
import numpy as np
N = 10   # number of samples
R = 2    # number of conditions (columns of Y)
J = 3    # number of effects (columns of X)
X = np.random.rand(N, J)
Y = np.random.rand(N, R)
from scipy.stats import linregress
from sklearn.linear_model import LinearRegression

def univariate_simple_regression(X, y, Z=None):
    if Z is not None:
        # Residualize y against covariates Z first
        model = LinearRegression()
        model.fit(Z, y)
        y = y - model.predict(Z)
    # linregress returns (slope, intercept, rvalue, pvalue, stderr); keep slope, intercept, stderr
    return np.vstack([linregress(x, y) for x in X.T])[:,[0,1,4]]
def get_summary_stats(X, Y):
    # Loop-based reference: one linregress fit per (effect, condition) pair
    B = np.zeros((X.shape[1], Y.shape[1]))
    S = np.zeros((X.shape[1], Y.shape[1]))
    for r, y in enumerate(Y.T):
        B[:,r], S[:,r] = univariate_simple_regression(X, y)[:,[0,2]].T
    return B, S
res = get_summary_stats(X, Y)
X -= np.mean(X, axis=0, keepdims=True)   # center columns so the simple regressions need no intercept
Y -= np.mean(Y, axis=0, keepdims=True)
XtY = X.T @ Y                            # all numerators x_j'y_r at once, shape (J, R)
XtX_vec = np.einsum('ji,ji->i', X, X)    # denominators x_j'x_j (column-wise sums of squares), shape (J,)
Bhat = XtY / XtX_vec[:,np.newaxis]       # J x R matrix of slopes
Bhat
res[0]
Xr = Y - np.einsum('ij,jk->jik', X, Bhat)   # residuals for every (effect, condition) pair, shape (J, N, R)
Re = np.einsum('ijk,ijk->ik', Xr, Xr)       # residual sums of squares, shape (J, R)
S = np.sqrt(Re / XtX_vec[:,np.newaxis] / (X.shape[0] - 2))   # standard errors of the slopes
S
res[1]
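As a quick sanity check (added here, not an original cell), the vectorized estimates and standard errors should match the loop-based reference up to floating point error:

np.allclose(Bhat, res[0]), np.allclose(S, res[1])   # expected: (True, True)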
from sklearn.linear_model import LinearRegression
def get_summary_stats(X, Y, Z=None):
    '''Vectorized univariate simple OLS; note X and Y are centered (and Y residualized) in place.'''
    if Z is not None:
        # Remove covariate effects from each condition before the univariate scans
        model = LinearRegression()
        for j in range(Y.shape[1]):
            model.fit(Z, Y[:,j])
            Y[:,j] = Y[:,j] - model.predict(Z)
    # Center columns so the simple regressions need no intercept
    X -= np.mean(X, axis=0, keepdims=True)
    Y -= np.mean(Y, axis=0, keepdims=True)
    # Compute Bhat: numerators x_j'y_r from X.T @ Y, denominators x_j'x_j from einsum
    XtY = X.T @ Y
    XtX_vec = np.einsum('ji,ji->i', X, X)
    Bhat = XtY / XtX_vec[:,np.newaxis]
    # Residuals for every (effect, condition) pair, shape (J, N, R)
    Xr = Y - np.einsum('ij,jk->jik', X, Bhat)
    # Residual sums of squares, shape (J, R), then standard errors
    Re = np.einsum('ijk,ijk->ik', Xr, Xr)
    S = np.sqrt(Re / XtX_vec[:,np.newaxis] / (X.shape[0] - 2))
    return Bhat, S
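A quick usage sketch (the covariate matrix Zc below is made up for illustration; copies of X and Y are passed because the function centers and residualizes its arguments in place):

Zc = np.random.rand(N, 2)                        # hypothetical: 2 nuisance covariates
Bhat2, S2 = get_summary_stats(X.copy(), Y.copy(), Zc)
Bhat2.shape, S2.shape                            # (J, R) each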
N = 10
R = 2
J = 3
Z = np.random.rand(N, J)
Y = np.random.rand(N, R)
from sklearn.linear_model import LinearRegression
model = LinearRegression()
res = np.zeros(Y.shape)
for j in range(Y.shape[1]):
    model.fit(Z, Y[:,j])
    res[:,j] = Y[:,j] - model.predict(Z)
res
Y -= np.mean(Y, axis=0, keepdims=True)
Z -= np.mean(Z, axis=0, keepdims=True)
Y - Z @ (np.linalg.inv(Z.T @ Z) @ Z.T @ Y)
Z
Y
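A quick check, added as a sketch: since sklearn's LinearRegression fits an intercept, the residuals collected in res above should coincide with projecting the centered Y off the centered Z:

np.allclose(res, Y - Z @ (np.linalg.inv(Z.T @ Z) @ Z.T @ Y))   # expected: True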
from scipy.stats import linregress
from sklearn.linear_model import LinearRegression
class RegressionData:
    def __init__(self, X = None, Y = None, Z = None, B = None, S = None):
        self.X = X
        self.Y = Y
        self.Z = Z
        self.B = B
        self.S = S
        self.lik = None
        if (self.X is not None and self.Y is not None) and (self.B is None and self.S is None):
            self.get_summary_stats()

    def get_summary_stats(self):
        '''
        perform univariate regression
        FIXME: it is slower than lapply + .lm.fit in R
        FIXME: this faster implementation is on my watch list:
        https://github.com/ajferraro/fastreg
        '''
        self.B = np.zeros((self.X.shape[1], self.Y.shape[1]))
        self.S = np.zeros((self.X.shape[1], self.Y.shape[1]))
        for r, y in enumerate(self.Y.T):
            self.B[:,r], self.S[:,r] = self.univariate_simple_regression(self.X, y)[:,[0,2]].T

    @staticmethod
    def univariate_simple_regression(X, y, Z=None):
        if Z is not None:
            model = LinearRegression()
            model.fit(Z, y)
            y = y - model.predict(Z)
        return np.vstack([linregress(x, y) for x in X.T])[:,[0,1,4]]

    def __str__(self):
        l = dir(self)
        d = self.__dict__
        from pprint import pformat
        return pformat(d, indent = 4)
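A minimal usage sketch (not part of the original class): passing X and Y to the constructor triggers the univariate regressions right away, and printing the object shows the stored attributes.

dat = RegressionData(X=np.random.rand(N, J), Y=np.random.rand(N, R))
dat.B.shape, dat.S.shape     # (J, R) slopes and standard errors
print(dat)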