Here I take a gene from 2 tissues, with covariates.
The genotypes matrix should have complete data after imputation. However not all samples overlap in both tissues, making expression matrix. For the time being I'll make 2 versions: one version with missing expression matrix as is, the other version fills in the missing data using mean imputation (use mean for expression in the tissue in question).
The 2 tissues of choice are lung and thyroid (i do not trust whole blood).
The gene I take is FMO2 (ENSG00000094963 on chr1). Separate data-sets for Lung and Thyroid are extracted using this notebook. The data are:
~/Documents/GTExV8/Thyroid.FMO2.pm1Mb.RDS
~/Documents/GTExV8/Lung.FMO2.pm1Mb.RDS
Also I remove covariates beforhand -- it is difficult to consider covariates for multiple Y because each Y may have a different set of covariates. Removing them beforehand might be the only way to go.
dat1 = readRDS('~/Documents/GTExV8/Thyroid.FMO2.pm1Mb.RDS')
dat2 = readRDS('~/Documents/GTExV8/Lung.FMO2.pm1Mb.RDS')
str(dat1$X)
str(dat2$X)
str(dat1$y)
str(dat2$y)
str(dat1$Z)
str(dat2$Z)
all_samples = union(rownames(dat1$X), rownames(dat2$X))
print(length(all_samples))
dat1$y = .lm.fit(dat1$Z, dat1$y)$residuals
dat2$y = .lm.fit(dat2$Z, dat2$y)$residuals
y1 = data.frame(matrix(dat1$y, length(dat1$y), 1))
y2 = data.frame(matrix(dat2$y, length(dat2$y), 1))
rownames(y1) = rownames(dat1$X)
rownames(y2) = rownames(dat2$X)
colnames(y1) = 'Thyroid'
colnames(y2) = 'Lung'
y1$rows = rownames(y1)
y2$rows = rownames(y2)
y <-merge(y1,y2, by='rows', all=T)
rownames(y) = y$rows
y = y[order(rownames(y)),-1]
head(y)
sum(is.na(y))
X2 = subset(dat2$X, !(rownames(dat2$X) %in% rownames(dat1$X)))
X = rbind(dat1$X, X2)
X = X[order(rownames(X)),]
dim(X)
y_filled = y
for(i in 1:ncol(y)){
y_filled[is.na(y[,i]), i] <- mean(y[,i], na.rm = TRUE)
}
saveRDS(list(Y=y_filled,X=X), '~/Documents/GTExV8/Thyroid.Lung.FMO2.filled.rds')
saveRDS(list(Y=y,X=X), '~/Documents/GTExV8/Thyroid.Lung.FMO2.rds')
autoselect.mixsd = function(betahat,sebetahat,mult = sqrt(2)){
# To avoid exact measure causing (usually by mistake)
sebetahat = sebetahat[sebetahat!=0]
# so that the minimum is small compared with measurement precision
sigmaamin = min(sebetahat)/10
if (all(betahat^2 <= sebetahat^2)) {
# to deal with the occassional odd case where this could happen; 8 is arbitrary
sigmaamax = 8*sigmaamin
} else {
# this computes a rough largest value you'd want to use,
# based on idea that sigmaamax^2 + sebetahat^2 should be at least betahat^2
sigmaamax = 2*sqrt(max(betahat^2-sebetahat^2))
}
if(mult==0){
return(c(0,sigmaamax/2))
} else {
npoint = ceiling(log2(sigmaamax/sigmaamin)/log2(mult))
return(mult^((-npoint):0) * sigmaamax)
}
}
univariate_regression = function(X, y, Z = NULL){
if (!is.null(Z)) {
y = .lm.fit(Z, y)$residuals
}
calc_stderr = function(X, residuals) { sqrt(diag(sum(residuals^2) / (nrow(X) - 2) * chol2inv(chol(t(X) %*% X)))) }
output = do.call(rbind,
lapply(c(1:ncol(X)), function(i) {
g = .lm.fit(cbind(1, X[,i]), y)
return(c(coef(g)[2], calc_stderr(cbind(1, X[,i]), g$residuals)[2]))
})
)
return(list(betahat = output[,1], sebetahat = output[,2],
residuals = y))
}
lasso_reorder = function(X, y) {
# perform lasso regression and reorder regressors by "importance"
fit.glmnet <- glmnet::glmnet(X, y, intercept = F)
beta_path = coef(fit.glmnet)[-1,]
K = dim(beta_path)[2]
path_order = c()
for (k in 1:K) {
crt_path = which(beta_path[,k] != 0)
if (length(crt_path) != 0 & length(path_order) == 0) {
path_order = c(path_order, crt_path)
} else if(length(crt_path) != 0) {
path_order = c(path_order, crt_path[-which(crt_path %in% path_order)] )
}
}
path_order = unname(path_order)
index_order = c(path_order, seq(1,dim(beta_path)[1])[-path_order])
return(index_order)
}
X = X[,which(colSums(X)!=0)]
initial = univariate_regression(X, y_filled$Thyroid)
mixsd = autoselect.mixsd(initial$betahat, initial$sebetahat)
mu_zero = matrix(0, ncol = length(mixsd)+1, nrow = ncol(X))
alpha_zero = matrix(1/ncol(X), ncol = length(mixsd)+1,nrow = ncol(X))
alpha_zero[,1] = 1 - length(mixsd) / ncol(X)
index_order = lasso_reorder(X, initial$residuals)
logdata = capture.output({ fit = varbvs::varbvsmix(X[, index_order], NULL, y_filled$Thyroid,
sa = c(0,mixsd^2),
mu = mu_zero,
alpha = alpha_zero,
verbose = F) })
betahat = rowSums(fit$alpha * fit$mu)
names(betahat) = colnames(X)
mr_ash_out = list(betahat = betahat, fit = fit)
sort(fit$lfsr)[1:10]
sort(betahat, decreasing = T)[1:10]
lasso_res = glmnet::glmnet(X, y_filled$Thyroid, intercept = F)
plot(coef(lasso_res, s = 0.1))
initial = univariate_regression(X, y_filled$Lung)
mixsd = autoselect.mixsd(initial$betahat, initial$sebetahat)
mu_zero = matrix(0, ncol = length(mixsd)+1, nrow = ncol(X))
alpha_zero = matrix(1/ncol(X), ncol = length(mixsd)+1,nrow = ncol(X))
alpha_zero[,1] = 1 - length(mixsd) / ncol(X)
index_order = lasso_reorder(X, initial$residuals)
logdata = capture.output({ fit = varbvs::varbvsmix(X[, index_order], NULL, y_filled$Thyroid,
sa = c(0,mixsd^2),
mu = mu_zero,
alpha = alpha_zero,
verbose = F) })
betahat = rowSums(fit$alpha * fit$mu)
names(betahat) = colnames(X)
mr_ash_out = list(betahat = betahat, fit = fit)
sort(fit$lfsr)[1:10]
sort(betahat, decreasing = T)[1:10]
lasso_res = glmnet::glmnet(X, y_filled$Lung, intercept = F)
plot(coef(lasso_res, s = 0.05))