Last active
April 6, 2024 15:52
-
-
Save quasiben/f7f3099b3b250101d15dd4b3a2fda26e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import cupy | |
import numpy as np | |
import xarray as xr | |
import dask.array as da | |
from dask.array import stats | |
import fsspec | |
n = 10000 # Number of variants (i.e. genomic locations) | |
m = 100000 # Number of individuals (i.e. people) | |
c = 3 # Number of covariates (i.e. confounders) | |
# Representative settings for single (small) UK Biobank chromosome: | |
# n, m, c = 141910, 365941, 25 | |
path = f"file://sim_ds_{n}_{m}_{c}.zarr" | |
path | |
fs = fsspec.filesystem('file') | |
if not fs.exists(path): | |
rs = da.random.RandomState(0) | |
XL, BL = rs.randint(0, 128, size=(n, m), chunks=(5216, 5792)), da.array([1] + [0] * (m - 1)) | |
XC, BC = rs.normal(size=(m, c)), rs.normal(size=(c,)) | |
Y = (XL * BL).sum(axis=0) + XC @ BC + rs.normal(scale=.001, size=m) | |
ds = xr.Dataset(dict( | |
# This is a proxy for discretized allele dosages (between 0 and 2) | |
XL=(('variants', 'samples'), (2 * XL / 127).astype('f2')), | |
# This value represents covariates for samples, e.g. age, sex, ancestry, etc. | |
XC=(('samples', 'covariates'), XC.astype('f4')), | |
# This is the outcome on which all variant data will be regressed separately | |
Y=(('samples', 'outcomes'), Y[:, np.newaxis].astype('f4')), | |
)) | |
print(f'Saving simulated data to {path}') | |
ds.to_zarr(fsspec.get_mapper(path), mode='w', consolidated=True) | |
ds = xr.open_zarr(fsspec.get_mapper(path), consolidated=True) | |
def gwas(XL, XC, Y): | |
# Add intercept | |
ones = da.ones_like(cupy.ones((XC.shape[0], 1))) | |
XC = da.concatenate([ones, XC], axis=1) | |
# Rechunk along short axes | |
XC = XC.rechunk((None, -1)) | |
Y = Y.rechunk((None, -1)) | |
dof = Y.shape[0] - XC.shape[1] - 1 | |
# Apply orthogonal projection to eliminate core covariates | |
XLP = XL - XC @ da.linalg.lstsq(XC, XL)[0] | |
YP = Y - XC @ da.linalg.lstsq(XC, Y)[0] | |
# Estimate coefficients for each loop covariate | |
XLPS = (XLP ** 2).sum(axis=0, keepdims=True).T | |
B = (XLP.T @ YP) / XLPS | |
# Compute residuals for each loop covariate and outcome separately | |
YR = YP[:, np.newaxis, :] - XLP[..., np.newaxis] * B[np.newaxis, ...] | |
RSS = (YR ** 2).sum(axis=0) | |
# Get t-statistics for coefficient estimates and match to p-values | |
T = B / np.sqrt(RSS / dof / XLPS) | |
T = T.map_blocks(cupy.asnumpy) | |
P = da.map_blocks( | |
lambda t: 2 * stats.distributions.t.sf(np.abs(t), dof), T, dtype="float64" | |
) | |
B = B.map_blocks(cupy.asnumpy) | |
return xr.Dataset(dict( | |
beta=(('variants','outcomes'), B), | |
pval=(('variants','outcomes'), P) | |
)) | |
# Define the GWAS regressions | |
dsr = gwas( | |
# Note: This (the largest) array needs to be rechunked due to scalability | |
# issues with da.matmul, specifically https://github.com/dask/dask/pull/6924. | |
# See here for more details: | |
# https://github.com/pystatgen/sgkit/issues/390#issuecomment-730660134 | |
ds.XL.data.rechunk((652, 5792)).T.astype('f4').map_blocks(cupy.asarray), | |
ds.XC.data.map_blocks(cupy.asarray), | |
ds.Y.data.map_blocks(cupy.asarray) | |
) | |
# Compute and save betas/p-values | |
output_path = f"file://sim_res-optimization-gpu_{n}_{m}_{c}.zarr" | |
#dsr.to_zarr(fsspec.get_mapper(output_path), mode='w', consolidated=True) | |
print(f'Results saved to {output_path}') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment