Last active
July 13, 2021 18:59
-
-
Save ce-carey/6480d6544f132829d9579b2a1f1455b4 to your computer and use it in GitHub Desktop.
UKB Phenotypic Correlation Matrices
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## UKB Phenotypic Correlation Matrices | |
## | |
## Generates a phenotypic correlation matrix for all GWAS'ed | |
## traits, residualized for all GWAS covariates. Also outputs | |
## a matrix of pairwise N's. | |
## | |
## Created by Caitlin E Carey ([email protected]) | |
## | |
## Last updated 9/27/2019 | |
import pandas as pd | |
import numpy as np | |
import statsmodels.formula.api as sm | |
import functools | |
import sys | |
# flush the buffer to std.out | |
print = functools.partial(print,flush=True) | |
# grab commandline input specifying sex to be analyzed | |
biosex=sys.argv[1] | |
# load the most current manifest file | |
manifest = pd.read_table('/stanley/robinson/ccarey/UKBB/ukb_files/UKBBmegaGWAS_manifest_downloaded20190910.tsv') | |
# grab all non-covar GWAS phenotypes in manifest | |
phenolist = [x for x in manifest.loc[manifest.Sex==biosex,"Phenotype Code"].values.tolist() if pd.notnull(x) and x not in ["age","sex"]] | |
phenolist = np.unique(phenolist) | |
# grab all participants in GWAS sample remove those who subsequently withdrew | |
sampleIDs = [line.rstrip('\n') for line in open('/psych/genetics_data/projects/ukb31063/ukb31063.neale_gwas_samples.'+biosex+'.txt')][1:] | |
withdrawn = [line.rstrip('\n') for line in open('/psych/genetics_data/projects/ukb31063/ukb31063.withdrawn_samples.txt')][1:] | |
sampleIDs = [int(x) for x in sampleIDs if x not in withdrawn] | |
# load all GWAS covariates | |
covars = pd.read_table('/stanley/robinson/ccarey/UKBB/ukb_files/megaGWAS_covars/ukb31063.neale_gwas_covariates.'+biosex+'.tsv.bgz',index_col="s",compression="gzip") | |
# load all relevant phenotype files | |
phesant = pd.read_table('/psych/genetics_data/projects/ukb31063/ukb31063.PHESANT_January_2019.'+biosex+'.tsv.bgz',dtype=object,index_col="s",compression="gzip") | |
finngen = pd.read_table('/psych/genetics_data/projects/ukb31063/ukb31063.FINNGEN_phenotypes.'+biosex+'.tsv.bgz',index_col="s",compression="gzip") | |
icd10 = pd.read_table('/psych/genetics_data/projects/ukb31063/ukb31063.ICD10_phenotypes.'+biosex+'.tsv.bgz',index_col="s",compression="gzip") | |
biomarkers = pd.read_csv('/psych/genetics_data/ccarey/UKBB/ukb_files/biomarkers_megaGWAS/ukb31063.biomarkers_gwas.'+biosex+'.tsv',dtype=object,index_col="s") | |
# grab all GWAS'ed phenotypes | |
phesant_phenos = np.intersect1d(phesant.columns,phenolist) | |
finngen_phenos = np.intersect1d(finngen.columns,phenolist) | |
icd10_phenos = np.intersect1d(icd10.columns,phenolist) | |
biomarkers_phenos = np.intersect1d(biomarkers.columns,phenolist) | |
filephenos = np.concatenate((phesant_phenos,finngen_phenos,icd10_phenos,biomarkers_phenos)) | |
# extract data for all GWAS'ed phenotypes | |
phesant_extracted = phesant.loc[sampleIDs,phesant_phenos] | |
finngen_extracted = finngen.loc[sampleIDs,finngen_phenos] | |
icd10_extracted = icd10.loc[sampleIDs,icd10_phenos] | |
biomarkers_extracted = biomarkers.loc[sampleIDs,biomarkers_phenos] | |
manifest_phenos = pd.concat([phesant_extracted,finngen_extracted,icd10_extracted,biomarkers_extracted],axis=1) | |
# replace boolean strings with actual booleans | |
manifest_phenos.replace('TRUE',True,inplace=True) | |
manifest_phenos.replace('FALSE',False,inplace=True) | |
# function to residualize GWAS phenotypes by GWAS covars | |
def residualize(column): | |
temp = manifest_phenos[column].astype(float).to_frame().join(covars,how="left") | |
temp['y'] = temp[column] | |
model = sm.ols("y ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 + age + age_isFemale + age_squared + age_squared_isFemale + isFemale", data=temp,missing="drop").fit() | |
return(model.resid) | |
# residualize all phenotypes | |
manifest_phenos_resid = manifest_phenos.apply(lambda x: residualize(x.name)) | |
# add in GWAS'ed covariates | |
if biosex=="both_sexes": | |
isfemale_mod = sm.ols("isFemale.astype(int) ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 + age + age_squared", data=covars,missing="drop").fit() | |
manifest_phenos_resid["isFemale"]=isfemale_mod.resid | |
age_mod = sm.ols("age ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 + isFemale", data=covars,missing="drop").fit() | |
manifest_phenos_resid["age"]=age_mod.resid | |
else: | |
age_mod = sm.ols("age ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20", data=covars,missing="drop").fit() | |
manifest_phenos_resid["age"]=age_mod.resid | |
# generate residualized phenotypic correlation matrix | |
manifest_phenos_resid_corr = manifest_phenos_resid.corr() | |
print("correlation matrix generated") | |
# save resulting correlation matrix | |
manifest_phenos_resid_corr.to_csv('/psych/genetics_data/ccarey/UKBB/h2_corrmat/corrmat/'+biosex+'_resid_corrmat.csv') | |
print("saved corrmat to file") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment