|
import hail as hl |
|
import pickle |
|
import time |
|
import argparse |
|
|
|
def flip_text(base): |
|
""" |
|
:param StringExpression base: Expression of a single base |
|
:return: StringExpression of flipped base |
|
:rtype: StringExpression |
|
""" |
|
return (hl.switch(base) |
|
.when('A', 'T') |
|
.when('T', 'A') |
|
.when('C', 'G') |
|
.when('G', 'C') |
|
.default(base)) |
|
|
|
parser = argparse.ArgumentParser() |
|
parser.add_argument("--phenotype", help="name of the sumstat phenotype") |
|
args = parser.parse_args() |
|
|
|
######################################################################## |
|
### initialize |
|
generate_sumstats_table = True |
|
sumstats_table_location = 'gs://ukbb_prs/sumstats/keytables/ukb-'+phenotype+'-ldpred-sumstats-locus-allele-keyed.kt' |
|
generate_contig_row_dict = True |
|
contig_row_dict_location = 'gs://ukbb_prs/sumstats/keytables/contig_row_dict-'+phenotype |
|
|
|
output_location = 'gs://ukbb_prs/prs/UKB_'+phenotype+'_PRS.txt' |
|
|
|
contigs = {'0{}'.format(x):str(x) for x in range(1, 10)} |
|
|
|
bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen' |
|
|
|
|
|
# large block size because we read very little data (due to filtering & ignoring genotypes) |
|
hl.init(branching_factor=10, min_block_size=2000) |
|
|
|
######################################################################## |
|
### set up the sumstats table |
|
if (generate_sumstats_table): |
|
ss = hl.import_table('gs://ukbb_prs/sumstats/UKB_'+phenotype+'_LDPred.txt', delimiter='\s+', impute=True) |
|
ss = (ss.annotate(locus=hl.parse_locus(ss.chrom[6:] + ":" + hl.str(ss.pos)), |
|
alleles=[ss.nt1,ss.nt2]) |
|
.key_by('locus')) |
|
ss.write(sumstats_table_location, overwrite=True) |
|
|
|
ss = hl.read_table(sumstats_table_location) |
|
|
|
######################################################################## |
|
### determine the file locations of the prs variants |
|
if (generate_contig_row_dict): |
|
mt = hl.methods.import_bgen(bgen_files, |
|
[], |
|
contig_recoding=contigs, |
|
_row_fields=['file_row_idx']) |
|
prs_rows = mt.filter_rows(hl.is_defined(ss[mt.locus])).rows() |
|
print('about to collect') |
|
# remove all unnecessary data, dropping keys and other irrelevant fields |
|
prs_rows = prs_rows.key_by() |
|
prs_rows = prs_rows.select(prs_rows.locus.contig, prs_rows.file_row_idx) |
|
contig_row_list = prs_rows.collect() |
|
print('finished collecting') |
|
contig_reformed = [(x['contig'], x['file_row_idx']) for x in contig_row_list] |
|
print('reformed') |
|
from collections import defaultdict |
|
contig_row_dict = defaultdict(list) |
|
for k, v in contig_reformed: |
|
contig_row_dict[k].append(v) |
|
print('dictionary created') |
|
|
|
with hl.hadoop_open(contig_row_dict_location, 'wb') as f: |
|
pickle.dump(contig_row_dict, f) |
|
else: |
|
with hl.hadoop_open(contig_row_dict_location, 'rb') as f: |
|
contig_row_dict = pickle.load(f) |
|
|
|
######################################################################## |
|
### Run the PRS |
|
contig_row_dict2 = {'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{contig}_v3.bgen'.format(contig=k): v for k, v in contig_row_dict.items()} |
|
mt = hl.methods.import_bgen(bgen_files, |
|
['dosage'], |
|
sample_file='gs://phenotype_31063/ukb31063.autosomes.sample', |
|
contig_recoding=contigs, |
|
_variants_per_file=contig_row_dict2, |
|
_row_fields=[]) |
|
|
|
sampleids = hl.import_table('gs://ukb31063-mega-gwas/qc/ukb31063.gwas_samples.txt', delimiter='\s+').key_by('s') |
|
|
|
mt = mt.filter_cols(hl.is_defined(sampleids[mt.s])) |
|
mt = mt.annotate_rows(ss=ss[mt.locus]) |
|
|
|
mt = mt.annotate_rows( |
|
beta = (hl.case() |
|
.when(((mt.alleles[0] == mt.ss.nt1) & |
|
(mt.alleles[1] == mt.ss.nt2)) | |
|
((flip_text(mt.alleles[0]) == mt.ss.nt1) & |
|
(flip_text(mt.alleles[1]) == mt.ss.nt2)), |
|
(-1*mt.ss.ldpred_inf_beta)) |
|
.when(((mt.alleles[0] == mt.ss.nt2) & |
|
(mt.alleles[1] == mt.ss.nt1)) | |
|
((flip_text(mt.alleles[0]) == mt.ss.nt2) & |
|
(flip_text(mt.alleles[1]) == mt.ss.nt1)), |
|
mt.ss.ldpred_inf_beta) |
|
.or_missing())) |
|
|
|
mt = mt.filter_rows(hl.is_defined(mt.beta)) |
|
|
|
mt = mt.annotate_cols(prs=hl.agg.sum(mt.beta * mt.dosage)) |
|
|
|
mt.cols().export(output_location) |