ce-carey · March 22, 2022 16:45
diff --git a/README.md b/README.md
diff --git a/run_prs.py b/run_prs.py
 import hail as hl
 import pickle
 import time
 import argparse

 def flip_text(base):
    """
    :param StringExpression base: Expression of a single base
    :return: StringExpression of flipped base
    :rtype: StringExpression
    """
    return (hl.switch(base)
            .when('A', 'T')
            .when('T', 'A')
            .when('C', 'G')
            .when('G', 'C')
            .default(base))

 parser = argparse.ArgumentParser()
 parser.add_argument("--phenotype", help="name of the sumstat phenotype")
 args = parser.parse_args()

 ########################################################################    
 ### initialize    
 generate_sumstats_table = True
 sumstats_table_location = 'gs://ukbb_prs/sumstats/keytables/ukb-'+phenotype+'-ldpred-sumstats-locus-allele-keyed.kt'
 generate_contig_row_dict = True
 contig_row_dict_location = 'gs://ukbb_prs/sumstats/keytables/contig_row_dict-'+phenotype

 output_location = 'gs://ukbb_prs/prs/UKB_'+phenotype+'_PRS.txt'

 contigs = {'0{}'.format(x):str(x) for x in range(1, 10)}

 bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'


 # large block size because we read very little data (due to filtering & ignoring genotypes)
 hl.init(branching_factor=10, min_block_size=2000)

 ########################################################################
 ### set up the sumstats table
 if (generate_sumstats_table):
    ss = hl.import_table('gs://ukbb_prs/sumstats/UKB_'+phenotype+'_LDPred.txt', delimiter='\s+', impute=True)
    ss = (ss.annotate(locus=hl.parse_locus(ss.chrom[6:] + ":" + hl.str(ss.pos)),
              alleles=[ss.nt1,ss.nt2])
      .key_by('locus'))
    ss.write(sumstats_table_location, overwrite=True)

 ss = hl.read_table(sumstats_table_location)
    
 ########################################################################
 ### determine the file locations of the prs variants
 if (generate_contig_row_dict):
    mt = hl.methods.import_bgen(bgen_files,
                [],
                contig_recoding=contigs,
                _row_fields=['file_row_idx'])
    prs_rows = mt.filter_rows(hl.is_defined(ss[mt.locus])).rows()
    print('about to collect')
    # remove all unnecessary data, dropping keys and other irrelevant fields
    prs_rows = prs_rows.key_by()
    prs_rows = prs_rows.select(prs_rows.locus.contig, prs_rows.file_row_idx)
    contig_row_list = prs_rows.collect()
    print('finished collecting')
    contig_reformed = [(x['contig'], x['file_row_idx']) for x in contig_row_list]
    print('reformed')
    from collections import defaultdict
    contig_row_dict = defaultdict(list)
    for k, v in contig_reformed:
        contig_row_dict[k].append(v)
    print('dictionary created')

    with hl.hadoop_open(contig_row_dict_location, 'wb') as f:
        pickle.dump(contig_row_dict, f)
 else:
    with hl.hadoop_open(contig_row_dict_location, 'rb') as f:
        contig_row_dict = pickle.load(f)

 ########################################################################
 ### Run the PRS
 contig_row_dict2 = {'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{contig}_v3.bgen'.format(contig=k): v for k, v in contig_row_dict.items()}
 mt = hl.methods.import_bgen(bgen_files,
                ['dosage'],
                sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
                contig_recoding=contigs,
                _variants_per_file=contig_row_dict2,
                _row_fields=[])

 sampleids = hl.import_table('gs://ukb31063-mega-gwas/qc/ukb31063.gwas_samples.txt', delimiter='\s+').key_by('s')

 mt = mt.filter_cols(hl.is_defined(sampleids[mt.s]))
 mt = mt.annotate_rows(ss=ss[mt.locus])

 mt = mt.annotate_rows(
    beta = (hl.case()
        .when(((mt.alleles[0] == mt.ss.nt1) &
           (mt.alleles[1] == mt.ss.nt2)) |
          ((flip_text(mt.alleles[0]) == mt.ss.nt1) &
           (flip_text(mt.alleles[1]) == mt.ss.nt2)),
              (-1*mt.ss.ldpred_inf_beta))
        .when(((mt.alleles[0] == mt.ss.nt2) &
               (mt.alleles[1] == mt.ss.nt1)) |
              ((flip_text(mt.alleles[0]) == mt.ss.nt2) &
               (flip_text(mt.alleles[1]) == mt.ss.nt1)),
              mt.ss.ldpred_inf_beta)
        .or_missing()))

 mt = mt.filter_rows(hl.is_defined(mt.beta))

 mt = mt.annotate_cols(prs=hl.agg.sum(mt.beta * mt.dosage))

 mt.cols().export(output_location)
	import hail as hl
	import pickle
	import time
	import argparse

	def flip_text(base):
	"""
	:param StringExpression base: Expression of a single base
	:return: StringExpression of flipped base
	:rtype: StringExpression
	"""
	return (hl.switch(base)
	.when('A', 'T')
	.when('T', 'A')
	.when('C', 'G')
	.when('G', 'C')
	.default(base))

	parser = argparse.ArgumentParser()
	parser.add_argument("--phenotype", help="name of the sumstat phenotype")
	args = parser.parse_args()

	########################################################################
	### initialize
	generate_sumstats_table = True
	sumstats_table_location = 'gs://ukbb_prs/sumstats/keytables/ukb-'+phenotype+'-ldpred-sumstats-locus-allele-keyed.kt'
	generate_contig_row_dict = True
	contig_row_dict_location = 'gs://ukbb_prs/sumstats/keytables/contig_row_dict-'+phenotype

	output_location = 'gs://ukbb_prs/prs/UKB_'+phenotype+'_PRS.txt'

	contigs = {'0{}'.format(x):str(x) for x in range(1, 10)}

	bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'


	# large block size because we read very little data (due to filtering & ignoring genotypes)
	hl.init(branching_factor=10, min_block_size=2000)

	########################################################################
	### set up the sumstats table
	if (generate_sumstats_table):
	ss = hl.import_table('gs://ukbb_prs/sumstats/UKB_'+phenotype+'_LDPred.txt', delimiter='\s+', impute=True)
	ss = (ss.annotate(locus=hl.parse_locus(ss.chrom[6:] + ":" + hl.str(ss.pos)),
	alleles=[ss.nt1,ss.nt2])
	.key_by('locus'))
	ss.write(sumstats_table_location, overwrite=True)

	ss = hl.read_table(sumstats_table_location)

	########################################################################
	### determine the file locations of the prs variants
	if (generate_contig_row_dict):
	mt = hl.methods.import_bgen(bgen_files,
	[],
	contig_recoding=contigs,
	_row_fields=['file_row_idx'])
	prs_rows = mt.filter_rows(hl.is_defined(ss[mt.locus])).rows()
	print('about to collect')
	# remove all unnecessary data, dropping keys and other irrelevant fields
	prs_rows = prs_rows.key_by()
	prs_rows = prs_rows.select(prs_rows.locus.contig, prs_rows.file_row_idx)
	contig_row_list = prs_rows.collect()
	print('finished collecting')
	contig_reformed = [(x['contig'], x['file_row_idx']) for x in contig_row_list]
	print('reformed')
	from collections import defaultdict
	contig_row_dict = defaultdict(list)
	for k, v in contig_reformed:
	contig_row_dict[k].append(v)
	print('dictionary created')

	with hl.hadoop_open(contig_row_dict_location, 'wb') as f:
	pickle.dump(contig_row_dict, f)
	else:
	with hl.hadoop_open(contig_row_dict_location, 'rb') as f:
	contig_row_dict = pickle.load(f)

	########################################################################
	### Run the PRS
	contig_row_dict2 = {'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{contig}_v3.bgen'.format(contig=k): v for k, v in contig_row_dict.items()}
	mt = hl.methods.import_bgen(bgen_files,
	['dosage'],
	sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
	contig_recoding=contigs,
	_variants_per_file=contig_row_dict2,
	_row_fields=[])

	sampleids = hl.import_table('gs://ukb31063-mega-gwas/qc/ukb31063.gwas_samples.txt', delimiter='\s+').key_by('s')

	mt = mt.filter_cols(hl.is_defined(sampleids[mt.s]))
	mt = mt.annotate_rows(ss=ss[mt.locus])

	mt = mt.annotate_rows(
	beta = (hl.case()
	.when(((mt.alleles[0] == mt.ss.nt1) &
	(mt.alleles[1] == mt.ss.nt2)) \|
	((flip_text(mt.alleles[0]) == mt.ss.nt1) &
	(flip_text(mt.alleles[1]) == mt.ss.nt2)),
	(-1*mt.ss.ldpred_inf_beta))
	.when(((mt.alleles[0] == mt.ss.nt2) &
	(mt.alleles[1] == mt.ss.nt1)) \|
	((flip_text(mt.alleles[0]) == mt.ss.nt2) &
	(flip_text(mt.alleles[1]) == mt.ss.nt1)),
	mt.ss.ldpred_inf_beta)
	.or_missing()))

	mt = mt.filter_rows(hl.is_defined(mt.beta))

	mt = mt.annotate_cols(prs=hl.agg.sum(mt.beta * mt.dosage))

	mt.cols().export(output_location)