Last active
July 17, 2024 17:17
-
-
Save PoisonAlien/ce87512ead051da3a36019b78ebc675c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
#A simple script to predict gnomAD ancestry using PCA loadings trained on gnomAD V3 datasets | |
#See here for details: https://gnomad.broadinstitute.org/news/2021-09-using-the-gnomad-ancestry-principal-components-analysis-loadings-and-random-forest-classifier-on-your-dataset/ | |
#Author: Anand Mayakonda | |
import sys | |
import os.path | |
import shutil | |
import argparse | |
parser = argparse.ArgumentParser( | |
description="Predict ancestry with gnomad ancestry RF models") | |
parser.add_argument("-v", "--vcf", help="Input VCF",required=True) | |
parser.add_argument("-p", "--pca", help="PCA loadings",required=True) | |
parser.add_argument("-r", "--rf", help="Rnadom forest models", required=True) | |
parser.add_argument("-b", "--name", help="Output basename", required=True) | |
args = parser.parse_args() | |
VCF = args.vcf #Input VCF file | |
pcloadings = args.pca #gnomad PCA loadings | |
rfmodel = args.rf #gnomad RF model | |
vcfbase = args.name #Output file basename | |
#temp_directory=sys.argv[5] #do not use /tmp | |
import hail as hl | |
import sklearn | |
import pickle | |
from gnomad.sample_qc.ancestry import assign_population_pcs | |
hl.init(tmp_dir="./") | |
loadings_ht = hl.read_table(pcloadings) | |
mtout = vcfbase + ".mt" | |
hl.import_vcf(VCF, reference_genome='GRCh38', force_bgz=True,array_elements_required=False).write(mtout, overwrite=True) | |
mt_to_project = hl.read_matrix_table(mtout) | |
ht = hl.experimental.pc_project( | |
mt_to_project.GT, | |
loadings_ht.loadings, | |
loadings_ht.pca_af, | |
) | |
with hl.hadoop_open(rfmodel, "rb") as f: | |
fit = pickle.load(f) | |
num_pcs = fit.n_features_in_ | |
ht = ht.annotate(scores=ht.scores[:num_pcs]) | |
#vcf doesnt have the column 'known_pop' required by hail | |
ht = ht.annotate(known_pop="NA") | |
ht, rf_model = assign_population_pcs( | |
ht, | |
pc_cols=ht.scores, | |
fit=fit, | |
) | |
ht.export(vcfbase+'.rf.est.tsv', delimiter='\t') | |
print("Cleaning up..") | |
shutil.rmtree(mtout, ignore_errors=True) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment