Created
July 18, 2019 15:02
-
-
Save 108krohan/89b3f1114aeeeab533f0b0d4035be788 to your computer and use it in GitHub Desktop.
Execution format: `python check_assumptions.py -i refsnp-chr-XYZ.json.bz2`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import json | |
import bz2 | |
import time | |
def is_pltp_uniq(info):
    '''
    Check that exactly one placement in ``info`` is flagged ``is_ptlp``.

    info -- iterable of mappings, each with an ``'is_ptlp'`` entry (the
            script passes a record's ``placements_with_allele`` list).

    Returns True when exactly one entry has a truthy ``'is_ptlp'`` value,
    and False otherwise -- including when none is flagged or ``info`` is
    empty.

    Raises KeyError if an entry has no ``'is_ptlp'`` key.
    '''
    # Only the flag is inspected here; no assembly information is consulted.
    return sum(1 for alleleinfo in info if alleleinfo['is_ptlp']) == 1
def is_start_and_pos_const(info):
    '''
    Check that all inspected alleles share one SPDI ``seq_id`` and one
    SPDI ``position``.

    info -- iterable of placement mappings. A placement is inspected only
            when its ``'is_ptlp'`` value is truthy and its
            ``['placement_annot']['seq_id_traits_by_assembly']`` is
            non-empty; every other placement is ignored.

    For each inspected placement, the ``['allele']['spdi']`` mapping of every
    item in ``'alleles'`` is read.

    Returns True when no two inspected alleles differ in ``seq_id`` and no
    two differ in ``position`` (also True when nothing was inspected);
    returns False otherwise.

    Raises KeyError if an expected key is missing from an inspected entry.
    '''
    # Distinct values seen so far; more than one of either kind means the
    # alleles disagree. Sets avoid needing a "not yet seen" sentinel value.
    seq_ids = set()
    positions = set()
    for alleleinfo in info:
        # 'placement_annot' is only touched once 'is_ptlp' is known to be
        # truthy, so non-PTLP placements need not carry it.
        if not alleleinfo['is_ptlp']:
            continue
        if not alleleinfo['placement_annot']['seq_id_traits_by_assembly']:
            continue
        for a in alleleinfo['alleles']:
            spdi = a['allele']['spdi']
            seq_ids.add(spdi['seq_id'])
            positions.add(spdi['position'])
    return len(seq_ids) <= 1 and len(positions) <= 1
# Script body: read a bz2-compressed file with one JSON document per line,
# run both assumption checks on each record that has 'primary_snapshot_data',
# and print a summary of how often each check failed.
parser = argparse.ArgumentParser(
    description='Example of parsing JSON RefSNP Data')
parser.add_argument(
    '-i', dest='input_fn', required=True,
    help='The name of the input file to parse')
args = parser.parse_args()

start_time = time.time()
cnt = 0                  # lines read from the input file
pltp_multiple = 0        # records for which is_pltp_uniq() returned False
start_and_pos_diff = 0   # records for which is_start_and_pos_const() returned False

with bz2.BZ2File(args.input_fn, 'rb') as f_in:
    for line in f_in:
        rs_obj = json.loads(line.decode('utf-8'))
        if 'primary_snapshot_data' in rs_obj:
            placements_with_allele = rs_obj['primary_snapshot_data']['placements_with_allele']
            if not is_pltp_uniq(placements_with_allele):
                pltp_multiple += 1
            if not is_start_and_pos_const(placements_with_allele):
                start_and_pos_diff += 1
        # NOTE(review): counted once per input line (matching the
        # "Lines scanned" label below), including lines without
        # 'primary_snapshot_data' -- confirm this is the intended denominator.
        cnt += 1

# An empty input leaves cnt at 0; report 0 instead of raising
# ZeroDivisionError while printing the summary.
pltp_pct = (pltp_multiple * 100) / cnt if cnt else 0
start_and_pos_pct = (start_and_pos_diff * 100) / cnt if cnt else 0

print("Lines scanned = " + str(cnt))
print("PLTP were found to be non-unique in "
      + str(pltp_multiple)
      + " instances... \n\tPercentage samples affected = " + str(pltp_pct))
print("CONTIG and POS differed in "
      + str(start_and_pos_diff)
      + " instances... \n\tPercentage samples affected = " + str(start_and_pos_pct))
print("Execution time: " + str(round((time.time() - start_time)/60, 2)) + " mins.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment