108krohan · July 18, 2019 15:02
diff --git a/check_assumptions.py b/check_assumptions.py
 import argparse
 import json
 import bz2
 import time

 def is_pltp_uniq(info):
    '''
    tells whether exactly one placement in a JSON record is flagged
    as the top level placement (is_ptlp)

    info: iterable of placement dicts, each carrying an 'is_ptlp' entry
    returns: True when precisely one entry has a truthy 'is_ptlp',
             False otherwise (zero matches included)
    '''
    flagged = sum(1 for placement in info if placement['is_ptlp'])
    return flagged == 1

 def is_start_and_pos_const(info):
    '''
    checks if seq_id and position are the same across all alleles reported
    on the top level placement(s) of any JSON

    info: iterable of placement dicts; each needs 'is_ptlp', and top level
          ones also need 'placement_annot' -> 'seq_id_traits_by_assembly'
          and 'alleles', where every allele holds ['allele']['spdi'] with
          'seq_id' and 'position'
    returns: True if every inspected allele has the seq_id and position of
             the first inspected allele (also True when no allele is
             inspected), False otherwise
    '''
    reference = None
    for alleleinfo in info:
        # only top level placements (ptlp) are considered ...
        if not alleleinfo['is_ptlp']:
            continue
        # ... and only those that carry assembly info
        traits = alleleinfo['placement_annot']['seq_id_traits_by_assembly']
        if len(traits) == 0:
            continue

        for a in alleleinfo['alleles']:
            spdi = a['allele']['spdi']
            current = (spdi['seq_id'], spdi['position'])
            if reference is None:
                # the first allele seen is what all later ones are compared to
                reference = current
            elif current != reference:
                # a single mismatch settles the answer, no need to scan on
                return False
    return True

 parser = argparse.ArgumentParser(
    description='Example of parsing JSON RefSNP Data')
 parser.add_argument(
    '-i', dest='input_fn', required=True,
    help='The name of the input file to parse')

 args = parser.parse_args()

 start_time = time.time()
 cnt = 0                 # lines read from the input file
 pltp_multiple = 0       # records for which is_pltp_uniq() returned False
 start_and_pos_diff = 0  # records for which is_start_and_pos_const() returned False

 # The input is read as a bz2-compressed file with one JSON document per line.
 with bz2.BZ2File(args.input_fn, 'rb') as f_in:

    for line in f_in:
        rs_obj = json.loads(line.decode('utf-8'))

        # records without 'primary_snapshot_data' are counted but not checked
        if 'primary_snapshot_data' in rs_obj:
            placements_with_allele = rs_obj['primary_snapshot_data']['placements_with_allele']
            if not is_pltp_uniq(placements_with_allele):
                pltp_multiple += 1
            if not is_start_and_pos_const(placements_with_allele):
                start_and_pos_diff += 1
        cnt += 1

 # With an empty input cnt stays 0 and the divisions would raise
 # ZeroDivisionError; report 0.0 in that case instead of crashing.
 pltp_pct = (pltp_multiple * 100) / cnt if cnt else 0.0
 start_and_pos_pct = (start_and_pos_diff * 100) / cnt if cnt else 0.0

 print("Lines scanned = " + str(cnt))
 print("PLTP were found to be non-unique in "
    + str(pltp_multiple)
    + " instances... \n\tPercentage samples affected = " + str(pltp_pct))
 print("CONTIG and POS differed in "
    + str(start_and_pos_diff)
    + " instances... \n\tPercentage samples affected = " + str(start_and_pos_pct))
 print("Execution time: " + str(round((time.time() - start_time)/60, 2)) + " mins.")
	import argparse
	import json
	import bz2
	import time

	def is_pltp_uniq(info):
	    '''
	    checks if is_ptlp is true at only 1 place, i.e. exactly one placement
	    in any JSON is flagged as the top level placement

	    info: iterable of placement dicts, each with an 'is_ptlp' entry
	    returns: True if exactly one placement has a truthy 'is_ptlp',
	             False otherwise
	    '''
	    pltp_count = sum(1 for alleleinfo in info if alleleinfo['is_ptlp'])
	    return pltp_count == 1

	def is_start_and_pos_const(info):
	    '''
	    checks if seq_id and position are the same across all alleles reported
	    on the top level placement(s) of any JSON

	    info: iterable of placement dicts; each needs 'is_ptlp', and top level
	          ones also need 'placement_annot' -> 'seq_id_traits_by_assembly'
	          and 'alleles', where every allele holds ['allele']['spdi'] with
	          'seq_id' and 'position'
	    returns: True if every inspected allele has the seq_id and position of
	             the first inspected allele (also True when no allele is
	             inspected), False otherwise
	    '''
	    first_seen = None
	    for alleleinfo in info:
	        # has top level placement (ptlp) and assembly info
	        if alleleinfo['is_ptlp'] and \
	                len(alleleinfo['placement_annot']
	                    ['seq_id_traits_by_assembly']) > 0:
	            for a in alleleinfo['alleles']:
	                spdi = a['allele']['spdi']
	                seen = (spdi['seq_id'], spdi['position'])
	                if first_seen is None:
	                    # remember the first allele as the comparison baseline
	                    first_seen = seen
	                elif seen != first_seen:
	                    # any difference in seq_id or position decides the result
	                    return False
	    return True

	parser = argparse.ArgumentParser(
	    description='Example of parsing JSON RefSNP Data')
	parser.add_argument(
	    '-i', dest='input_fn', required=True,
	    help='The name of the input file to parse')

	args = parser.parse_args()

	start_time = time.time()
	cnt = 0                 # lines read from the input file
	pltp_multiple = 0       # records for which is_pltp_uniq() returned False
	start_and_pos_diff = 0  # records for which is_start_and_pos_const() returned False

	# The input is read as a bz2-compressed file with one JSON document per line.
	with bz2.BZ2File(args.input_fn, 'rb') as f_in:

	    for line in f_in:
	        rs_obj = json.loads(line.decode('utf-8'))

	        # records without 'primary_snapshot_data' are counted but not checked
	        if 'primary_snapshot_data' in rs_obj:
	            placements_with_allele = rs_obj['primary_snapshot_data']['placements_with_allele']
	            if not is_pltp_uniq(placements_with_allele):
	                pltp_multiple += 1
	            if not is_start_and_pos_const(placements_with_allele):
	                start_and_pos_diff += 1
	        cnt += 1

	# With an empty input cnt stays 0 and the divisions would raise
	# ZeroDivisionError; report 0.0 in that case instead of crashing.
	pltp_pct = (pltp_multiple * 100) / cnt if cnt else 0.0
	start_and_pos_pct = (start_and_pos_diff * 100) / cnt if cnt else 0.0

	print("Lines scanned = " + str(cnt))
	print("PLTP were found to be non-unique in "
	    + str(pltp_multiple)
	    + " instances... \n\tPercentage samples affected = " + str(pltp_pct))
	print("CONTIG and POS differed in "
	    + str(start_and_pos_diff)
	    + " instances... \n\tPercentage samples affected = " + str(start_and_pos_pct))
	print("Execution time: " + str(round((time.time() - start_time)/60, 2)) + " mins.")