nilesh-tawari · March 13, 2018 03:29
diff --git a/compare_excel.py b/compare_excel.py
 # -*- coding: utf-8 -*-
 """
 Created on Tue Mar 13 09:40:36 2018

 @author: rameshtn
 """

 from __future__ import print_function
 import os
 import argparse
 import pandas as pd
 from pandas.util.testing import assert_frame_equal

 # Parameters: two positional command-line arguments, the old and the new
 # excel file to compare
 parser = argparse.ArgumentParser(
     description='Compare the old and new excel files generated by CET')
 parser.add_argument('old', help='Input call old excel filename')
 parser.add_argument('new', help='Input call new excel filename')

 args = parser.parse_args()
 # Resolve both filenames to absolute paths
 old, new = map(os.path.abspath, (args.old, args.new))

 # functions
 def assertFrameEqual(df1, df2):
     """Assert that two dataframes are equal, ignoring the order of columns.

     Both frames are re-ordered by column label and then handed to
     ``assert_frame_equal`` with ``check_names=True``; whatever that call
     returns is returned unchanged, and its AssertionError propagates when
     the frames differ.
     """
     left = df1.sort_index(axis=1)
     right = df2.sort_index(axis=1)
     return assert_frame_equal(left, right, check_names=True)

 def diff(first, second):
     """Return the items of ``first`` that do not occur in ``second``.

     The order (and any duplicates) of ``first`` is preserved. ``second`` is
     converted to a set for the membership tests, so its items must be
     hashable.
     """
     excluded = set(second)
     kept = []
     for candidate in first:
         if candidate in excluded:
             continue
         kept.append(candidate)
     return kept
    

 # phenotypes should be exactly the same
 # (read_excel takes the sheet via ``sheet_name``; the older ``sheetname``
 # spelling was deprecated and later removed from pandas)
 df_new_ph = pd.read_excel(new, sheet_name='Phenotypes_details', header=0)
 df_old_ph = pd.read_excel(old, sheet_name='Phenotypes_details', header=0)
 assertFrameEqual(df_new_ph, df_old_ph)

 # gene coverage should be exactly the same
 df_new_ge = pd.read_excel(new, sheet_name='Gene_wise_coverage', header=0)
 df_old_ge = pd.read_excel(old, sheet_name='Gene_wise_coverage', header=0)
 assertFrameEqual(df_new_ge, df_old_ge)

 # variant IDs should be exactly the same in the old and the new file, both
 # in the variants sheet and in the transcripts sheet.
 # NOTE: each comparison must be new vs. old; a column always equals itself.
 df_new_va = pd.read_excel(new, sheet_name='Filtered_variants', header=0)
 df_old_va = pd.read_excel(old, sheet_name='Filtered_variants', header=0)
 assert df_new_va['variant_ID'].equals(df_old_va['variant_ID']), (
     'variant_ID differs between old and new Filtered_variants sheets')

 df_new_tr = pd.read_excel(new, sheet_name='Transcript_details', header=0)
 df_old_tr = pd.read_excel(old, sheet_name='Transcript_details', header=0)
 assert df_new_tr['variant_ID'].equals(df_old_tr['variant_ID']), (
     'variant_ID differs between old and new Transcript_details sheets')

 # check columns of the Filtered_variants sheet
 new_col = list(df_new_va.columns)
 old_col = list(df_old_va.columns)

 # there should be no new column
 new_cols = diff(new_col, old_col)
 assert len(new_cols) == 0, 'unexpected new columns: {}'.format(new_cols)
 missing_cols = diff(old_col, new_col)

 # only the columns listed here are allowed to be missing from the new file
 mis_cols = [
     'GM12878_confidence_value',
     'GM12878_fitCons_score',
     'GenoCanyon_score',
     'H1-hESC_confidence_value',
     'H1-hESC_fitCons_score',
     'HUVEC_confidence_value',
     'HUVEC_fitCons_score',
     'VEST3_score',
     'fathmm-MKL_coding_group',
     'fathmm-MKL_coding_pred',
     'fathmm-MKL_coding_score',
     'integrated_confidence_value',
     'integrated_fitCons_score',
     'phastCons20way_mammalian',
     'phyloP20way_mammalian',
 ]

 unexpected_missing = diff(missing_cols, mis_cols)
 assert len(unexpected_missing) == 0, (
     'unexpected missing columns: {}'.format(unexpected_missing))

 # samples should be the same: the sample name is taken as the text before
 # the first ':' of every column name that contains one. Compare the two sets
 # for equality so that a sample dropped from the new file is caught as well
 # (a one-directional new-minus-old check is always empty once the
 # "no new column" assertion above has passed).
 new_samples = set(col.split(':')[0] for col in new_col if ":" in col)
 old_samples = set(col.split(':')[0] for col in old_col if ":" in col)
 assert new_samples == old_samples, (
     'samples differ between old and new: {}'.format(
         sorted(new_samples ^ old_samples)))

 print("Passed equality check !!!")
	# -*- coding: utf-8 -*-
	"""
	Created on Tue Mar 13 09:40:36 2018

	@author: rameshtn
	"""

	from __future__ import print_function
	import os
	import argparse
	import pandas as pd
	from pandas.util.testing import assert_frame_equal

	# Parameters: two positional command-line arguments, the old and the new
	# excel file to compare
	parser = argparse.ArgumentParser(
	    description='Compare the old and new excel files generated by CET')
	parser.add_argument('old', help='Input call old excel filename')
	parser.add_argument('new', help='Input call new excel filename')

	args = parser.parse_args()
	# Resolve both filenames to absolute paths
	old, new = map(os.path.abspath, (args.old, args.new))

	# functions
	def assertFrameEqual(df1, df2):
	    """Assert that two dataframes are equal, ignoring ordering of columns.

	    Both frames are sorted by column label before being compared with
	    ``assert_frame_equal`` (``check_names=True``). The result of that call
	    is returned; an AssertionError propagates when the frames differ.
	    """
	    # The function body must be indented under the ``def`` to be valid.
	    return assert_frame_equal(
	        df1.sort_index(axis=1),
	        df2.sort_index(axis=1),
	        check_names=True,
	    )

	def diff(first, second):
	    """Return the items of ``first`` that are not present in ``second``.

	    The order (and any duplicates) of ``first`` is preserved. ``second`` is
	    converted to a set for the membership tests, so its items must be
	    hashable.
	    """
	    # The function body must be indented under the ``def`` to be valid.
	    second = set(second)
	    return [item for item in first if item not in second]


	# phenotypes should be exactly the same
	# (read_excel takes the sheet via ``sheet_name``; the older ``sheetname``
	# spelling was deprecated and later removed from pandas)
	df_new_ph = pd.read_excel(new, sheet_name='Phenotypes_details', header=0)
	df_old_ph = pd.read_excel(old, sheet_name='Phenotypes_details', header=0)
	assertFrameEqual(df_new_ph, df_old_ph)

	# gene coverage should be exactly the same
	df_new_ge = pd.read_excel(new, sheet_name='Gene_wise_coverage', header=0)
	df_old_ge = pd.read_excel(old, sheet_name='Gene_wise_coverage', header=0)
	assertFrameEqual(df_new_ge, df_old_ge)

	# variant IDs should be exactly the same in the old and the new file, both
	# in the variants sheet and in the transcripts sheet.
	# NOTE: each comparison must be new vs. old; a column always equals itself.
	df_new_va = pd.read_excel(new, sheet_name='Filtered_variants', header=0)
	df_old_va = pd.read_excel(old, sheet_name='Filtered_variants', header=0)
	assert df_new_va['variant_ID'].equals(df_old_va['variant_ID']), (
	    'variant_ID differs between old and new Filtered_variants sheets')

	df_new_tr = pd.read_excel(new, sheet_name='Transcript_details', header=0)
	df_old_tr = pd.read_excel(old, sheet_name='Transcript_details', header=0)
	assert df_new_tr['variant_ID'].equals(df_old_tr['variant_ID']), (
	    'variant_ID differs between old and new Transcript_details sheets')

	# check columns of the Filtered_variants sheet
	new_col = list(df_new_va.columns)
	old_col = list(df_old_va.columns)

	# there should be no new column
	new_cols = diff(new_col, old_col)
	assert len(new_cols) == 0, 'unexpected new columns: {}'.format(new_cols)
	missing_cols = diff(old_col, new_col)

	# only the columns listed here are allowed to be missing from the new file
	mis_cols = [
	    'GM12878_confidence_value',
	    'GM12878_fitCons_score',
	    'GenoCanyon_score',
	    'H1-hESC_confidence_value',
	    'H1-hESC_fitCons_score',
	    'HUVEC_confidence_value',
	    'HUVEC_fitCons_score',
	    'VEST3_score',
	    'fathmm-MKL_coding_group',
	    'fathmm-MKL_coding_pred',
	    'fathmm-MKL_coding_score',
	    'integrated_confidence_value',
	    'integrated_fitCons_score',
	    'phastCons20way_mammalian',
	    'phyloP20way_mammalian',
	]

	unexpected_missing = diff(missing_cols, mis_cols)
	assert len(unexpected_missing) == 0, (
	    'unexpected missing columns: {}'.format(unexpected_missing))

	# samples should be the same: the sample name is taken as the text before
	# the first ':' of every column name that contains one. Compare the two sets
	# for equality so that a sample dropped from the new file is caught as well
	# (a one-directional new-minus-old check is always empty once the
	# "no new column" assertion above has passed).
	new_samples = set(col.split(':')[0] for col in new_col if ":" in col)
	old_samples = set(col.split(':')[0] for col in old_col if ":" in col)
	assert new_samples == old_samples, (
	    'samples differ between old and new: {}'.format(
	        sorted(new_samples ^ old_samples)))

	print("Passed equality check !!!")