Last active
March 13, 2018 03:29
-
-
Save nilesh-tawari/d5c15fb6f12f6b76216f3b17febf41cd to your computer and use it in GitHub Desktop.
check_excel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 13 09:40:36 2018
@author: rameshtn
"""
from __future__ import print_function

import argparse
import os

import pandas as pd

try:
    # Public home of the testing helpers in current pandas releases.
    from pandas.testing import assert_frame_equal
except ImportError:
    # Fallback for old pandas releases that only ship the helper here.
    from pandas.util.testing import assert_frame_equal
# Parameters
# Two positional arguments: the previously generated workbook and the newly
# generated one that is checked against it.
parser = argparse.ArgumentParser(
    description='Compare the old and new excel '
                'files generated by CET')
parser.add_argument('old', help='Input call old excel filename')
parser.add_argument('new', help='Input call new excel filename')
args = parser.parse_args()

# Absolute paths of the two workbooks to compare.
old = os.path.abspath(args.old)
new = os.path.abspath(args.new)
# functions | |
def assertFrameEqual(df1, df2):
    """Assert that two dataframes are equal, ignoring ordering of columns.

    Both frames are sorted by column label before the comparison, so only
    the column order is ignored; everything else is compared by
    ``assert_frame_equal`` with its default settings (plus an explicit
    ``check_names=True``).

    Args:
        df1: first dataframe.
        df2: second dataframe.

    Returns:
        Whatever ``assert_frame_equal`` returns (``None`` on success).

    Raises:
        AssertionError: if the two frames differ.
    """
    return assert_frame_equal(
        df1.sort_index(axis=1),
        df2.sort_index(axis=1),
        check_names=True,
    )
def diff(first, second):
    """Return the items of ``first`` that do not occur in ``second``.

    The order of ``first`` (including any duplicates) is preserved.
    ``second`` is converted to a set for membership tests, so its items
    must be hashable.

    Args:
        first: iterable whose items are filtered.
        second: iterable of items to exclude.

    Returns:
        A list with the items of ``first`` that are not in ``second``.
    """
    excluded = set(second)
    return [item for item in first if item not in excluded]
# NOTE: ``sheet_name`` is the current spelling of the read_excel keyword; the
# old ``sheetname`` spelling is rejected by recent pandas releases.
# Failures are raised explicitly as AssertionError instead of using ``assert``
# so the checks still run under ``python -O``.

# phenotypes should be exactly same
df_new_ph = pd.read_excel(new, sheet_name='Phenotypes_details', header=0)
df_old_ph = pd.read_excel(old, sheet_name='Phenotypes_details', header=0)
assertFrameEqual(df_new_ph, df_old_ph)

# gene coverage should be exactly same
df_new_ge = pd.read_excel(new, sheet_name='Gene_wise_coverage', header=0)
df_old_ge = pd.read_excel(old, sheet_name='Gene_wise_coverage', header=0)
assertFrameEqual(df_new_ge, df_old_ge)

# variants should be exactly same both in variants and transcripts
# (compare NEW against OLD; comparing a column with itself always passes)
df_new_va = pd.read_excel(new, sheet_name='Filtered_variants', header=0)
df_old_va = pd.read_excel(old, sheet_name='Filtered_variants', header=0)
if not df_new_va['variant_ID'].equals(df_old_va['variant_ID']):
    raise AssertionError(
        "variant_ID column differs between old and new 'Filtered_variants'")

df_new_tr = pd.read_excel(new, sheet_name='Transcript_details', header=0)
df_old_tr = pd.read_excel(old, sheet_name='Transcript_details', header=0)
if not df_new_tr['variant_ID'].equals(df_old_tr['variant_ID']):
    raise AssertionError(
        "variant_ID column differs between old and new 'Transcript_details'")

# check columns
new_col = list(df_new_va.columns)
old_col = list(df_old_va.columns)

# there should be no new column
new_cols = diff(new_col, old_col)
if new_cols:
    raise AssertionError(
        "Unexpected new columns in 'Filtered_variants': {0}".format(new_cols))

missing_cols = diff(old_col, new_col)
# some cols should be missing: only the columns listed here are allowed to
# be absent from the new file
mis_cols = ['GM12878_confidence_value',
            'GM12878_fitCons_score',
            'GenoCanyon_score',
            'H1-hESC_confidence_value',
            'H1-hESC_fitCons_score',
            'HUVEC_confidence_value',
            'HUVEC_fitCons_score',
            'VEST3_score',
            'fathmm-MKL_coding_group',
            'fathmm-MKL_coding_pred',
            'fathmm-MKL_coding_score',
            'integrated_confidence_value',
            'integrated_fitCons_score',
            'phastCons20way_mammalian',
            'phyloP20way_mammalian']
unexpected_missing = diff(missing_cols, mis_cols)
if unexpected_missing:
    raise AssertionError(
        "Columns unexpectedly missing from new 'Filtered_variants': "
        "{0}".format(unexpected_missing))

# samples should be same
# Sample names are taken as the part before ':' in columns that contain one.
# The check is symmetric: a one-directional difference (new minus old) is
# already implied by the "no new column" check above.
new_samples = {col.split(':')[0] for col in new_col if ":" in col}
old_samples = {col.split(':')[0] for col in old_col if ":" in col}
if new_samples != old_samples:
    raise AssertionError(
        "Samples differ between old and new files; only in new: {0}; "
        "only in old: {1}".format(sorted(new_samples - old_samples),
                                  sorted(old_samples - new_samples)))

print("Passed equality check !!!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment