Created February 9, 2018 15:39
Write multiple tabular files to an Excel XLSX file
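The script takes one or more delimited text files and writes each one to its own worksheet in a single XLSX workbook. As a rough illustration of how it is meant to be invoked (the script filename to_excel.py and the input files here are assumed for the example, not part of the gist):

python to_excel.py \
    -i results.tsv results.tsv tsv "Assembly results" \
    -i summary.csv summary.csv csv Summary \
    --add-metadata-sheet \
    -o combined.xlsx

Each -i/--input option takes four values: the file path, a filename (recorded in the optional metadata sheet), the file type ("tabular", "tsv", "csv" or "txt") and the desired worksheet name.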
import argparse
import re
import logging
import sys

import pandas as pd
import numpy as np

from utils import init_console_logger

PROGRAM_NAME = 'to_excel'
PROGRAM_VERSION = '0.1.0'
PROGRAM_DESC = """Write multiple tabular files to an Excel file

Each tabular file will be added to its own worksheet.

Handled file types:
- tab-delimited ("tabular", "tsv", "txt")
- comma-separated values ("csv")

Features:
- Width of columns automatically adjusted to match contents (i.e. "auto-fit")
- Optional metadata sheet with info about input tabular files, sheet names, etc.
"""

# Regex to find characters that are not allowed in Excel worksheet names
REGEX_UNALLOWED_EXCEL_WS_CHARS = re.compile(r'[\\:/?*\[\]]+')

# Optional metadata worksheet column names
METADATA_COLUMNS = """
Index
File Path
Filename
Original Sheet Name
Final Sheet Name
""".strip().split('\n')

# File type to Pandas read function
FILE_TYPE_TO_PD_READ = dict(
    tabular=pd.read_table,
    csv=pd.read_csv,
    tsv=pd.read_table,
    txt=pd.read_table,
)
def get_col_widths(df, index=False):
    """Calculate column widths based on column headers and contents"""
    if index:
        idx_max = max([len(str(s)) for s in df.index.values] + [len(str(df.index.name))])
        yield idx_max
    for c in df.columns:
        # Get max length of column contents and length of column header
        yield np.max([df[c].astype(str).str.len().max(), len(c)])
def init_parser():
    parser = argparse.ArgumentParser(prog=PROGRAM_NAME,
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description=PROGRAM_DESC)
    parser.add_argument('-i', '--input',
                        nargs=4,
                        metavar=('filepath', 'filename', 'filetype', 'sheetname'),
                        action='append',
                        required=True,
                        help='Filepath, filename, file type (one of "tabular", "tsv", "csv", "txt") and Excel worksheet name')
    parser.add_argument('-o', '--output',
                        required=True,
                        help='Output Excel XLSX file path')
    parser.add_argument('-v', '--verbose',
                        action='count',
                        default=0,
                        help='Logging verbosity level (-v == show warnings; -vvv == show debug info)')
    parser.add_argument('--no-sheet-name-index',
                        action='store_true',
                        help='Do not prepend an index number to each Excel worksheet name')
    parser.add_argument('--write-index',
                        action='store_true',
                        help='Write the DataFrame index column for each tabular file')
    parser.add_argument('--add-metadata-sheet',
                        action='store_true',
                        help='Add a worksheet with metadata about the tabular files written to the Excel workbook')
    parser.add_argument('-V', '--version',
                        action='version',
                        version='%(prog)s {}'.format(PROGRAM_VERSION))
    return parser
def main():
    parser = init_parser()
    if len(sys.argv[1:]) == 0:
        parser.print_help()
        parser.exit()
    args = parser.parse_args()
    init_console_logger(args.verbose)
    pd_to_excel_kwargs = {}
    if not args.write_index:
        # Do not write the DataFrame index unless explicitly requested
        pd_to_excel_kwargs['index'] = False
    logging.info('Starting to write tabular data to worksheets in Excel workbook')
    with pd.ExcelWriter(args.output, engine='xlsxwriter') as writer:
        metadata = []
        idx = 1
        for input_args in args.input:
            if not isinstance(input_args, (list, tuple)):
                logging.error('Input "%s" is not a list or tuple (type="%s"). Skipping...',
                              input_args,
                              type(input_args))
                continue
            filepath, filename, filetype, sheetname = input_args
            if filetype not in FILE_TYPE_TO_PD_READ:
                logging.error('Unhandled filetype "%s" for "%s". Skipping...',
                              filetype,
                              filename)
                continue
            logging.info('Reading table "%s" into Pandas DataFrame', filepath)
            df = FILE_TYPE_TO_PD_READ[filetype](filepath, dtype=str)
            # Replace characters that are not allowed in Excel worksheet names
            fixed_sheetname = REGEX_UNALLOWED_EXCEL_WS_CHARS.sub('_', sheetname)
            # Excel limits worksheet names to 31 characters; leave room for the
            # "<idx>_" prefix when one is added
            if args.no_sheet_name_index:
                max_chars = 31
                fixed_sheetname = fixed_sheetname[:max_chars]
            else:
                max_chars = 28
                fixed_sheetname = '{}_{}'.format(idx, fixed_sheetname[:max_chars])
            if len(fixed_sheetname) > max_chars:
                logging.warning('Sheetname "%s" is longer than %s characters so may be truncated (n=%s)',
                                fixed_sheetname,
                                max_chars,
                                len(fixed_sheetname))
            logging.info('Writing table to Excel sheet "%s"', fixed_sheetname)
            df.to_excel(writer, sheet_name=fixed_sheetname, **pd_to_excel_kwargs)
            # Auto-fit column widths using the underlying XlsxWriter worksheet
            worksheet = writer.book.get_worksheet_by_name(fixed_sheetname)
            for i, width in enumerate(get_col_widths(df, index=args.write_index)):
                worksheet.set_column(i, i, width)
            metadata.append((idx, filepath, filename, sheetname, fixed_sheetname))
            idx += 1
        logging.info('Wrote the following worksheets: %s', metadata)
        if args.add_metadata_sheet:
            df_md = pd.DataFrame(metadata)
            df_md.columns = METADATA_COLUMNS
            metadata_sheet_name = '__metadata__'
            df_md.to_excel(writer, index=False, sheet_name=metadata_sheet_name)
            worksheet = writer.book.get_worksheet_by_name(metadata_sheet_name)
            for i, width in enumerate(get_col_widths(df_md, index=False)):
                worksheet.set_column(i, i, width)
            logging.info('Wrote worksheet "%s" for storing input metadata.', metadata_sheet_name)
    logging.info('Done.')


if __name__ == '__main__':
    main()
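init_console_logger is imported from a local utils module that is not included in this gist. A minimal sketch of what such a helper could look like, assuming it only maps the -v count to a logging level consistent with the --verbose help text (the real utils implementation may differ):

import logging


def init_console_logger(verbosity=0):
    """Configure console logging; more -v flags lower the logging threshold.

    Assumed stand-in for the helper imported from utils.
    """
    # 0 -> ERROR, 1 -> WARNING, 2 -> INFO, 3+ -> DEBUG
    levels = [logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG]
    level = levels[min(verbosity, len(levels) - 1)]
    logging.basicConfig(level=level,
                        format='%(asctime)s %(levelname)s: %(message)s')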