@peterk87
Created February 9, 2018 15:39
Write multiple tabular files to an Excel XLSX file
import argparse
import logging
import re
import sys

import numpy as np
import pandas as pd

# `utils` is a local helper module expected alongside this script (provides init_console_logger)
from utils import init_console_logger
PROGRAM_NAME = 'to_excel'
PROGRAM_VERSION = '0.1.0'
PROGRAM_DESC = """Write multiple tabular files to an Excel file
Each tabular file will be added to its own worksheet.
Handled file types:
- tab-delimited ("tabular", "tsv", "txt")
- comma-separated values ("csv")
Features:
- Width of columns automatically adjusted to match contents (i.e. "auto-fit")
- Optional metadata sheet with info about input tabular files, sheet names, etc
"""
# Regex to find characters that are not allowed in Excel worksheet names
REGEX_UNALLOWED_EXCEL_WS_CHARS = re.compile(r'[\\:/?*\[\]]+')
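# e.g. REGEX_UNALLOWED_EXCEL_WS_CHARS.sub('_', 'AMR/Plasmid [final]') -> 'AMR_Plasmid _final_'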
# Optional metadata worksheet column names
METADATA_COLUMNS = """
Index
File Path
Filename
Original Sheet Name
Final Sheet Name
""".strip().split('\n')
# File type to Pandas read function
FILE_TYPE_TO_PD_READ = dict(
    tabular=pd.read_table,
    csv=pd.read_csv,
    tsv=pd.read_table,
    txt=pd.read_table
)
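# Note: pd.read_table is equivalent to pd.read_csv with sep='\t'; some pandas
# releases flag read_table as deprecated, in which case
# functools.partial(pd.read_csv, sep='\t') is a drop-in replacement here.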


def get_col_widths(df, index=False):
    """Calculate column widths based on column headers and contents"""
    if index:
        idx_max = max([len(str(s)) for s in df.index.values] + [len(str(df.index.name))])
        yield idx_max
    for c in df.columns:
        # get max length of column contents and length of column header
        yield np.max([df[c].astype(str).str.len().max(), len(c)])
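# get_col_widths() is meant to be paired with xlsxwriter's set_column(), e.g.:
#   for i, width in enumerate(get_col_widths(df, index=False)):
#       worksheet.set_column(i, i, width)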


def init_parser():
    parser = argparse.ArgumentParser(prog=PROGRAM_NAME,
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description=PROGRAM_DESC)
    parser.add_argument('-i', '--input',
                        nargs=4,
                        metavar=('filepath', 'filename', 'filetype', 'sheetname'),
                        action='append',
                        required=True,
                        help='Filepath, filename, file type (one of "tabular", "tsv", "csv", "txt"), Excel worksheet name')
    parser.add_argument('-o', '--output',
                        required=True)
    parser.add_argument('-v', '--verbose',
                        action='count',
                        default=0,
                        help='Logging verbosity level (-v == show warnings; -vvv == show debug info)')
    parser.add_argument('--no-sheet-name-index',
                        action='store_true',
                        help='Do not add index to Excel worksheet name')
    parser.add_argument('--write-index',
                        action='store_true',
                        help='Write index for each tabular file')
    parser.add_argument('--add-metadata-sheet',
                        action='store_true',
                        help='Add worksheet with metadata about tabular data files written to Excel workbook')
    parser.add_argument('-V', '--version',
                        action='version',
                        version='%(prog)s {}'.format(PROGRAM_VERSION))
    return parser
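# Note: with action='append' and nargs=4, argparse collects each -i occurrence as a
# 4-item list, so args.input looks like (hypothetical values):
#   [['results/summary.tsv', 'summary.tsv', 'tsv', 'Summary'],
#    ['results/counts.csv', 'counts.csv', 'csv', 'Read Counts']]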


def main():
    parser = init_parser()
    if len(sys.argv[1:]) == 0:
        parser.print_help()
        parser.exit()
    args = parser.parse_args()
    init_console_logger(args.verbose)
    pd_to_excel_kwargs = {}
    if not args.write_index:
        pd_to_excel_kwargs['index'] = None
    logging.info('Starting to write tabular data to worksheets in Excel workbook')
    with pd.ExcelWriter(args.output, engine='xlsxwriter') as writer:
        metadata = []
        idx = 1
        for input in args.input:
            if not isinstance(input, (list, tuple)):
                logging.error('Input "%s" is not a list or tuple (type="%s"). Skipping...',
                              input,
                              type(input))
                continue
            filepath, filename, filetype, sheetname = input
            if filetype not in FILE_TYPE_TO_PD_READ:
                logging.error('Unhandled filetype "%s" for "%s". Skipping...',
                              filetype,
                              filename)
                continue
            logging.info('Reading table "%s" into Pandas DataFrame', filepath)
            df = FILE_TYPE_TO_PD_READ[filetype](filepath, dtype=str)
            fixed_sheetname = REGEX_UNALLOWED_EXCEL_WS_CHARS.sub('_', sheetname)
            # Excel limits worksheet names to 31 characters; reserve room for the
            # "<idx>_" prefix when a sheet name index is added
            if args.no_sheet_name_index:
                max_chars = 31
                fixed_sheetname = fixed_sheetname[:max_chars]
            else:
                max_chars = 28
                fixed_sheetname = '{}_{}'.format(idx, fixed_sheetname[:max_chars])
            if len(fixed_sheetname) > max_chars:
                logging.warning('Sheetname "%s" is longer than %s characters so may be truncated (n=%s)',
                                fixed_sheetname,
                                max_chars,
                                len(fixed_sheetname))
            logging.info('Writing table to Excel sheet "%s"', fixed_sheetname)
            df.to_excel(writer, sheet_name=fixed_sheetname, **pd_to_excel_kwargs)
            worksheet = writer.book.get_worksheet_by_name(fixed_sheetname)
            for i, width in enumerate(get_col_widths(df, index=args.write_index)):
                worksheet.set_column(i, i, width)
            metadata.append((idx, filepath, filename, sheetname, fixed_sheetname))
            idx += 1
        logging.info('Wrote the following worksheets: %s', metadata)
        if args.add_metadata_sheet:
            df_md = pd.DataFrame(metadata)
            df_md.columns = METADATA_COLUMNS
            metadata_sheet_name = '__metadata__'
            df_md.to_excel(writer, index=None, sheet_name=metadata_sheet_name)
            worksheet = writer.book.get_worksheet_by_name(metadata_sheet_name)
            for i, width in enumerate(get_col_widths(df_md, index=False)):
                worksheet.set_column(i, i, width)
            logging.info('Wrote worksheet "%s" for storing input metadata.', metadata_sheet_name)
    logging.info('Done.')


if __name__ == '__main__':
    main()