@peterk87
Created February 9, 2018 15:39
Write multiple tabular files to an Excel XLSX file
import argparse
import logging
import re
import sys

import numpy as np
import pandas as pd

# `utils` is a local helper module expected alongside this script (provides init_console_logger)
from utils import init_console_logger
PROGRAM_NAME = 'to_excel'
PROGRAM_VERSION = '0.1.0'
PROGRAM_DESC = """Write multiple tabular files to an Excel file
Each tabular file will be added to its own worksheet.
Handled file types:
- tab-delimited ("tabular", "tsv", "txt")
- comma-separated values ("csv")
Features:
- Width of columns automatically adjusted to match contents (i.e. "auto-fit")
- Optional metadata sheet with info about input tabular files, sheet names, etc
"""
# Regex to find characters that are not allowed in Excel worksheet names
REGEX_UNALLOWED_EXCEL_WS_CHARS = re.compile(r'[\\:/?*\[\]]+')
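# e.g. REGEX_UNALLOWED_EXCEL_WS_CHARS.sub('_', 'AMR/Plasmid [final]') -> 'AMR_Plasmid _final_'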
# Optional metadata worksheet column names
METADATA_COLUMNS = """
Index
File Path
Filename
Original Sheet Name
Final Sheet Name
""".strip().split('\n')
# File type to Pandas read function
FILE_TYPE_TO_PD_READ = dict(
    tabular=pd.read_table,
    csv=pd.read_csv,
    tsv=pd.read_table,
    txt=pd.read_table
)
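# Note: pd.read_table is equivalent to pd.read_csv with sep='\t'; some pandas
# releases flag read_table as deprecated, in which case
# functools.partial(pd.read_csv, sep='\t') is a drop-in replacement here.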


def get_col_widths(df, index=False):
    """Calculate column widths based on column headers and contents"""
    if index:
        idx_max = max([len(str(s)) for s in df.index.values] + [len(str(df.index.name))])
        yield idx_max
    for c in df.columns:
        # get max length of column contents and length of column header
        yield np.max([df[c].astype(str).str.len().max(), len(c)])
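# get_col_widths() is meant to be paired with xlsxwriter's set_column(), e.g.:
#   for i, width in enumerate(get_col_widths(df, index=False)):
#       worksheet.set_column(i, i, width)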


def init_parser():
    parser = argparse.ArgumentParser(prog=PROGRAM_NAME,
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description=PROGRAM_DESC)
    parser.add_argument('-i', '--input',
                        nargs=4,
                        metavar=('filepath', 'filename', 'filetype', 'sheetname'),
                        action='append',
                        required=True,
                        help='Filepath, filename, file type (one of "tabular", "tsv", "csv", "txt"), Excel worksheet name')
    parser.add_argument('-o', '--output',
                        required=True)
    parser.add_argument('-v', '--verbose',
                        action='count',
                        default=0,
                        help='Logging verbosity level (-v == show warnings; -vvv == show debug info)')
    parser.add_argument('--no-sheet-name-index',
                        action='store_true',
                        help='Do not add index to Excel worksheet name')
    parser.add_argument('--write-index',
                        action='store_true',
                        help='Write index for each tabular file')
    parser.add_argument('--add-metadata-sheet',
                        action='store_true',
                        help='Add worksheet with metadata about tabular data files written to Excel workbook')
    parser.add_argument('-V', '--version',
                        action='version',
                        version='%(prog)s {}'.format(PROGRAM_VERSION))
    return parser
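# Note: with action='append' and nargs=4, argparse collects each -i occurrence as a
# 4-item list, so args.input looks like (hypothetical values):
#   [['results/summary.tsv', 'summary.tsv', 'tsv', 'Summary'],
#    ['results/counts.csv', 'counts.csv', 'csv', 'Read Counts']]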


def main():
    parser = init_parser()
    if len(sys.argv[1:]) == 0:
        parser.print_help()
        parser.exit()
    args = parser.parse_args()
    init_console_logger(args.verbose)
    pd_to_excel_kwargs = {}
    if not args.write_index:
        pd_to_excel_kwargs['index'] = None
    logging.info('Starting to write tabular data to worksheets in Excel workbook')
    with pd.ExcelWriter(args.output, engine='xlsxwriter') as writer:
        metadata = []
        idx = 1
        for input in args.input:
            if not isinstance(input, (list, tuple)):
                logging.error('Input "%s" is not a list or tuple (type="%s"). Skipping...',
                              input,
                              type(input))
                continue
            filepath, filename, filetype, sheetname = input
            if filetype not in FILE_TYPE_TO_PD_READ:
                logging.error('Unhandled filetype "%s" for "%s". Skipping...',
                              filetype,
                              filename)
                continue
            logging.info('Reading table "%s" into Pandas DataFrame', filepath)
            df = FILE_TYPE_TO_PD_READ[filetype](filepath, dtype=str)
            fixed_sheetname = REGEX_UNALLOWED_EXCEL_WS_CHARS.sub('_', sheetname)
            # Excel limits worksheet names to 31 characters; reserve room for the
            # "<idx>_" prefix when a sheet name index is added
            if args.no_sheet_name_index:
                max_chars = 31
                fixed_sheetname = fixed_sheetname[:max_chars]
            else:
                max_chars = 28
                fixed_sheetname = '{}_{}'.format(idx, fixed_sheetname[:max_chars])
            if len(fixed_sheetname) > max_chars:
                logging.warning('Sheetname "%s" is longer than %s characters so may be truncated (n=%s)',
                                fixed_sheetname,
                                max_chars,
                                len(fixed_sheetname))
            logging.info('Writing table to Excel sheet "%s"', fixed_sheetname)
            df.to_excel(writer, sheet_name=fixed_sheetname, **pd_to_excel_kwargs)
            worksheet = writer.book.get_worksheet_by_name(fixed_sheetname)
            for i, width in enumerate(get_col_widths(df, index=args.write_index)):
                worksheet.set_column(i, i, width)
            metadata.append((idx, filepath, filename, sheetname, fixed_sheetname))
            idx += 1
        logging.info('Wrote the following worksheets: %s', metadata)
        if args.add_metadata_sheet:
            df_md = pd.DataFrame(metadata)
            df_md.columns = METADATA_COLUMNS
            metadata_sheet_name = '__metadata__'
            df_md.to_excel(writer, index=None, sheet_name=metadata_sheet_name)
            worksheet = writer.book.get_worksheet_by_name(metadata_sheet_name)
            for i, width in enumerate(get_col_widths(df_md, index=False)):
                worksheet.set_column(i, i, width)
            logging.info('Wrote worksheet "%s" for storing input metadata.', metadata_sheet_name)
    logging.info('Done.')


if __name__ == '__main__':
    main()