Last active
August 26, 2022 01:20
-
-
Save stuaxo/6670036 to your computer and use it in GitHub Desktop.
Output an excel workbook as a python dict.Information about the formatting/colour of cells is lost - which is OK if the sheet is sensible. Individual rows are output horizontally.Types are converted to python types.This is not comprehensive + just enough for the sheet I happen to be working with today.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime | |
import xlrd | |
from pprint import PrettyPrinter | |
def trim_nones(li): | |
""" remove trailing Nones from a list """ | |
while li and li[-1] is None: | |
li.pop() | |
return li | |
def clean_cell(cell, worksheet): | |
""" | |
Convert supported xlrd cell types into sensible python types. | |
""" | |
if not isinstance(cell, xlrd.sheet.Cell): | |
# Non cells can be passed through, for testing. | |
return cell | |
if cell.ctype in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK): | |
return None | |
elif cell.ctype == xlrd.XL_CELL_BOOLEAN: | |
return bool(cell.value) | |
elif cell.ctype == xlrd.XL_CELL_DATE: | |
workbook = worksheet.book | |
if not cell: | |
return None | |
dt_tuple = xlrd.xldate_as_tuple(float(cell.value), workbook.datemode) | |
ymd = dt_tuple[:3] | |
hms = dt_tuple[3:] | |
if ymd == (0, 0, 0): | |
# No date, so this is just a time. | |
return datetime.time(*hms) | |
# We have no way to know if midnight was specified in a datetime | |
# or if no time was specified and it was just a date, so default | |
# to a datetime: | |
return datetime.datetime(*dt_tuple) | |
elif cell.ctype == xlrd.XL_CELL_NUMBER: | |
return float(cell.value) | |
elif cell.ctype == xlrd.XL_CELL_TEXT: | |
return unicode(cell.value) | |
def clean_row(row, worksheet, **kwargs): | |
""" | |
Clean cells in a row. | |
:param row: Row to clean. | |
:param worksheet: Worksheet that row belongs to. | |
:param kwargs: May specify trim_nones to True to trim trailing None values. | |
""" | |
result = [] | |
for cell in row: | |
result.append(clean_cell(cell, worksheet)) | |
if kwargs.get('trim') == True: | |
return trim_nones(result) | |
else: | |
return result | |
def clean_sheet(worksheet, skip=False, **kwargs): | |
for curr_row in xrange(worksheet.nrows): | |
row = clean_row(worksheet.row(curr_row), worksheet, **kwargs) | |
if skip and (''.join([unicode(cell or '') for cell in row]) or '').strip() == '': | |
continue | |
yield row | |
def workbook_to_dict(workbook, worksheets=None, **kwargs): | |
result = {} | |
if worksheets is None: | |
worksheets = workbook.sheets() | |
for worksheet in worksheets: | |
result[worksheet.name] = list(clean_sheet(worksheet, **kwargs)) | |
return result | |
class SheetPrinter(PrettyPrinter): | |
""" | |
Prints rows horizontally. | |
""" | |
def format(self, object, context, maxlevels, level): | |
if isinstance(object, list) and level == 2: | |
return object, True, False | |
else: | |
return PrettyPrinter.format(self, object, context, | |
maxlevels, level) | |
if __name__ == '__main__': | |
import argparse | |
parser = argparse.ArgumentParser(description='Excel workbook to python dict.') | |
parser.add_argument('workbook', help='Excel workbook to convert') | |
parser.add_argument('--trim', action='store_true', default=False, help='Trim trailing None values on Rows') | |
parser.add_argument('--skip', action='store_true', default=False, help='Skip empty Rows') | |
args = parser.parse_args() | |
if args.workbook: | |
workbook = xlrd.open_workbook(args.workbook) | |
SheetPrinter().pprint(workbook_to_dict(workbook, trim=args.trim, skip=args.skip)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I would suggest to add check for error cell in
clean_cell
function, checkif cell.ctype == XL_CELL_ERROR:
return eitherNone
, orValueError("error value")
orraise ValueError("Error fount in cell")