Skip to content

Instantly share code, notes, and snippets.

@stuaxo
Last active August 26, 2022 01:20
Show Gist options
  • Save stuaxo/6670036 to your computer and use it in GitHub Desktop.
Save stuaxo/6670036 to your computer and use it in GitHub Desktop.
Output an excel workbook as a python dict.Information about the formatting/colour of cells is lost - which is OK if the sheet is sensible. Individual rows are output horizontally.Types are converted to python types.This is not comprehensive + just enough for the sheet I happen to be working with today.
import datetime
import xlrd
from pprint import PrettyPrinter
def trim_nones(li):
""" remove trailing Nones from a list """
while li and li[-1] is None:
li.pop()
return li
def clean_cell(cell, worksheet):
"""
Convert supported xlrd cell types into sensible python types.
"""
if not isinstance(cell, xlrd.sheet.Cell):
# Non cells can be passed through, for testing.
return cell
if cell.ctype in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK):
return None
elif cell.ctype == xlrd.XL_CELL_BOOLEAN:
return bool(cell.value)
elif cell.ctype == xlrd.XL_CELL_DATE:
workbook = worksheet.book
if not cell:
return None
dt_tuple = xlrd.xldate_as_tuple(float(cell.value), workbook.datemode)
ymd = dt_tuple[:3]
hms = dt_tuple[3:]
if ymd == (0, 0, 0):
# No date, so this is just a time.
return datetime.time(*hms)
# We have no way to know if midnight was specified in a datetime
# or if no time was specified and it was just a date, so default
# to a datetime:
return datetime.datetime(*dt_tuple)
elif cell.ctype == xlrd.XL_CELL_NUMBER:
return float(cell.value)
elif cell.ctype == xlrd.XL_CELL_TEXT:
return unicode(cell.value)
def clean_row(row, worksheet, **kwargs):
"""
Clean cells in a row.
:param row: Row to clean.
:param worksheet: Worksheet that row belongs to.
:param kwargs: May specify trim_nones to True to trim trailing None values.
"""
result = []
for cell in row:
result.append(clean_cell(cell, worksheet))
if kwargs.get('trim') == True:
return trim_nones(result)
else:
return result
def clean_sheet(worksheet, skip=False, **kwargs):
for curr_row in xrange(worksheet.nrows):
row = clean_row(worksheet.row(curr_row), worksheet, **kwargs)
if skip and (''.join([unicode(cell or '') for cell in row]) or '').strip() == '':
continue
yield row
def workbook_to_dict(workbook, worksheets=None, **kwargs):
result = {}
if worksheets is None:
worksheets = workbook.sheets()
for worksheet in worksheets:
result[worksheet.name] = list(clean_sheet(worksheet, **kwargs))
return result
class SheetPrinter(PrettyPrinter):
"""
Prints rows horizontally.
"""
def format(self, object, context, maxlevels, level):
if isinstance(object, list) and level == 2:
return object, True, False
else:
return PrettyPrinter.format(self, object, context,
maxlevels, level)
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(description='Excel workbook to python dict.')
parser.add_argument('workbook', help='Excel workbook to convert')
parser.add_argument('--trim', action='store_true', default=False, help='Trim trailing None values on Rows')
parser.add_argument('--skip', action='store_true', default=False, help='Skip empty Rows')
args = parser.parse_args()
if args.workbook:
workbook = xlrd.open_workbook(args.workbook)
SheetPrinter().pprint(workbook_to_dict(workbook, trim=args.trim, skip=args.skip))
@Grijesh-Chauhan
Copy link

I would suggest to add check for error cell in clean_cell function, check if cell.ctype == XL_CELL_ERROR: return either None, or ValueError("error value") or raise ValueError("Error fount in cell")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment