Last active
January 2, 2025 19:44
-
-
Save dnk8n/925db332579a539f9dea37301906fd45 to your computer and use it in GitHub Desktop.
Use openpyxl to return a dictreader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
from typing import Any, Callable, Dict, Iterator, List, Optional, Union | |
from openpyxl import load_workbook | |
def xlsx_dictreader( | |
filename: Union[str, Path], | |
sheet_index: int = 0, | |
header_row_index: int = 1, | |
data_start_row_index: int = 2, | |
data_only: bool = True, | |
post_process_funcs: Optional[List[Callable[[Any], Any]]] = None, | |
null_vals: List[Union[None, str]] = [None, "None"], | |
) -> Iterator[Dict[str, Any]]: | |
book = load_workbook(filename, data_only=data_only) | |
sheet = book.worksheets[sheet_index] | |
header = [ | |
c | |
for c in (cell.value for cell in sheet[header_row_index]) | |
if c not in null_vals | |
] | |
if not post_process_funcs: | |
post_process_funcs = [lambda x: x] * len(header) | |
else: | |
if len(post_process_funcs) != len(header): | |
raise Exception( | |
"post-processing functions do not line up with headers" | |
) | |
for row_idx in range(data_start_row_index, sheet.max_row + 1): | |
candidate = {} | |
for col_idx in range(1, sheet.max_column + 1): | |
if col_idx <= len(header): | |
candidate[header[col_idx - 1]] = post_process_funcs[ | |
col_idx - 1 | |
](sheet.cell(row=row_idx, column=col_idx).value) | |
if not all(value in null_vals for value in candidate.values()): | |
yield candidate |
@splosarek I found and fixed another issue that caused the last dict item to be dropped.
Yeah, I found it too and applied exactly the same fix :)
Works beautifully! Thank you so much!
Glad it can be of use.
My advice is to build up some unit tests for your use case. Using code in the wild always comes with a bit of risk, so it is useful to guarantee so far as you can, that it is working well for your use case.
Don't just run and hope, as the initial author might have had another set of scenarios and not thought about the edge cases present in yours!
Thanks for the alert and for giving me an opportunity to see where I went wrong unknowingly :)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you! I'll work with it and let you know in case of issues.