Created
April 4, 2023 08:39
-
-
Save xyb/0a3023196a77cfd700f2c461d42b91b5 to your computer and use it in GitHub Desktop.
some functions to help read or generate excel files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import io | |
import logging | |
import openpyxl | |
logger = logging.getLogger("excel") | |
def excel_to_csv(path): | |
""" | |
Converts an Excel file to a list of CSV strings, | |
one for each sheet in the Excel file. | |
Args: | |
- path (str): The path to the Excel file to be converted. | |
Returns: | |
- result (list): A list of dictionaries, where each dictionary | |
contains the following keys: | |
- sheet_no (int): The number of the sheet in the Excel file. | |
- sheet_name (str): The name of the sheet in the Excel file. | |
- csv_content (str): The CSV content of the sheet. | |
Raises: | |
- FileNotFoundError: If the Excel file does not exist. | |
- openpyxl.utils.exceptions.InvalidFileException: If the Excel | |
file is not a valid Excel file. | |
""" | |
wb = openpyxl.load_workbook(path) | |
result = [] | |
for id, sheet_name in enumerate(wb.sheetnames): | |
sheet = wb[sheet_name] | |
csv_content = io.StringIO() | |
csv_writer = csv.writer(csv_content) | |
for row in sheet.iter_rows(values_only=True): | |
csv_writer.writerow(row) | |
result.append( | |
dict( | |
sheet_no=id + 1, | |
sheet_name=sheet_name, | |
csv_content=csv_content.getvalue(), | |
), | |
) | |
return result | |
def sheets_to_excel(path, sheets): | |
""" | |
Write all sheets into a excel file. | |
It takes in two parameters: path and sheets. The path parameter is the path | |
to the excel file. The sheets parameter is a list of tuples, where each tuple | |
contains the sheet name and the csv content. The csv content is in csv format, | |
where the first line is the field names and the other lines are the data itself. | |
Example Usage: | |
content = 'a,b,c\\n1,2,3\\n4,5,6\\n' | |
excel_path = '/tmp/test.xlsx' | |
sheets_to_excel(excel_path, [('test sheet', content)]) | |
Parameters: | |
- path (str): The path to the excel file. | |
- sheets (list): A list of tuples, where each tuple contains the sheet name | |
and the csv content. | |
Returns: | |
- None | |
>>> content = 'a,b,c\\n1,2,3\\n' | |
>>> excel_path = '/tmp/test.xlsx' | |
>>> sheets_to_excel(excel_path, [('test', content)]) | |
>>> excel_to_csv(excel_path) | |
[{'sheet_no': 1, 'sheet_name': 'test', 'csv_content': 'a,b,c\\r\\n1,2,3\\r\\n'}] | |
>>> sheets_to_excel(excel_path, []) | |
>>> excel_to_csv(excel_path) | |
[{'sheet_no': 1, 'sheet_name': 'Sheet', 'csv_content': ''}] | |
""" | |
workbook = openpyxl.Workbook() | |
default_sheet = workbook["Sheet"] | |
if sheets: | |
workbook.remove(default_sheet) | |
for sheet in sheets: | |
sheet_name, csv_content = sheet | |
worksheet = workbook.create_sheet(title=sheet_name) | |
reader = csv.reader(csv_content.split("\n")) | |
for row_index, row in enumerate(reader): | |
for col_index, cell_value in enumerate(row): | |
worksheet.cell( | |
row=row_index + 1, | |
column=col_index + 1, | |
value=cell_value, | |
) | |
workbook.save(path) | |
def merge_same_cell(xlsx_filename): | |
""" | |
>>> content = 'a,b,c\\na,b,e\\na,d,e' | |
>>> excel_path = '/tmp/test.xlsx' | |
>>> sheets_to_excel(excel_path, [('test', content)]) | |
>>> excel_to_csv(excel_path) | |
[... 'csv_content': 'a,b,c\\r\\na,b,e\\r\\na,d,e\\r\\n'}] | |
>>> merge_same_cell(excel_path) | |
>>> excel_to_csv(excel_path) | |
[... 'csv_content': 'a,b,c\\r\\n,,e\\r\\n,d,\\r\\n'}] | |
""" | |
wb = openpyxl.load_workbook(xlsx_filename) | |
for sheet in wb: | |
merge_list = [] | |
for col in sheet.columns: | |
start_row = 1 | |
for i in range(1, len(col)): | |
if col[i].value != col[i - 1].value: | |
if start_row != i: | |
cell_range = ( | |
sheet.cell(row=start_row, column=col[0].column).coordinate | |
+ ":" | |
+ sheet.cell(row=i, column=col[0].column).coordinate | |
) | |
merge_list.append(cell_range) | |
start_row = i + 1 | |
if start_row != len(col) + 1: | |
cell_range = ( | |
sheet.cell(row=start_row, column=col[0].column).coordinate | |
+ ":" | |
+ sheet.cell(row=len(col), column=col[0].column).coordinate | |
) | |
merge_list.append(cell_range) | |
for cell_range in merge_list: | |
if cell_range in sheet.merged_cells: | |
sheet.unmerge_cells(cell_range) | |
sheet.merge_cells(cell_range) | |
wb.save(xlsx_filename) | |
def adjust_col_width_to_fit_content(xlsx_filename): | |
""" | |
Adjusts the column width of an Excel file to fit its content. | |
Args: | |
- xlsx_filename (str): The path to the Excel file. | |
Returns: | |
- None | |
""" | |
wb = openpyxl.load_workbook(xlsx_filename) | |
for sheet in wb: | |
for col in sheet.columns: | |
max_length = 0 | |
column = col[0].column_letter # Get the column name | |
for cell in col: | |
try: # Necessary to avoid error on empty cells | |
if len(str(cell.value)) > max_length: | |
max_length = len(cell.value) | |
except Exception: | |
pass | |
adjusted_width = (max_length + 2) * 1.2 # Adjust the width | |
sheet.column_dimensions[column].width = adjusted_width | |
wb.save(xlsx_filename) | |
def auto_remove_unused_cols_and_rows(xlsx_filename): | |
wb = openpyxl.load_workbook(xlsx_filename) | |
for sheet in wb: | |
# Get the range of cells that contain data | |
used_range = sheet.calculate_dimension() | |
# Get the maximum row and column indices in the used range | |
max_row, min_col, max_col, min_row = openpyxl.utils.cell.range_boundaries( | |
used_range, | |
) | |
# Identify the header row | |
header_row = None | |
for row in range(min_row, max_row + 1): | |
if all(cell.value is None for cell in sheet[row]): | |
header_row = row | |
else: | |
break | |
# Delete any unused columns | |
for col in range(max_col, sheet.max_column + 1): | |
if all( | |
sheet[row][col].value is None for row in range(min_row, max_row + 1) | |
): | |
sheet.delete_cols(col) | |
# Delete any unused rows | |
if header_row is not None: | |
for row in range(max_row, header_row - 1, -1): | |
if all( | |
sheet[row][col].value is None for col in range(min_col, max_col + 1) | |
): | |
sheet.delete_rows(row) | |
else: | |
for row in range(max_row, 0, -1): | |
if all( | |
sheet[row][col].value is None for col in range(min_col, max_col + 1) | |
): | |
sheet.delete_rows(row) | |
# Unmerge any merged cells before deleting them | |
for merged_cell_range in sheet.merged_cells.ranges: | |
merged_cell_range.unmerge() | |
wb.save(xlsx_filename) | |
def auto_height_row(xlsx_filename): | |
wb = openpyxl.load_workbook(xlsx_filename) | |
for sheet in wb: | |
for row in sheet.rows: | |
max_height = 0 | |
for cell in row: | |
try: | |
if cell.value: | |
cell_height = len(str(cell.value)) * 1.2 | |
if cell_height > max_height: | |
max_height = cell_height | |
except Exception: | |
pass | |
if max_height > 0: | |
sheet.row_dimensions[row[0].row].height = max_height | |
wb.save(xlsx_filename) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment