Last active
January 12, 2024 04:35
-
-
Save blakeNaccarato/078c45c4cf6dd6cff4e8dd3819dc9830 to your computer and use it in GitHub Desktop.
Generate data for different lab sections.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
From all Excel workbooks in the working directory, generate multiple distinct Excel | |
workbooks in subfolders with different data and computed results removed. Retain an | |
instructor's key for each distinct dataset. | |
- Remove cell names. | |
- Remove sheets matching "pivot". | |
- Multiply bold cells by a factor from a Gaussian distribution and unbold them. | |
- Clear italic cells. | |
- Clear table column formulas. | |
""" | |
import os | |
import random | |
from decimal import Decimal | |
from glob import glob | |
from typing import List, Tuple | |
import openpyxl | |
import pyxltab | |
from openpyxl.cell.cell import TYPE_NUMERIC, Cell | |
from openpyxl.styles import Font | |
from openpyxl.workbook.workbook import Workbook | |
# * ------------------------------------------------------------------------------ # * | |
# * MAIN * # | |
def main(): | |
""" | |
From all Excel workbooks in the working directory, generate multiple distinct Excel | |
workbooks in subfolders with different data and computed results removed. Retain an | |
instructor's key for each distinct dataset. | |
- Remove cell names. | |
- Remove sheets matching "pivot". | |
- Multiply bold cells by a factor from a Gaussian distribution and unbold them. | |
- Clear italic cells. | |
- Clear table column formulas. | |
""" | |
# Remove sheets matching "pivot". | |
match = "pivot" | |
# Of the cells modified, 95% will fall within 96% and 104% of the original value. | |
# Over 99% will fall between 94% and 106%. | |
factor_mean = 1 | |
factor_std = 0.02 | |
# Generate this many files with the suffix starting at a certain number and | |
# incrementing by one with each file. Useful for appending course section numbers. | |
num_files = 4 | |
suffix_begin = 1001 | |
in_files = glob("[!~$]*.xlsx") # Don't include temp XLSX files marked by "~$" | |
for in_file in in_files: | |
out_files_key, out_files_student = generate_out_files( | |
in_file, num_files, suffix_begin | |
) | |
for out_key, out_student in zip(out_files_key, out_files_student): | |
modify_book(in_file, out_key, out_student, match, factor_mean, factor_std) | |
# * ------------------------------------------------------------------------------ # * | |
# * PRIMARY FUNCTIONS * # | |
def generate_out_files( | |
in_filename: str, num_files: int, suffix_begin: int | |
) -> Tuple[List[str], List[str]]: | |
""" | |
Generates output filenames given an input filename. Make directories as needed. | |
""" | |
(in_file, ext) = os.path.splitext(in_filename) | |
out_root = os.path.abspath(in_file) | |
repeated_args = (out_root, in_file, num_files, suffix_begin, ext) | |
out_files_key = generate_out_files_helper(*repeated_args, additional_suffix="key") | |
out_files_student = generate_out_files_helper(*repeated_args) | |
return (out_files_key, out_files_student) | |
def modify_book( | |
book_path_in: str, | |
book_path_key: str, | |
book_path_student: str, | |
match: str, | |
factor_mean: float, | |
factor_std: float, | |
): | |
""" | |
From one master workbook, generate a workbook with different data and computed | |
results removed. "Bold" cells will be modified, "italic" cells will be cleared. | |
Sheets containing the word "pivot" will be deleted. | |
""" | |
# First just apply Gaussian scaling and save the instructor's key. | |
openpyxl_book = openpyxl.load_workbook(book_path_in) | |
gaussian_scale_bold(openpyxl_book, factor_mean, factor_std) | |
openpyxl_book.save(book_path_key) | |
# Then, clear all computed values and remove formatting. | |
remove_cell_names(openpyxl_book) | |
remove_matching_sheets(openpyxl_book, match) | |
format_and_clear_formulas(openpyxl_book) | |
openpyxl_book.save(book_path_student) | |
# * ------------------------------------------------------------------------------ # * | |
# * SECONDARY FUNCTIONS * # | |
def generate_out_files_helper( | |
out_root: str, | |
in_file: str, | |
num_files: int, | |
suffix_begin: int, | |
ext: str, | |
additional_suffix: str = "", | |
) -> List[str]: | |
""" | |
Generate output files based on the input file. | |
""" | |
if additional_suffix != "": | |
additional_suffix = "_" + additional_suffix | |
out_files = [ | |
os.path.join( | |
out_root, in_file + "_" + str(suffix_begin + num) + additional_suffix + ext | |
) | |
for num in range(num_files) | |
] | |
if not os.path.exists(out_root): | |
os.makedirs(out_root) | |
return out_files | |
def remove_cell_names(book: Workbook): | |
""" | |
Remove all defined names from the workbook. Does not remove table references. | |
""" | |
defined_names = book.defined_names | |
names = [dn.name for dn in defined_names.definedName] | |
for name in names: | |
del defined_names[name] | |
def remove_matching_sheets(book: Workbook, match: str): | |
""" | |
Remove all sheets in the workbook that match the supplied string. | |
""" | |
match = match.lower() | |
for name in book.sheetnames: | |
if match in name.lower(): | |
del book[name] | |
def gaussian_scale_bold(book: Workbook, factor_mean: float, factor_std: float): | |
""" | |
Get the instructor key workbok by modifying bold, numeric cell values by a random | |
factor from a Gaussian distribution. | |
""" | |
tables = pyxltab.get_tables(book) | |
tables_cells = [table.get_cells() for table in tables.values()] | |
for table_cells in tables_cells: | |
for col_cells in table_cells: | |
for cell in col_cells: | |
if cell.data_type == TYPE_NUMERIC and cell.font.bold: | |
modify_cell_gauss(cell, factor_mean, factor_std) | |
def format_and_clear_formulas(book: Workbook): | |
""" | |
Clear italic cells. Unbold an unitalicize. If a table column has a formula, clear | |
all cells in that column, whether or not they were italicized. Clear column | |
formulas. | |
""" | |
not_bold = Font(bold=False) | |
not_italic = Font(italic=False) | |
tables = pyxltab.get_tables(book) | |
tables_cells = [table.get_cells() for table in tables.values()] | |
for (table, table_cells) in zip(tables.values(), tables_cells): | |
for (col, col_cells) in zip(table.columns.values(), table_cells): | |
col_formula = col.openpyxl_column.calculatedColumnFormula | |
for cell in col_cells: | |
# Unbold | |
if cell.data_type == TYPE_NUMERIC and cell.font.bold: | |
cell.font = not_bold | |
# Clear italic cells. Unitalicize. | |
elif cell.font.italic: | |
cell.value = None | |
cell.font = not_italic | |
# If a table column has a formula, clear all cells in that column, | |
# whether or not they were italicized. | |
if cell.data_type == TYPE_NUMERIC and col_formula: | |
cell.value = None | |
# Clear column formulas. | |
if col_formula: | |
col.openpyxl_column.calculatedColumnFormula = None | |
# * ------------------------------------------------------------------------------ # * | |
# * TERTIARY FUNCTIONS * # | |
def modify_cell_gauss(cell: Cell, factor_mean: float, factor_std: float): | |
""" | |
Change cell value by a random factor from a Gaussian distribution. | |
""" | |
num_places = get_num_places(cell) | |
value = Decimal(cell.value) | |
factor = Decimal(random.gauss(factor_mean, factor_std)) | |
cell.value = (value * factor).quantize(num_places) | |
def get_num_places(cell: Cell) -> Decimal: | |
""" | |
For the given cell, get the number of places beyond the decimal point. Return a | |
pattern to be used with `Decimal.quantize(result)` to adjust other numbers to the | |
same precision. | |
""" | |
if "." in str(cell.value): | |
digits_after_decimal = len(str(cell.value).split(".")[-1]) | |
else: | |
digits_after_decimal = 0 | |
num_places = Decimal(10) ** -digits_after_decimal | |
return num_places | |
# * ------------------------------------------------------------------------------ # * | |
# * RUN MAIN * # | |
# Run the `main()` function defined above if this script is directly invoked. | |
if __name__ == "__main__": | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
openpyxl | |
pyxltab |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment