blakeNaccarato · January 12, 2024 04:35
diff --git a/generate_lab_data.py b/generate_lab_data.py
 """
 From all Excel workbooks in the working directory, generate multiple distinct Excel
 workbooks in subfolders with different data and computed results removed. Retain an
 instructor's key for each distinct dataset.

 - Remove cell names.
 - Remove sheets matching "pivot".
 - Multiply bold cells by a factor from a Gaussian distribution and unbold them.
 - Clear italic cells.
 - Clear table column formulas.
 """


 import os
 import random
 from decimal import Decimal
 from glob import glob
 from typing import List, Tuple

 import openpyxl
 import pyxltab
 from openpyxl.cell.cell import TYPE_NUMERIC, Cell
 from openpyxl.styles import Font
 from openpyxl.workbook.workbook import Workbook


 # * ------------------------------------------------------------------------------ # *
 # * MAIN * #


 def main():
    """
    From all Excel workbooks in the working directory, generate multiple distinct Excel
    workbooks in subfolders with different data and computed results removed. Retain an
    instructor's key for each distinct dataset.

    - Remove cell names.
    - Remove sheets matching "pivot".
    - Multiply bold cells by a factor from a Gaussian distribution and unbold them.
    - Clear italic cells.
    - Clear table column formulas.
    """

    # Remove sheets matching "pivot".
    match = "pivot"

    # Of the cells modified, 95% will fall within 96% and 104% of the original value.
    # Over 99% will fall between 94% and 106%.
    factor_mean = 1
    factor_std = 0.02

    # Generate this many files with the suffix starting at a certain number and
    # incrementing by one with each file. Useful for appending course section numbers.
    num_files = 4
    suffix_begin = 1001

    in_files = glob("[!~$]*.xlsx")  # Don't include temp XLSX files marked by "~$"
    for in_file in in_files:
        out_files_key, out_files_student = generate_out_files(
            in_file, num_files, suffix_begin
        )
        for out_key, out_student in zip(out_files_key, out_files_student):
            modify_book(in_file, out_key, out_student, match, factor_mean, factor_std)


 # * ------------------------------------------------------------------------------ # *
 # * PRIMARY FUNCTIONS * #


 def generate_out_files(
    in_filename: str, num_files: int, suffix_begin: int
 ) -> Tuple[List[str], List[str]]:
    """
    Generates output filenames given an input filename. Make directories as needed.
    """

    (in_file, ext) = os.path.splitext(in_filename)
    out_root = os.path.abspath(in_file)
    repeated_args = (out_root, in_file, num_files, suffix_begin, ext)

    out_files_key = generate_out_files_helper(*repeated_args, additional_suffix="key")
    out_files_student = generate_out_files_helper(*repeated_args)

    return (out_files_key, out_files_student)


 def modify_book(
    book_path_in: str,
    book_path_key: str,
    book_path_student: str,
    match: str,
    factor_mean: float,
    factor_std: float,
 ):
    """
    From one master workbook, generate a workbook with different data and computed
    results removed. "Bold" cells will be modified, "italic" cells will be cleared.
    Sheets containing the word "pivot" will be deleted.
    """

    # First just apply Gaussian scaling and save the instructor's key.
    openpyxl_book = openpyxl.load_workbook(book_path_in)
    gaussian_scale_bold(openpyxl_book, factor_mean, factor_std)
    openpyxl_book.save(book_path_key)

    # Then, clear all computed values and remove formatting.
    remove_cell_names(openpyxl_book)
    remove_matching_sheets(openpyxl_book, match)
    format_and_clear_formulas(openpyxl_book)
    openpyxl_book.save(book_path_student)


 # * ------------------------------------------------------------------------------ # *
 # * SECONDARY FUNCTIONS * #


 def generate_out_files_helper(
    out_root: str,
    in_file: str,
    num_files: int,
    suffix_begin: int,
    ext: str,
    additional_suffix: str = "",
 ) -> List[str]:
    """
    Generate output files based on the input file.
    """

    if additional_suffix != "":
        additional_suffix = "_" + additional_suffix

    out_files = [
        os.path.join(
            out_root, in_file + "_" + str(suffix_begin + num) + additional_suffix + ext
        )
        for num in range(num_files)
    ]

    if not os.path.exists(out_root):
        os.makedirs(out_root)

    return out_files


 def remove_cell_names(book: Workbook):
    """
    Remove all defined names from the workbook. Does not remove table references.
    """

    defined_names = book.defined_names
    names = [dn.name for dn in defined_names.definedName]
    for name in names:
        del defined_names[name]


 def remove_matching_sheets(book: Workbook, match: str):
    """
    Remove all sheets in the workbook that match the supplied string.
    """

    match = match.lower()
    for name in book.sheetnames:
        if match in name.lower():
            del book[name]


 def gaussian_scale_bold(book: Workbook, factor_mean: float, factor_std: float):
    """
    Get the instructor key workbok by modifying bold, numeric cell values by a random
    factor from a Gaussian distribution.
    """

    tables = pyxltab.get_tables(book)
    tables_cells = [table.get_cells() for table in tables.values()]

    for table_cells in tables_cells:
        for col_cells in table_cells:
            for cell in col_cells:
                if cell.data_type == TYPE_NUMERIC and cell.font.bold:
                    modify_cell_gauss(cell, factor_mean, factor_std)


 def format_and_clear_formulas(book: Workbook):
    """
    Clear italic cells. Unbold an unitalicize. If a table column has a formula, clear
    all cells in that column, whether or not they were italicized. Clear column
    formulas.
    """

    not_bold = Font(bold=False)
    not_italic = Font(italic=False)
    tables = pyxltab.get_tables(book)
    tables_cells = [table.get_cells() for table in tables.values()]

    for (table, table_cells) in zip(tables.values(), tables_cells):
        for (col, col_cells) in zip(table.columns.values(), table_cells):
            col_formula = col.openpyxl_column.calculatedColumnFormula
            for cell in col_cells:

                # Unbold
                if cell.data_type == TYPE_NUMERIC and cell.font.bold:
                    cell.font = not_bold

                # Clear italic cells. Unitalicize.
                elif cell.font.italic:
                    cell.value = None
                    cell.font = not_italic

                # If a table column has a formula, clear all cells in that column,
                # whether or not they were italicized.
                if cell.data_type == TYPE_NUMERIC and col_formula:
                    cell.value = None

            # Clear column formulas.
            if col_formula:
                col.openpyxl_column.calculatedColumnFormula = None


 # * ------------------------------------------------------------------------------ # *
 # * TERTIARY FUNCTIONS * #


 def modify_cell_gauss(cell: Cell, factor_mean: float, factor_std: float):
    """
    Change cell value by a random factor from a Gaussian distribution.
    """

    num_places = get_num_places(cell)
    value = Decimal(cell.value)
    factor = Decimal(random.gauss(factor_mean, factor_std))
    cell.value = (value * factor).quantize(num_places)


 def get_num_places(cell: Cell) -> Decimal:
    """
    For the given cell, get the number of places beyond the decimal point. Return a
    pattern to be used with `Decimal.quantize(result)` to adjust other numbers to the
    same precision.
    """

    if "." in str(cell.value):
        digits_after_decimal = len(str(cell.value).split(".")[-1])
    else:
        digits_after_decimal = 0
    num_places = Decimal(10) ** -digits_after_decimal
    return num_places


 # * ------------------------------------------------------------------------------ # *
 # * RUN MAIN * #

 # Run the `main()` function defined above if this script is directly invoked.
 if __name__ == "__main__":
    main()
diff --git a/requirements.txt b/requirements.txt
 openpyxl
 pyxltab
	"""
	From all Excel workbooks in the working directory, generate multiple distinct Excel
	workbooks in subfolders with different data and computed results removed. Retain an
	instructor's key for each distinct dataset.

	- Remove cell names.
	- Remove sheets matching "pivot".
	- Multiply bold cells by a factor from a Gaussian distribution and unbold them.
	- Clear italic cells.
	- Clear table column formulas.
	"""


	import os
	import random
	from decimal import Decimal
	from glob import glob
	from typing import List, Tuple

	import openpyxl
	import pyxltab
	from openpyxl.cell.cell import TYPE_NUMERIC, Cell
	from openpyxl.styles import Font
	from openpyxl.workbook.workbook import Workbook


	# * ------------------------------------------------------------------------------ # *
	# * MAIN * #


	def main():
	"""
	From all Excel workbooks in the working directory, generate multiple distinct Excel
	workbooks in subfolders with different data and computed results removed. Retain an
	instructor's key for each distinct dataset.

	- Remove cell names.
	- Remove sheets matching "pivot".
	- Multiply bold cells by a factor from a Gaussian distribution and unbold them.
	- Clear italic cells.
	- Clear table column formulas.
	"""

	# Remove sheets matching "pivot".
	match = "pivot"

	# Of the cells modified, 95% will fall within 96% and 104% of the original value.
	# Over 99% will fall between 94% and 106%.
	factor_mean = 1
	factor_std = 0.02

	# Generate this many files with the suffix starting at a certain number and
	# incrementing by one with each file. Useful for appending course section numbers.
	num_files = 4
	suffix_begin = 1001

	in_files = glob("[!~$]*.xlsx") # Don't include temp XLSX files marked by "~$"
	for in_file in in_files:
	out_files_key, out_files_student = generate_out_files(
	in_file, num_files, suffix_begin
	)
	for out_key, out_student in zip(out_files_key, out_files_student):
	modify_book(in_file, out_key, out_student, match, factor_mean, factor_std)


	# * ------------------------------------------------------------------------------ # *
	# * PRIMARY FUNCTIONS * #


	def generate_out_files(
	in_filename: str, num_files: int, suffix_begin: int
	) -> Tuple[List[str], List[str]]:
	"""
	Generates output filenames given an input filename. Make directories as needed.
	"""

	(in_file, ext) = os.path.splitext(in_filename)
	out_root = os.path.abspath(in_file)
	repeated_args = (out_root, in_file, num_files, suffix_begin, ext)

	out_files_key = generate_out_files_helper(*repeated_args, additional_suffix="key")
	out_files_student = generate_out_files_helper(*repeated_args)

	return (out_files_key, out_files_student)


	def modify_book(
	book_path_in: str,
	book_path_key: str,
	book_path_student: str,
	match: str,
	factor_mean: float,
	factor_std: float,
	):
	"""
	From one master workbook, generate a workbook with different data and computed
	results removed. "Bold" cells will be modified, "italic" cells will be cleared.
	Sheets containing the word "pivot" will be deleted.
	"""

	# First just apply Gaussian scaling and save the instructor's key.
	openpyxl_book = openpyxl.load_workbook(book_path_in)
	gaussian_scale_bold(openpyxl_book, factor_mean, factor_std)
	openpyxl_book.save(book_path_key)

	# Then, clear all computed values and remove formatting.
	remove_cell_names(openpyxl_book)
	remove_matching_sheets(openpyxl_book, match)
	format_and_clear_formulas(openpyxl_book)
	openpyxl_book.save(book_path_student)


	# * ------------------------------------------------------------------------------ # *
	# * SECONDARY FUNCTIONS * #


	def generate_out_files_helper(
	out_root: str,
	in_file: str,
	num_files: int,
	suffix_begin: int,
	ext: str,
	additional_suffix: str = "",
	) -> List[str]:
	"""
	Generate output files based on the input file.
	"""

	if additional_suffix != "":
	additional_suffix = "_" + additional_suffix

	out_files = [
	os.path.join(
	out_root, in_file + "_" + str(suffix_begin + num) + additional_suffix + ext
	)
	for num in range(num_files)
	]

	if not os.path.exists(out_root):
	os.makedirs(out_root)

	return out_files


	def remove_cell_names(book: Workbook):
	"""
	Remove all defined names from the workbook. Does not remove table references.
	"""

	defined_names = book.defined_names
	names = [dn.name for dn in defined_names.definedName]
	for name in names:
	del defined_names[name]


	def remove_matching_sheets(book: Workbook, match: str):
	"""
	Remove all sheets in the workbook that match the supplied string.
	"""

	match = match.lower()
	for name in book.sheetnames:
	if match in name.lower():
	del book[name]


	def gaussian_scale_bold(book: Workbook, factor_mean: float, factor_std: float):
	"""
	Get the instructor key workbok by modifying bold, numeric cell values by a random
	factor from a Gaussian distribution.
	"""

	tables = pyxltab.get_tables(book)
	tables_cells = [table.get_cells() for table in tables.values()]

	for table_cells in tables_cells:
	for col_cells in table_cells:
	for cell in col_cells:
	if cell.data_type == TYPE_NUMERIC and cell.font.bold:
	modify_cell_gauss(cell, factor_mean, factor_std)


	def format_and_clear_formulas(book: Workbook):
	"""
	Clear italic cells. Unbold an unitalicize. If a table column has a formula, clear
	all cells in that column, whether or not they were italicized. Clear column
	formulas.
	"""

	not_bold = Font(bold=False)
	not_italic = Font(italic=False)
	tables = pyxltab.get_tables(book)
	tables_cells = [table.get_cells() for table in tables.values()]

	for (table, table_cells) in zip(tables.values(), tables_cells):
	for (col, col_cells) in zip(table.columns.values(), table_cells):
	col_formula = col.openpyxl_column.calculatedColumnFormula
	for cell in col_cells:

	# Unbold
	if cell.data_type == TYPE_NUMERIC and cell.font.bold:
	cell.font = not_bold

	# Clear italic cells. Unitalicize.
	elif cell.font.italic:
	cell.value = None
	cell.font = not_italic

	# If a table column has a formula, clear all cells in that column,
	# whether or not they were italicized.
	if cell.data_type == TYPE_NUMERIC and col_formula:
	cell.value = None

	# Clear column formulas.
	if col_formula:
	col.openpyxl_column.calculatedColumnFormula = None


	# * ------------------------------------------------------------------------------ # *
	# * TERTIARY FUNCTIONS * #


	def modify_cell_gauss(cell: Cell, factor_mean: float, factor_std: float):
	"""
	Change cell value by a random factor from a Gaussian distribution.
	"""

	num_places = get_num_places(cell)
	value = Decimal(cell.value)
	factor = Decimal(random.gauss(factor_mean, factor_std))
	cell.value = (value * factor).quantize(num_places)


	def get_num_places(cell: Cell) -> Decimal:
	"""
	For the given cell, get the number of places beyond the decimal point. Return a
	pattern to be used with `Decimal.quantize(result)` to adjust other numbers to the
	same precision.
	"""

	if "." in str(cell.value):
	digits_after_decimal = len(str(cell.value).split(".")[-1])
	else:
	digits_after_decimal = 0
	num_places = Decimal(10) ** -digits_after_decimal
	return num_places


	# * ------------------------------------------------------------------------------ # *
	# * RUN MAIN * #

	# Run the `main()` function defined above if this script is directly invoked.
	if __name__ == "__main__":
	main()
No results found