Created
February 18, 2025 11:41
-
-
Save Sejmou/e12a674fff1198c89757ff1ef62abac8 to your computer and use it in GitHub Desktop.
A quick command-line script for finding the code cells in a Jupyter notebook that produce large outputs (and thereby inflate the overall notebook size).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
from typing import TypedDict

import nbformat as nbf
class CodeCellMeta(TypedDict):
    """Per-cell summary describing one code cell of a notebook."""

    # 1-based position of the cell, counting every cell in the notebook
    # (not only code cells).
    cell_num: int
    # Size estimate for the outputs stored with the cell.
    output_size_bytes: int
    # Leading lines of the cell's source (at most five), used as a preview.
    first_lines: list[str]
def get_code_cell_metadata(nb_path: str) -> "list[CodeCellMeta]":
    """Collect an output-size estimate and a source preview for each code cell.

    Args:
        nb_path: Path of the notebook file to read.

    Returns:
        One ``CodeCellMeta`` per code cell, in notebook order. ``cell_num`` is
        the 1-based position among all cells, so numbers may skip where
        non-code cells sit in between.
    """
    # NO_CONVERT asks nbformat not to convert the notebook to another
    # format version while reading it.
    ntbk = nbf.read(nb_path, nbf.NO_CONVERT)
    cell_metas: list[CodeCellMeta] = []
    for cell_num, cell in enumerate(ntbk.cells, start=1):
        if cell.cell_type != "code":
            continue
        # Measure the UTF-8 length of the JSON form of the outputs: that is
        # what ends up in the .ipynb file. The Python repr of the output list
        # counts characters (not bytes) and escapes differently. This is still
        # an estimate; on-disk indentation/line splitting is not reproduced.
        outputs_json = json.dumps(cell.get("outputs", []), ensure_ascii=False)
        # surrogatepass: lone surrogates decoded from "\udXXX" escapes must
        # not abort the size scan.
        output_size = len(outputs_json.encode("utf-8", "surrogatepass"))
        cell_metas.append(
            {
                "cell_num": cell_num,
                "output_size_bytes": output_size,
                "first_lines": cell.source.split("\n")[:5],
            }
        )
    return cell_metas
def human_readable_size(size_bytes: int) -> str:
    """Format a byte count with a binary (1024-based) unit suffix.

    Args:
        size_bytes: Number of bytes to format.

    Returns:
        The value with two decimals followed by the largest unit from
        B/KB/MB/GB/TB that keeps its magnitude below 1024; anything beyond
        the TB range is reported in PB.
    """
    size_current_unit: float = size_bytes
    for unit in ("B", "KB", "MB", "GB", "TB"):
        # Compare the magnitude so that negative values (e.g. a size delta)
        # are scaled to a sensible unit as well.
        if abs(size_current_unit) < 1024:
            return f"{size_current_unit:.2f} {unit}"
        size_current_unit /= 1024.0
    return f"{size_current_unit:.2f} PB"
def show_large_cells(nb_path: str) -> None:
    """Interactively list a notebook's code cells, largest outputs first.

    For each cell this prints its number, its formatted output size and the
    preview lines of its source, then waits for Enter before moving on. The
    prompt reports how much output size the not-yet-shown cells account for.

    Args:
        nb_path: Path of the notebook file to inspect.
    """
    cells_by_size = sorted(
        get_code_cell_metadata(nb_path),
        key=lambda meta: meta["output_size_bytes"],
        reverse=True,
    )
    if not cells_by_size:
        # Without this the function would finish without printing anything.
        print("No code cells found.")
        return

    bytes_remaining = sum(meta["output_size_bytes"] for meta in cells_by_size)
    last_index = len(cells_by_size) - 1
    for i, meta in enumerate(cells_by_size):
        print(f"Cell #{meta['cell_num']}: {human_readable_size(meta['output_size_bytes'])}")
        print("\n".join(meta["first_lines"]))
        print("\n")
        # After this subtraction the counter covers only cells not shown yet.
        bytes_remaining -= meta["output_size_bytes"]
        if i != last_index:
            input(
                f"Remaining cell outputs account for {human_readable_size(bytes_remaining)} total. Hit enter to view info for next cell."
            )
        else:
            print("No more cells to view.")
if __name__ == "__main__":
    import sys

    # Usage errors end the script with a plain message on stderr and exit
    # status 1 instead of an exception traceback.
    if len(sys.argv) < 2 or not sys.argv[1].endswith(".ipynb"):
        sys.exit("Please provide a path to a Jupyter notebook file.")
    nb_path = sys.argv[1]
    show_large_cells(nb_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment