Skip to content

Instantly share code, notes, and snippets.

@Sejmou
Created February 18, 2025 11:41
Show Gist options
  • Save Sejmou/e12a674fff1198c89757ff1ef62abac8 to your computer and use it in GitHub Desktop.
Save Sejmou/e12a674fff1198c89757ff1ef62abac8 to your computer and use it in GitHub Desktop.
A quick command line script for finding code cells in a notebook producing large outputs (inflating overall notebook size)
import nbformat as nbf
from typing import TypedDict
class CodeCellMeta(TypedDict):
    """Summary record describing a single code cell of a notebook."""

    # 1-based position of the cell among ALL cells of the notebook
    # (non-code cells are counted too).
    cell_num: int
    # Size estimate for the cell's outputs: the length of the string
    # representation of the outputs, not an exact on-disk byte count.
    output_size_bytes: int
    # Up to the first five lines of the cell's source code.
    first_lines: list[str]
def get_code_cell_metadata(nb_path: str):
    """Collect a size summary for every code cell of a notebook.

    Args:
        nb_path: Path of the notebook file, handed to ``nbformat.read``.

    Returns:
        A list with one ``CodeCellMeta`` dict per code cell, in notebook
        order. ``cell_num`` is the 1-based index among all cells (so numbers
        may skip where non-code cells sit), ``output_size_bytes`` is
        ``len(str(cell.outputs))`` and ``first_lines`` holds at most the
        first five lines of the cell source.
    """
    notebook = nbf.read(nb_path, nbf.NO_CONVERT)
    summaries: list[CodeCellMeta] = [
        {
            # Length of the outputs' string representation: a cheap proxy
            # for how much the outputs contribute to the notebook size.
            "output_size_bytes": len(str(cell.outputs)),
            "cell_num": position,
            "first_lines": cell.source.split("\n")[:5],
        }
        for position, cell in enumerate(notebook.cells, start=1)
        if cell.cell_type == "code"
    ]
    return summaries
def human_readable_size(size_bytes: int) -> str:
    """Format a byte count with a binary (1024-based) unit suffix.

    Args:
        size_bytes: The size to format, in bytes.

    Returns:
        The value scaled to the first unit in B, KB, MB, GB, TB, PB for which
        it is below 1024, printed with two decimals (e.g. ``"1.50 KB"``).
        Values of 1024 TB or more are always reported in PB.
    """
    suffixes = ("B", "KB", "MB", "GB", "TB", "PB")
    largest = len(suffixes) - 1
    scaled = size_bytes
    step = 0
    # Keep dividing until the value fits below 1024 or we run out of units.
    while step < largest and not (scaled < 1024):
        scaled = scaled / 1024.0
        step += 1
    return f"{scaled:.2f} {suffixes[step]}"
def show_large_cells(nb_path: str):
    """Interactively list a notebook's code cells, largest output first.

    For each code cell this prints its cell number, the estimated output
    size and the first lines of its source. Between cells it waits for the
    user to press enter, showing how much output size the cells not yet
    displayed account for in total.

    Args:
        nb_path: Path of the notebook file to inspect.
    """
    ranked = list(get_code_cell_metadata(nb_path))
    ranked.sort(key=lambda meta: meta["output_size_bytes"], reverse=True)

    # Running total of the output sizes of cells that were not shown yet.
    remaining = sum(meta["output_size_bytes"] for meta in ranked)
    last_index = len(ranked) - 1

    for index, meta in enumerate(ranked):
        size = meta["output_size_bytes"]
        print(f"Cell #{meta['cell_num']}: {human_readable_size(size)}")
        print("\n".join(meta["first_lines"]))
        print("\n")
        remaining -= size

        if index == last_index:
            print("No more cells to view.")
            continue

        # Pause so the user can step through the cells one at a time.
        input(
            f"Remaining cell outputs account for {human_readable_size(remaining)} total. Hit enter to view info for next cell."
        )
if __name__ == "__main__":
    import sys

    # Command-line entry point: expects the notebook path as the first
    # argument, e.g. `python <this script> my_notebook.ipynb`.
    usage_error = "Please provide a path to a Jupyter notebook file."

    # Only the argv lookup can raise IndexError, so keep the try body minimal.
    try:
        nb_path = sys.argv[1]
    except IndexError:
        # `from None` hides the internal IndexError so the user only sees the
        # actionable message instead of a chained traceback.
        raise ValueError(usage_error) from None

    if not nb_path.endswith(".ipynb"):
        raise ValueError(usage_error)

    show_large_cells(nb_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment