I had a similar problem and have created my own script based on Wayne's answer. You can pass it the path to a Jupyter notebook and it will print the code cells with the largest produced output ordered by the size.
For easier reference, the cell number, the size of the output it produced and the first few lines of its code are printed. You can skip through the code cells from largest to smallest output by hitting enter :)
Please be aware that you will need to run this script from the command line (otherwise the input() part won't work).
import nbformat as nbf
from typing import TypedDict
class CodeCellMeta(TypedDict):
    """Summary record describing a single code cell of a notebook."""

    # Estimated size of everything stored in the cell's outputs.
    output_size_bytes: int
    # 1-based position of the cell, counted over all cells of the notebook.
    cell_num: int
    # Leading lines of the cell's source (at most five), used as a preview.
    first_lines: list[str]
def get_code_cell_metadata(nb_path: str) -> "list[CodeCellMeta]":
    """Collect output-size metadata for every code cell of a notebook.

    Args:
        nb_path: Path to the notebook (``.ipynb``) file to inspect.

    Returns:
        One entry per code cell, in notebook order. ``cell_num`` is the
        1-based position among *all* cells (non-code cells are counted but
        not reported), ``output_size_bytes`` is the size of the cell's
        outputs, and ``first_lines`` holds up to the first five source lines.
    """
    # Imported locally so the function does not rely on a file-level import.
    import json

    ntbk = nbf.read(nb_path, nbf.NO_CONVERT)
    cell_metas: list[CodeCellMeta] = []
    for cell_num, cell in enumerate(ntbk.cells, start=1):
        if cell.cell_type != "code":
            continue
        # Measure the UTF-8 encoded JSON form of the outputs: that is what the
        # notebook file stores, so the result is really in bytes. The length
        # of ``str(cell.outputs)`` only counts characters of a Python repr.
        serialized = json.dumps(cell.outputs, ensure_ascii=False)
        meta: CodeCellMeta = {
            # "surrogatepass" keeps lone surrogates from aborting the scan.
            "output_size_bytes": len(serialized.encode("utf-8", "surrogatepass")),
            "cell_num": cell_num,
            "first_lines": cell.source.split("\n")[:5],
        }
        cell_metas.append(meta)
    return cell_metas
def human_readable_size(size_bytes: int) -> str:
    """Format a byte count with a binary (1024-based) unit and two decimals.

    Args:
        size_bytes: Number of bytes to format.

    Returns:
        A string such as ``"1.50 KB"``. Units run from ``B`` up to ``TB``;
        anything larger is expressed in ``PB``.
    """
    size_current_unit: float = size_bytes
    for unit in ("B", "KB", "MB", "GB", "TB"):
        # Compare the value as it will be displayed (two decimals). Otherwise
        # a value just below the threshold, e.g. 1023.999, would be printed
        # as "1024.00 KB" instead of moving on to "1.00 MB".
        if round(size_current_unit, 2) < 1024:
            return f"{size_current_unit:.2f} {unit}"
        size_current_unit /= 1024.0
    return f"{size_current_unit:.2f} PB"
def show_large_cells(nb_path: str) -> None:
    """Interactively list a notebook's code cells, largest output first.

    For each code cell this prints the cell number, the size of its output
    and the first lines of its source, then waits for the user to hit enter
    before showing the next one.

    Args:
        nb_path: Path to the notebook (``.ipynb``) file to inspect.
    """
    code_cell_meta = get_code_cell_metadata(nb_path)
    if not code_cell_meta:
        # Without this the function would return silently, leaving the user
        # unsure whether anything happened.
        print("No code cells found.")
        return

    cell_meta_by_size_est = sorted(
        code_cell_meta, key=lambda x: x["output_size_bytes"], reverse=True
    )
    bytes_remaining = sum(el["output_size_bytes"] for el in cell_meta_by_size_est)
    last_index = len(cell_meta_by_size_est) - 1

    for i, el in enumerate(cell_meta_by_size_est):
        print(f"Cell #{el['cell_num']}: {human_readable_size(el['output_size_bytes'])}")
        print("\n".join(el["first_lines"]))
        print("\n")
        bytes_remaining -= el["output_size_bytes"]

        if i == last_index:
            print("No more cells to view.")
            break

        try:
            input(
                f"Remaining cell outputs account for {human_readable_size(bytes_remaining)} total. Hit enter to view info for next cell."
            )
        except EOFError:
            # stdin is closed or not interactive (e.g. piped): keep listing
            # the remaining cells without pausing instead of crashing.
            print()
if __name__ == "__main__":
    import sys

    # Usage errors are reported as a one-line message on stderr with exit
    # status 1, rather than as a (chained) ValueError traceback.
    if len(sys.argv) < 2 or not sys.argv[1].endswith(".ipynb"):
        sys.exit("Please provide a path to a Jupyter notebook file.")

    nb_path = sys.argv[1]
    show_large_cells(nb_path)