nateraw · November 29, 2022 07:40
diff --git a/convert_notebook_to_mdx.py b/convert_notebook_to_mdx.py
 from argparse import ArgumentParser
 from base64 import b64decode
 from io import BytesIO
 from pathlib import Path

 import nbformat
 from doc_builder.style_doc import format_code_example
 from huggingface_hub import create_repo, delete_file, list_repo_files, upload_file
 from PIL import Image


 def has_transparency(img):
    """Return True if a PIL image carries any transparency information.

    Adapted from https://stackoverflow.com/a/58567453

    Args:
        img: A PIL image (any object exposing ``info``, ``mode`` and ``getextrema()``).

    Returns:
        bool: True when ``img.info`` declares a ``transparency`` entry, or when the image has an
        alpha band (``RGBA`` or ``LA`` mode) containing at least one pixel that is not fully opaque.
    """
    # Palette / tRNS style transparency is advertised through the info dict
    if img.info.get("transparency") is not None:
        return True

    # Alpha is the last band in both modes. Grayscale+alpha ("LA") must be covered as well,
    # otherwise callers flatten such images to an opaque format and lose the alpha channel.
    if img.mode in ("RGBA", "LA"):
        alpha_min = img.getextrema()[-1][0]
        return alpha_min < 255

    # NOTE: no palette ("P") scan is needed here. Once the info check above has failed there is
    # no transparent palette index left to compare the palette colors against.
    return False


 def nb_to_mdx(
    notebook_file: str,
    repo_id: str = "nateraw/test-doc-assets",
    revision: str = None,
    max_len: int = 119,
    dest_file: str = None,
    token: str = None,
    remove_unused_images: bool = True,
 ):
    """Convert a notebook file to MDX format.

    Markdown cells are copied as-is, code cells become fenced python blocks, and their stdout,
    ``text/plain`` and ``image/png`` outputs are appended after them. Images are stored in a
    Hugging Face dataset repo and referenced from the MDX file via URLs.

    Args:
        notebook_file (str): Path to notebook file to convert to MDX
        repo_id (str, optional): Hugging Face Repo ID to upload/reference image assets.
        revision (str, optional): Hugging Face Repo Revision to upload/reference image assets.
        max_len (int, optional): Max length of code lines.
        dest_file (str, optional): Path to save output MDX file. Defaults to the notebook path
            with its suffix replaced by ``.mdx``.
        token (str, optional): Hugging Face Token. Uses stored one by default if available.
        remove_unused_images (bool, optional): Remove unused images from the repo (Recommended).
    """
    # Create destination repo if it doesn't exist (using the same credentials as the uploads)
    create_repo(repo_id, token=token, exist_ok=True, repo_type="dataset")

    notebook = nbformat.read(notebook_file, as_version=4)
    notebook_path = Path(notebook_file)
    notebook_name = notebook_path.stem
    content = []
    all_image_filenames = []
    for cell_idx, cell in enumerate(notebook["cells"]):
        if cell["cell_type"] == "markdown":
            content.append(cell["source"])
            continue
        if cell["cell_type"] != "code":
            # Any other cell type is emitted verbatim inside a plain fence
            content.append(f"```\n{cell['source']}\n```")
            continue

        # Handle input cells, which are just python if cell type is code
        code = format_code_example(cell["source"], max_len=max_len)[0]
        content.append(f"```python\n{code}\n```")

        # Handle output cells
        for output_idx, output in enumerate(cell["outputs"]):
            # In notebook format v4 the MIME payloads of rich outputs live under the "data" key
            data = output.get("data", {})
            if "text" in output and output.get("name", None) == "stdout":
                output_text = output["text"].strip()
                output_text = f"```python out\n{output_text}\n```"
            elif "image/png" in data:
                # Checked before text/plain so an output that carries both a picture and a
                # plain-text fallback is rendered as the picture.
                filename = _upload_output_image(
                    data["image/png"],
                    f"{notebook_name}_cell_{cell_idx}_output_{output_idx}",
                    repo_id,
                    revision,
                    token,
                )
                all_image_filenames.append(filename)

                # Set output image reference to uploaded Hub link (as markdown)
                img_url = f"https://huggingface.co/datasets/{repo_id}/resolve/{revision or 'main'}/{filename}"
                output_text = f"![img]({img_url})\n"
            elif "text/plain" in data:
                output_text = data["text/plain"].strip()
                output_text = f"```python out\n{output_text}\n```"
            else:
                continue

            content.append(output_text)

    # with_suffix only touches the final extension, and never maps the notebook onto itself
    dest_path = Path(dest_file) if dest_file is not None else notebook_path.with_suffix(".mdx")
    dest_path.parent.mkdir(exist_ok=True, parents=True)
    dest_path.write_text("\n\n".join(content), encoding="utf-8")

    # If you update the notebook, the image names previously uploaded will be out of sync with the notebook
    # So, you probably will want to delete those.
    # NOTE - would be better to rename here. This is a quick hack.
    if remove_unused_images:
        repo_files = list_repo_files(repo_id=repo_id, revision=revision, repo_type="dataset", token=token)
        # Match on the full "<notebook>_cell_" prefix so images that belong to another notebook
        # whose name merely starts with this notebook's name are left alone.
        image_prefix = f"{notebook_name}_cell_"
        repo_images = {
            x for x in repo_files if x.startswith(image_prefix) and x.endswith((".png", ".jpg", ".jpeg"))
        }
        unused_image_filenames = sorted(repo_images - set(all_image_filenames))
        if unused_image_filenames:
            print("\nRemoving the Following Unused/Renamed Image Files From the Hub Repo:")
            print("\t- " + "\n\t- ".join(unused_image_filenames))

            for unused_image_filename in unused_image_filenames:
                # Delete from the same revision the files were listed on
                delete_file(
                    path_in_repo=unused_image_filename,
                    repo_id=repo_id,
                    token=token,
                    repo_type="dataset",
                    revision=revision,
                )
        else:
            print("No Unused/Renamed Image Files Found")


 def _upload_output_image(png_b64, basename, repo_id, revision, token):
    """Decode a base64 PNG output, upload it to the Hub dataset repo and return its filename."""
    im = Image.open(BytesIO(b64decode(png_b64)))
    image_format = im.format
    if not has_transparency(im):
        # Fully opaque images are stored as JPEG
        im = im.convert("RGB")
        image_format = "JPEG"

    # Save image to buffer
    # TODO - add some image filesize optimization here?
    filename = f"{basename}.{image_format.lower()}"
    temp_fileobj = BytesIO()
    im.save(temp_fileobj, format=image_format)
    # Rewind so the upload reads the image from its first byte
    temp_fileobj.seek(0)

    # Upload fileobj to Hugging Face Hub
    upload_file(
        path_or_fileobj=temp_fileobj,
        path_in_repo=filename,  # TODO Should we use relpath/subdir?
        repo_id=repo_id,
        revision=revision,
        token=token,
        repo_type="dataset",
    )
    return filename


 def parse_args(args=None):
    """Build the command line interface and parse ``args``.

    Args:
        args (list, optional): Argument strings to parse. When None, argparse reads ``sys.argv``.

    Returns:
        argparse.Namespace: Parsed options, named after the keyword arguments of ``nb_to_mdx``.
    """
    # One (flag, add_argument keyword arguments) entry per option, in help-output order
    option_table = (
        ("--notebook_file", dict(type=str, required=True, help="Path to notebook file to convert to MDX")),
        (
            "--repo_id",
            dict(
                type=str,
                default="nateraw/test-doc-assets",
                help="Hugging Face Repo ID to upload/reference image assets",
            ),
        ),
        (
            "--revision",
            dict(type=str, default=None, help="Hugging Face Repo Revision to upload/reference image assets"),
        ),
        ("--max_len", dict(type=int, default=119, help="Max length of code lines")),
        ("--dest_file", dict(type=str, default=None, help="Path to save output MDX file")),
        (
            "--token",
            dict(type=str, default=None, help="Hugging Face Token. Uses stored one by default if available"),
        ),
        (
            # Negative flag: stores False into remove_unused_images, which otherwise defaults to True
            "--do_not_remove_unused_images",
            dict(
                action="store_false",
                help="By default we remove unused images from the Hugging Face Hub repo. Provide this flag to NOT do this.",
                dest="remove_unused_images",
            ),
        ),
    )

    cli = ArgumentParser()
    for flag, add_argument_kwargs in option_table:
        cli.add_argument(flag, **add_argument_kwargs)
    return cli.parse_args(args=args)


 if __name__ == "__main__":
    # Script entry point: every parsed CLI option maps onto an nb_to_mdx keyword argument
    cli_options = vars(parse_args())
    nb_to_mdx(**cli_options)
	from argparse import ArgumentParser
	from base64 import b64decode
	from io import BytesIO
	from pathlib import Path

	import nbformat
	from doc_builder.style_doc import format_code_example
	from huggingface_hub import create_repo, delete_file, list_repo_files, upload_file
	from PIL import Image


	def has_transparency(img):
	    """Return True if a PIL image carries any transparency information.

	    Adapted from https://stackoverflow.com/a/58567453

	    Args:
	        img: A PIL image (any object exposing ``info``, ``mode`` and ``getextrema()``).

	    Returns:
	        bool: True when ``img.info`` declares a ``transparency`` entry, or when the image has an
	        alpha band (``RGBA`` or ``LA`` mode) containing at least one pixel that is not fully opaque.
	    """
	    # Palette / tRNS style transparency is advertised through the info dict
	    if img.info.get("transparency") is not None:
	        return True

	    # Alpha is the last band in both modes. Grayscale+alpha ("LA") must be covered as well,
	    # otherwise callers flatten such images to an opaque format and lose the alpha channel.
	    if img.mode in ("RGBA", "LA"):
	        alpha_min = img.getextrema()[-1][0]
	        return alpha_min < 255

	    # NOTE: no palette ("P") scan is needed here. Once the info check above has failed there is
	    # no transparent palette index left to compare the palette colors against.
	    return False


	def nb_to_mdx(
	    notebook_file: str,
	    repo_id: str = "nateraw/test-doc-assets",
	    revision: str = None,
	    max_len: int = 119,
	    dest_file: str = None,
	    token: str = None,
	    remove_unused_images: bool = True,
	):
	    """Convert a notebook file to MDX format.

	    Markdown cells are copied as-is, code cells become fenced python blocks, and their stdout,
	    ``text/plain`` and ``image/png`` outputs are appended after them. Images are stored in a
	    Hugging Face dataset repo and referenced from the MDX file via URLs.

	    Args:
	        notebook_file (str): Path to notebook file to convert to MDX
	        repo_id (str, optional): Hugging Face Repo ID to upload/reference image assets.
	        revision (str, optional): Hugging Face Repo Revision to upload/reference image assets.
	        max_len (int, optional): Max length of code lines.
	        dest_file (str, optional): Path to save output MDX file. Defaults to the notebook path
	            with its suffix replaced by ``.mdx``.
	        token (str, optional): Hugging Face Token. Uses stored one by default if available.
	        remove_unused_images (bool, optional): Remove unused images from the repo (Recommended).
	    """
	    # Create destination repo if it doesn't exist (using the same credentials as the uploads)
	    create_repo(repo_id, token=token, exist_ok=True, repo_type="dataset")

	    notebook = nbformat.read(notebook_file, as_version=4)
	    notebook_path = Path(notebook_file)
	    notebook_name = notebook_path.stem
	    content = []
	    all_image_filenames = []
	    for cell_idx, cell in enumerate(notebook["cells"]):
	        if cell["cell_type"] == "markdown":
	            content.append(cell["source"])
	            continue
	        if cell["cell_type"] != "code":
	            # Any other cell type is emitted verbatim inside a plain fence
	            content.append(f"```\n{cell['source']}\n```")
	            continue

	        # Handle input cells, which are just python if cell type is code
	        code = format_code_example(cell["source"], max_len=max_len)[0]
	        content.append(f"```python\n{code}\n```")

	        # Handle output cells
	        for output_idx, output in enumerate(cell["outputs"]):
	            # In notebook format v4 the MIME payloads of rich outputs live under the "data" key
	            data = output.get("data", {})
	            if "text" in output and output.get("name", None) == "stdout":
	                output_text = output["text"].strip()
	                output_text = f"```python out\n{output_text}\n```"
	            elif "image/png" in data:
	                # Checked before text/plain so an output that carries both a picture and a
	                # plain-text fallback is rendered as the picture.
	                filename = _upload_output_image(
	                    data["image/png"],
	                    f"{notebook_name}_cell_{cell_idx}_output_{output_idx}",
	                    repo_id,
	                    revision,
	                    token,
	                )
	                all_image_filenames.append(filename)

	                # Set output image reference to uploaded Hub link (as markdown)
	                img_url = f"https://huggingface.co/datasets/{repo_id}/resolve/{revision or 'main'}/{filename}"
	                output_text = f"![img]({img_url})\n"
	            elif "text/plain" in data:
	                output_text = data["text/plain"].strip()
	                output_text = f"```python out\n{output_text}\n```"
	            else:
	                continue

	            content.append(output_text)

	    # with_suffix only touches the final extension, and never maps the notebook onto itself
	    dest_path = Path(dest_file) if dest_file is not None else notebook_path.with_suffix(".mdx")
	    dest_path.parent.mkdir(exist_ok=True, parents=True)
	    dest_path.write_text("\n\n".join(content), encoding="utf-8")

	    # If you update the notebook, the image names previously uploaded will be out of sync with the notebook
	    # So, you probably will want to delete those.
	    # NOTE - would be better to rename here. This is a quick hack.
	    if remove_unused_images:
	        repo_files = list_repo_files(repo_id=repo_id, revision=revision, repo_type="dataset", token=token)
	        # Match on the full "<notebook>_cell_" prefix so images that belong to another notebook
	        # whose name merely starts with this notebook's name are left alone.
	        image_prefix = f"{notebook_name}_cell_"
	        repo_images = {
	            x for x in repo_files if x.startswith(image_prefix) and x.endswith((".png", ".jpg", ".jpeg"))
	        }
	        unused_image_filenames = sorted(repo_images - set(all_image_filenames))
	        if unused_image_filenames:
	            print("\nRemoving the Following Unused/Renamed Image Files From the Hub Repo:")
	            print("\t- " + "\n\t- ".join(unused_image_filenames))

	            for unused_image_filename in unused_image_filenames:
	                # Delete from the same revision the files were listed on
	                delete_file(
	                    path_in_repo=unused_image_filename,
	                    repo_id=repo_id,
	                    token=token,
	                    repo_type="dataset",
	                    revision=revision,
	                )
	        else:
	            print("No Unused/Renamed Image Files Found")


	def _upload_output_image(png_b64, basename, repo_id, revision, token):
	    """Decode a base64 PNG output, upload it to the Hub dataset repo and return its filename."""
	    im = Image.open(BytesIO(b64decode(png_b64)))
	    image_format = im.format
	    if not has_transparency(im):
	        # Fully opaque images are stored as JPEG
	        im = im.convert("RGB")
	        image_format = "JPEG"

	    # Save image to buffer
	    # TODO - add some image filesize optimization here?
	    filename = f"{basename}.{image_format.lower()}"
	    temp_fileobj = BytesIO()
	    im.save(temp_fileobj, format=image_format)
	    # Rewind so the upload reads the image from its first byte
	    temp_fileobj.seek(0)

	    # Upload fileobj to Hugging Face Hub
	    upload_file(
	        path_or_fileobj=temp_fileobj,
	        path_in_repo=filename,  # TODO Should we use relpath/subdir?
	        repo_id=repo_id,
	        revision=revision,
	        token=token,
	        repo_type="dataset",
	    )
	    return filename


	def parse_args(args=None):
	    """Build the command line interface and parse ``args``.

	    Args:
	        args (list, optional): Argument strings to parse. When None, argparse reads ``sys.argv``.

	    Returns:
	        argparse.Namespace: Parsed options, named after the keyword arguments of ``nb_to_mdx``.
	    """
	    parser = ArgumentParser()
	    parser.add_argument("--notebook_file", type=str, required=True, help="Path to notebook file to convert to MDX")
	    parser.add_argument(
	        "--repo_id",
	        type=str,
	        default="nateraw/test-doc-assets",
	        help="Hugging Face Repo ID to upload/reference image assets",
	    )
	    parser.add_argument(
	        "--revision", type=str, default=None, help="Hugging Face Repo Revision to upload/reference image assets"
	    )
	    parser.add_argument("--max_len", type=int, default=119, help="Max length of code lines")
	    parser.add_argument("--dest_file", type=str, default=None, help="Path to save output MDX file")
	    parser.add_argument(
	        "--token", type=str, default=None, help="Hugging Face Token. Uses stored one by default if available"
	    )
	    # Negative flag: stores False into remove_unused_images, which otherwise defaults to True
	    parser.add_argument(
	        "--do_not_remove_unused_images",
	        action="store_false",
	        help="By default we remove unused images from the Hugging Face Hub repo. Provide this flag to NOT do this.",
	        dest="remove_unused_images",
	    )
	    return parser.parse_args(args=args)


	if __name__ == "__main__":
	    # Script entry point: every parsed CLI option maps onto an nb_to_mdx keyword argument
	    nb_to_mdx(**vars(parse_args()))