# Source: GitHub Gist nateraw/6106b0687255a404b20892235462b099 by @nateraw, created November 29, 2022 07:40.
from argparse import ArgumentParser
from base64 import b64decode
from io import BytesIO
from pathlib import Path
import nbformat
from doc_builder.style_doc import format_code_example
from huggingface_hub import create_repo, delete_file, list_repo_files, upload_file
from PIL import Image
def has_transparency(img):
    """Return True if ``img`` declares or actually contains transparent pixels.

    Args:
        img: A PIL-style image exposing ``info`` (dict), ``mode`` (str) and ``getextrema()``.

    Returns:
        bool: True when the image carries a ``transparency`` entry in ``img.info``, or when it
        has an alpha band (modes ``RGBA`` / ``LA``) whose minimum value is below 255.
    """
    # Adapted from https://stackoverflow.com/a/58567453
    # Palette/greyscale/RGB images signal transparency through the "transparency" info entry.
    if img.info.get("transparency") is not None:
        return True
    # NOTE: the original per-colour scan for mode "P" was removed: it could only run when no
    # "transparency" entry existed, so its sentinel never matched a palette index.
    if img.mode in ("RGBA", "LA"):
        # getextrema() yields one (min, max) pair per band; the alpha band is always the last one.
        alpha_min = img.getextrema()[-1][0]
        return alpha_min < 255
    return False
def _is_generated_image(filename, notebook_name):
    """Return True if ``filename`` matches ``<notebook_name>_cell_<i>_output_<j>.<png|jpg|jpeg>``."""
    prefix = f"{notebook_name}_cell_"
    if not filename.startswith(prefix):
        return False
    stem, dot, extension = filename[len(prefix):].rpartition(".")
    if not dot or extension not in ("png", "jpg", "jpeg"):
        return False
    cell_part, separator, output_part = stem.partition("_output_")
    return bool(separator) and cell_part.isdigit() and output_part.isdigit()


def _upload_png_output(png_b64, filename_stem, repo_id, revision, token):
    """Upload one base64-encoded PNG output to the Hub dataset repo and return its filename."""
    im = Image.open(BytesIO(b64decode(png_b64)))
    if has_transparency(im):
        # Keep the source format so the alpha information survives.
        image_format = im.format or "PNG"
    else:
        # Opaque images are re-encoded as JPEG.
        im = im.convert("RGB")
        image_format = "JPEG"
    filename = f"{filename_stem}.{image_format.lower()}"

    # TODO - add some image filesize optimization here?
    buffer = BytesIO()
    im.save(buffer, format=image_format)
    # Rewind so the upload reads the encoded bytes from the start of the buffer.
    buffer.seek(0)
    upload_file(
        path_or_fileobj=buffer,
        path_in_repo=filename,  # TODO Should we use relpath/subdir?
        repo_id=repo_id,
        revision=revision,
        token=token,
        repo_type="dataset",
    )
    return filename


def nb_to_mdx(
    notebook_file: str,
    repo_id: str = "nateraw/test-doc-assets",
    revision: str = None,
    max_len: int = 119,
    dest_file: str = None,
    token: str = None,
    remove_unused_images: bool = True,
):
    """Convert a notebook file to MDX format.

    Stores images in a Hugging Face dataset repo and references them from the MDX file via URLs.

    Args:
        notebook_file (str): Path to notebook file to convert to MDX
        repo_id (str, optional): Hugging Face Repo ID to upload/reference image assets.
        revision (str, optional): Hugging Face Repo Revision to upload/reference image assets.
        max_len (int, optional): Max length of code lines.
        dest_file (str, optional): Path to save output MDX file. Defaults to the notebook path
            with its suffix replaced by ``.mdx``.
        token (str, optional): Hugging Face Token. Uses stored one by default if available.
        remove_unused_images (bool, optional): Remove images previously generated for this
            notebook that are no longer referenced (Recommended).
    """
    # Create destination repo if it doesn't exist
    create_repo(repo_id, exist_ok=True, repo_type="dataset", token=token)

    notebook = nbformat.read(notebook_file, as_version=4)
    notebook_name = Path(notebook_file).stem
    content = []
    all_image_filenames = []

    for cell_idx, cell in enumerate(notebook["cells"]):
        cell_type = cell["cell_type"]
        if cell_type == "markdown":
            content.append(cell["source"])
            continue
        if cell_type != "code":
            # Raw (or unknown) cells are emitted verbatim inside a plain fence.
            content.append(f"```\n{cell['source']}\n```")
            continue

        # Handle input cells, which are just python if cell type is code
        code = format_code_example(cell["source"], max_len=max_len)[0]
        content.append(f"```python\n{code}\n```")

        # Handle output cells
        for output_idx, output in enumerate(cell["outputs"]):
            # In nbformat v4 the MIME bundle of display_data/execute_result lives under "data".
            data = output.get("data", {})
            if "text" in output and output.get("name", None) == "stdout":
                output_text = f"```python out\n{output['text'].strip()}\n```"
            elif "image/png" in data:
                # Checked before text/plain: image outputs usually carry a text fallback as well.
                filename = _upload_png_output(
                    data["image/png"],
                    f"{notebook_name}_cell_{cell_idx}_output_{output_idx}",
                    repo_id,
                    revision,
                    token,
                )
                all_image_filenames.append(filename)
                # Set output image reference to uploaded Hub link (as markdown)
                img_url = f"https://huggingface.co/datasets/{repo_id}/resolve/{revision or 'main'}/{filename}"
                output_text = f"![img]({img_url})\n"
            elif "text/plain" in data or "text/plain" in output:
                # A top-level "text/plain" key is still honoured for backward compatibility.
                plain = data["text/plain"] if "text/plain" in data else output["text/plain"]
                output_text = f"```python out\n{plain.strip()}\n```"
            else:
                # stderr streams, errors and other MIME types are skipped.
                continue
            content.append(output_text)

    # with_suffix only touches the final extension, unlike a blanket str.replace on the path.
    dest_path = Path(dest_file) if dest_file is not None else Path(notebook_file).with_suffix(".mdx")
    dest_path.parent.mkdir(exist_ok=True, parents=True)
    dest_path.write_text("\n\n".join(content), encoding="utf-8")

    # If you update the notebook, the image names previously uploaded will be out of sync with the notebook
    # So, you probably will want to delete those.
    # NOTE - would be better to rename here. This is a quick hack.
    if remove_unused_images:
        repo_files = list_repo_files(repo_id=repo_id, revision=revision, repo_type="dataset", token=token)
        used = set(all_image_filenames)
        # Only files following this notebook's exact naming scheme are candidates, so images of
        # other notebooks sharing a name prefix are never deleted.
        unused_image_filenames = sorted(
            name for name in repo_files if _is_generated_image(name, notebook_name) and name not in used
        )
        if unused_image_filenames:
            print("\nRemoving the Following Unused/Renamed Image Files From the Hub Repo:")
            print("\t- " + "\n\t- ".join(unused_image_filenames))
            for unused_image_filename in unused_image_filenames:
                delete_file(
                    path_in_repo=unused_image_filename,
                    repo_id=repo_id,
                    revision=revision,  # delete from the same revision that was listed
                    token=token,
                    repo_type="dataset",
                )
        else:
            print("No Unused/Renamed Image Files Found")
def parse_args(args=None):
    """Parse the command-line options of the notebook-to-MDX converter.

    Args:
        args (list of str, optional): Argument vector to parse. When None, argparse reads
            ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: Attributes ``notebook_file``, ``repo_id``, ``revision``, ``max_len``,
        ``dest_file``, ``token`` and ``remove_unused_images``.
    """
    # (flag, add_argument keyword arguments), registered in declaration order.
    option_specs = (
        (
            "--notebook_file",
            {"type": str, "required": True, "help": "Path to notebook file to convert to MDX"},
        ),
        (
            "--repo_id",
            {
                "type": str,
                "default": "nateraw/test-doc-assets",
                "help": "Hugging Face Repo ID to upload/reference image assets",
            },
        ),
        (
            "--revision",
            {
                "type": str,
                "default": None,
                "help": "Hugging Face Repo Revision to upload/reference image assets",
            },
        ),
        ("--max_len", {"type": int, "default": 119, "help": "Max length of code lines"}),
        ("--dest_file", {"type": str, "default": None, "help": "Path to save output MDX file"}),
        (
            "--token",
            {
                "type": str,
                "default": None,
                "help": "Hugging Face Token. Uses stored one by default if available",
            },
        ),
        (
            # Negative flag: supplying it stores False into ``remove_unused_images``.
            "--do_not_remove_unused_images",
            {
                "action": "store_false",
                "dest": "remove_unused_images",
                "help": "By default we remove unused images from the Hugging Face Hub repo. Provide this flag to NOT do this.",
            },
        ),
    )

    cli = ArgumentParser()
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args(args=args)
# Script entry point: parse the CLI flags and forward them as keyword arguments.
if __name__ == "__main__":
    cli_options = vars(parse_args())
    nb_to_mdx(**cli_options)
# Discussion of this gist takes place on GitHub (sign-in required to comment).