Created
November 29, 2022 07:40
-
-
Save nateraw/6106b0687255a404b20892235462b099 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from argparse import ArgumentParser
from base64 import b64decode
from io import BytesIO
from pathlib import Path
from typing import Optional

import nbformat
from doc_builder.style_doc import format_code_example
from huggingface_hub import create_repo, delete_file, list_repo_files, upload_file
from PIL import Image
def has_transparency(img):
    """Return True if the image has any transparency.

    Adapted from https://stackoverflow.com/a/58567453

    Args:
        img: Image object exposing ``info``, ``mode`` and ``getextrema()``
            (e.g. an opened ``PIL.Image.Image``).

    Returns:
        bool: True when the image info carries a ``transparency`` entry, or when the
        image has an alpha band ("RGBA" or "LA" mode) whose minimum value is below 255.
    """
    # An explicit "transparency" entry in the image info settles it for every mode.
    if img.info.get("transparency") is not None:
        return True

    # No separate scan of palette ("P") images is needed: the only thing to compare
    # palette indexes against is the info entry, which is known to be absent here.

    # Modes with an alpha band keep it last. The image is only transparent if at
    # least one pixel's alpha is below the fully opaque value of 255.
    if img.mode in ("RGBA", "LA"):
        alpha_min = img.getextrema()[-1][0]
        return alpha_min < 255

    return False
def nb_to_mdx(
    notebook_file: str,
    repo_id: str = "nateraw/test-doc-assets",
    revision: Optional[str] = None,
    max_len: int = 119,
    dest_file: Optional[str] = None,
    token: Optional[str] = None,
    remove_unused_images: bool = True,
):
    """Convert a notebook file to MDX format.

    Stores images in a Hugging Face repo and references them from the MDX file via URLs.
    Code cells become fenced ``python`` blocks. Of their outputs, stdout streams and
    ``text/plain`` results become ``python out`` blocks and ``image/png`` results become
    markdown image links; every other output is skipped. Markdown cells are copied
    verbatim and any other cell type is wrapped in a plain fenced block.

    Args:
        notebook_file (str): Path to notebook file to convert to MDX
        repo_id (str, optional): Hugging Face Repo ID to upload/reference image assets.
        revision (str, optional): Hugging Face Repo Revision to upload/reference image assets.
        max_len (int, optional): Max length of code lines.
        dest_file (str, optional): Path to save output MDX file. Defaults to the notebook
            path with its suffix replaced by ``.mdx``.
        token (str, optional): Hugging Face Token. Uses stored one by default if available.
        remove_unused_images (bool, optional): Remove unused images from the repo (Recommended).
    """
    # Create destination repo if it doesn't exist
    create_repo(repo_id, exist_ok=True, repo_type="dataset", token=token)

    notebook = nbformat.read(notebook_file, as_version=4)
    notebook_name = Path(notebook_file).stem
    content = []
    all_image_filenames = []

    for cell_idx, cell in enumerate(notebook["cells"]):
        cell_type = cell["cell_type"]
        if cell_type == "markdown":
            content.append(cell["source"])
            continue
        if cell_type != "code":
            content.append(f"```\n{cell['source']}\n```")
            continue

        # Handle input cells, which are just python if cell type is code
        code = format_code_example(cell["source"], max_len=max_len)[0]
        content.append(f"```python\n{code}\n```")

        # Handle output cells
        for output_idx, output in enumerate(cell["outputs"]):
            # Rich results keep their MIME bundle under "data"; stream outputs have none.
            data = output.get("data", {})
            if "text" in output and output.get("name") == "stdout":
                stdout_text = output["text"].strip()
                output_text = f"```python out\n{stdout_text}\n```"
            elif "image/png" in data:
                # Checked before text/plain: an image result may also carry a text
                # representation, and the image is the one worth rendering.
                filename = _upload_image_output(
                    data["image/png"],
                    f"{notebook_name}_cell_{cell_idx}_output_{output_idx}",
                    repo_id=repo_id,
                    revision=revision,
                    token=token,
                )
                all_image_filenames.append(filename)
                # Set output image reference to uploaded Hub link (as markdown)
                img_url = f"https://huggingface.co/datasets/{repo_id}/resolve/{revision or 'main'}/{filename}"
                output_text = f"![]({img_url})\n"
            elif "text/plain" in data:
                plain_text = data["text/plain"].strip()
                output_text = f"```python out\n{plain_text}\n```"
            else:
                continue
            content.append(output_text)

    # with_suffix (rather than str.replace) guarantees the destination differs from
    # the notebook path even when the notebook has no ".ipynb" suffix.
    if dest_file is not None:
        dest_path = Path(dest_file)
    else:
        dest_path = Path(notebook_file).with_suffix(".mdx")
    dest_path.parent.mkdir(exist_ok=True, parents=True)
    dest_path.write_text("\n\n".join(content), encoding="utf-8")

    # If you update the notebook, the image names previously uploaded will be out of sync with the notebook
    # So, you probably will want to delete those.
    # NOTE - would be better to rename here. This is a quick hack.
    if remove_unused_images:
        _delete_stale_images(
            notebook_name,
            all_image_filenames,
            repo_id=repo_id,
            revision=revision,
            token=token,
        )


def _upload_image_output(png_base64, stem, repo_id, revision, token):
    """Upload one base64-encoded image output to the Hub repo and return its filename there."""
    # Take image data and turn it to PIL image
    im = Image.open(BytesIO(b64decode(png_base64)))
    image_format = im.format
    if not has_transparency(im):
        # Images without transparency are re-encoded as JPEG.
        im = im.convert("RGB")
        image_format = "JPEG"

    filename = f"{stem}.{image_format.lower()}"

    # Save image to buffer
    # TODO - add some image filesize optimization here?
    buffer = BytesIO()
    buffer.name = filename
    im.save(buffer, format=image_format)
    # Rewind so the upload reads the image from its first byte.
    buffer.seek(0)

    # Upload fileobj to Hugging Face Hub
    upload_file(
        path_or_fileobj=buffer,
        path_in_repo=filename,  # TODO Should we use relpath/subdir?
        repo_id=repo_id,
        revision=revision,
        token=token,
        repo_type="dataset",
    )
    return filename


def _delete_stale_images(notebook_name, used_filenames, repo_id, revision, token):
    """Delete this notebook's previously uploaded images that the current run did not produce."""
    # Match only names generated by nb_to_mdx for this exact notebook, so another
    # notebook whose name merely starts with the same prefix keeps its images.
    generated_name = re.compile(rf"{re.escape(notebook_name)}_cell_\d+_output_\d+\.(?:png|jpg|jpeg)")
    used = set(used_filenames)
    repo_files = list_repo_files(repo_id=repo_id, revision=revision, repo_type="dataset", token=token)
    unused_image_filenames = sorted(
        name for name in repo_files if generated_name.fullmatch(name) and name not in used
    )

    if not unused_image_filenames:
        print("No Unused/Renamed Image Files Found")
        return

    print("\nRemoving the Following Unused/Renamed Image Files From the Hub Repo:")
    print("\t- " + "\n\t- ".join(unused_image_filenames))
    for unused_image_filename in unused_image_filenames:
        delete_file(
            path_in_repo=unused_image_filename,
            repo_id=repo_id,
            # Delete from the same revision the files were listed on and uploaded to.
            revision=revision,
            token=token,
            repo_type="dataset",
        )
def parse_args(args=None):
    """Parse the command line options of the notebook-to-MDX converter.

    Args:
        args (list of str, optional): Arguments to parse instead of ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: Parsed options with attributes ``notebook_file``, ``repo_id``,
        ``revision``, ``max_len``, ``dest_file``, ``token`` and ``remove_unused_images``.
    """
    # (flag, add_argument keyword options), registered in this order.
    option_table = (
        (
            "--notebook_file",
            {"type": str, "required": True, "help": "Path to notebook file to convert to MDX"},
        ),
        (
            "--repo_id",
            {
                "type": str,
                "default": "nateraw/test-doc-assets",
                "help": "Hugging Face Repo ID to upload/reference image assets",
            },
        ),
        (
            "--revision",
            {
                "type": str,
                "default": None,
                "help": "Hugging Face Repo Revision to upload/reference image assets",
            },
        ),
        (
            "--max_len",
            {"type": int, "default": 119, "help": "Max length of code lines"},
        ),
        (
            "--dest_file",
            {"type": str, "default": None, "help": "Path to save output MDX file"},
        ),
        (
            "--token",
            {
                "type": str,
                "default": None,
                "help": "Hugging Face Token. Uses stored one by default if available",
            },
        ),
        (
            # Negative flag: passing it stores False into ``remove_unused_images``.
            "--do_not_remove_unused_images",
            {
                "action": "store_false",
                "dest": "remove_unused_images",
                "help": "By default we remove unused images from the Hugging Face Hub repo. Provide this flag to NOT do this.",
            },
        ),
    )

    cli = ArgumentParser()
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args(args=args)
if __name__ == "__main__":
    # Script entry point: forward every parsed CLI option to nb_to_mdx as a keyword argument.
    cli_options = vars(parse_args())
    nb_to_mdx(**cli_options)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment