Last active
November 14, 2024 05:48
-
-
Save cpfiffer/cde89f4769140de77e8d00a906fd7eb3 to your computer and use it in GitHub Desktop.
Get structured output from PDFs. Goes through a PDF one page at a time -- it is not currently built to handle multiple pages together, but could be extended as needed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
pip install outlines torch==2.4.0 transformers accelerate typing-extensions pillow pdf2image rich requests | |
may need to install tkinter: https://stackoverflow.com/questions/25905540/importerror-no-module-named-tkinter | |
sudo apt-get install poppler-utils | |
""" | |
from enum import Enum | |
from io import BytesIO | |
from PIL import Image | |
from urllib.request import urlopen | |
import outlines | |
import torch | |
from transformers import ( | |
LlavaForConditionalGeneration, | |
) | |
from pydantic import BaseModel, Field, confloat, constr | |
from pydantic.types import StringConstraints | |
from typing import List | |
from typing_extensions import Annotated | |
from pdf2image import convert_from_path | |
import os | |
from typing import List, Optional | |
from rich import print | |
import requests | |
# Hugging Face model identifier of the vision-language model to load.
# Note from the original author: the original magnet model is able to be
# loaded without issue.
model_name = "mistral-community/pixtral-12b"

# transformers class used to instantiate the model weights.
model_class = LlavaForConditionalGeneration

# Keyword arguments passed through for model construction: bfloat16 weights,
# with device placement left to "auto".
model_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Keyword arguments passed through for processor construction.
processor_kwargs = dict(device="cuda")

# Wrap the vision model with outlines so it can be used for structured
# generation further down in the script.
model = outlines.models.transformers_vision(
    model_name,
    model_class=model_class,
    model_kwargs=model_kwargs,
    processor_kwargs=processor_kwargs,
)
def convert_pdf_to_images(
    pdf_path: str,
    output_dir: Optional[str] = None,
    dpi: int = 20,
    fmt: str = 'PNG'
) -> List[Image.Image]:
    """
    Render every page of a PDF file as a PIL image.

    Args:
        pdf_path: Location of the PDF file on disk.
        output_dir: If given (and non-empty), the directory is created when
            missing and each page is also written there as
            ``page_<n>.<fmt lowercased>``, with ``n`` starting at 1.
        dpi: Rendering resolution handed to ``convert_from_path``.
            NOTE(review): the default is 20, while the original author's
            note says 200 is good for vision models -- confirm which
            default is intended.
        fmt: Image format handed to ``convert_from_path``; its lowercase
            form is used as the file extension when saving (the original
            author recommends PNG for quality).

    Returns:
        The rendered pages as a list of PIL Image objects, in the order
        returned by ``convert_from_path``.
    """
    pages = convert_from_path(pdf_path, dpi=dpi, fmt=fmt)

    # Nothing to persist: hand the in-memory images straight back.
    if not output_dir:
        return pages

    os.makedirs(output_dir, exist_ok=True)
    extension = fmt.lower()
    for page_number, page in enumerate(pages, start=1):
        page.save(os.path.join(output_dir, f'page_{page_number}.{extension}'))
    return pages
# Download the louf-willard PDF (https://arxiv.org/pdf/2307.09702), which is
# used as the sample document for the rest of the script.
pdf_url = "https://arxiv.org/pdf/2307.09702"
# requests has no default timeout; bound the call so a stalled connection
# cannot hang the script indefinitely.
response = requests.get(pdf_url, timeout=60)
# Fail here on an HTTP error status instead of saving an error page as a
# ".pdf" and hitting a confusing failure later in the PDF converter.
response.raise_for_status()

# Save the PDF locally.
with open("louf-willard.pdf", "wb") as f:
    f.write(response.content)

# Render the saved PDF to one image per page; the page images are also
# written to the "output_images" directory.
images = convert_pdf_to_images("louf-willard.pdf", dpi=80, output_dir="output_images")
# Structured summary produced for a single page of the PDF.
# NOTE: documented with comments rather than a class docstring or Field
# descriptions on purpose -- those would become part of the JSON schema,
# which is embedded in the prompt and used to constrain generation.
class PageSummary(BaseModel):
    # Free-text description of the page.
    description: str
    # List of key takeaways for the page.
    key_takeaways: List[str]
    # Page number as reported by the model.
    # NOTE(review): the prompt does not supply the page index, so this
    # value presumably comes from what is visible in the image -- confirm.
    page_number: int
# Generator that produces output following the PageSummary schema.
page_summary_generator = outlines.generate.json(model, PageSummary)

# Prompt sent with every page image. The schema is interpolated via the
# f-string, and surrounding whitespace is stripped from the final prompt.
# NOTE(review): model_json_schema() presumably returns a dict, so what gets
# embedded is its Python repr rather than JSON text -- confirm this is the
# intended prompt format.
instruction = f"""
<s>[INST]
You are an expert at summarizing pages from a scientific paper.
Please summarize the page.
Your schema is:
{PageSummary.model_json_schema()}
Please extract
[IMG][/INST]
""".strip()

# Summarize the document one page at a time: each call receives the shared
# prompt plus a single page image, and the result is printed immediately.
for image in images:
    page_summary = page_summary_generator(instruction, [image])
    print(page_summary)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Some example output. It's not always quite right but hey at least it works.