Created
March 13, 2025 01:52
-
-
Save ammmir/74723848040703fec4cb408d003ef815 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import streamlit as st | |
import pymupdf | |
from PIL import Image | |
import io | |
import re | |
# Configure the Streamlit page (browser title, wide layout) and show the
# app heading.
st.set_page_config(layout="wide", page_title="PDF Debug Viewer")
st.title("PDF Debug Viewer")

# Default sentence splitter: a run of whitespace that directly follows
# '.', '!' or '?'. The lookbehind keeps the punctuation with its sentence.
DEFAULT_SPLIT_REGEX = r'(?<=[.!?])\s+'
def pixmap_to_image(pixmap):
    """Build a PIL image from a PyMuPDF pixmap's raw sample buffer.

    The samples are interpreted as packed RGB, so the pixmap is expected
    to be an RGB pixmap without an alpha channel (the callers in this
    file render with ``alpha=False``).
    """
    size = (pixmap.width, pixmap.height)
    return Image.frombytes("RGB", size, pixmap.samples)
def extract_text_blocks(page, sort_order=True):
    """Return the page's text blocks, sorted into reading order on request.

    Args:
        page: Object exposing ``get_text("blocks")`` (a PyMuPDF page).
        sort_order: When true, sort the blocks by their top edge and then
            by their left edge; when false, keep the extraction order.

    Returns:
        The list produced by ``page.get_text("blocks")``. Each entry
        starts with ``(x0, y0, x1, y1, text)``.
    """
    def reading_order(block):
        # Top-to-bottom first, then left-to-right.
        return block[1], block[0]

    extracted = page.get_text("blocks")
    if not sort_order:
        return extracted
    extracted.sort(key=reading_order)
    return extracted
def chunk_text(blocks, split_regex, chunks_per_group):
    """Split block text into sentences and group them into chunks.

    Args:
        blocks: Iterable of block tuples shaped like
            ``(x0, y0, x1, y1, text, ...)``.
        split_regex: Pattern handed to ``re.split`` to cut each block's
            text into sentences.
        chunks_per_group: Number of consecutive sentences joined into one
            chunk.

    Returns:
        A list of ``(chunk, rects)`` tuples. ``chunk`` is the sentences
        joined with single spaces. ``rects`` holds one ``(x0, y0, x1, y1)``
        entry per sentence: the box of the block the sentence came from,
        so the same box repeats when several sentences share a block.

    Raises:
        re.error: If ``split_regex`` is not a valid pattern and at least
            one block has non-blank text.
    """
    sentences = []
    block_positions = []
    for block in blocks:
        text = block[4].strip()
        if not text:
            continue
        # re.split() returns None for capture groups that did not take part
        # in a match, so test for truthiness before calling .strip().
        pieces = [
            piece.strip()
            for piece in re.split(split_regex, text)
            if piece and piece.strip()
        ]
        sentences.extend(pieces)
        # Keep one rectangle per sentence so both lists slice in lockstep.
        block_positions.extend([block[:4]] * len(pieces))

    return [
        (
            ' '.join(sentences[start:start + chunks_per_group]),
            block_positions[start:start + chunks_per_group],
        )
        for start in range(0, len(sentences), chunks_per_group)
    ]
def highlight_chunks(page, chunks, zoom=2.0):
    """Render the page with every chunk outlined in its own colour.

    Temporary rectangle annotations are drawn around each rectangle of a
    chunk, and a free-text label carrying the 1-based chunk number is
    placed just right of the chunk's first rectangle. The annotations are
    removed again before returning, including when rendering fails.

    Args:
        page: PyMuPDF page to annotate and render.
        chunks: Sequence of ``(text, rects)`` pairs; only ``rects`` is used.
        zoom: Scale factor applied on both axes when rendering.

    Returns:
        The pixmap of the rendered page (no alpha channel).
    """
    mat = pymupdf.Matrix(zoom, zoom)
    colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1), (1, 1, 0), (1, 0, 1), (0, 1, 1)]
    annots = []
    try:
        for idx, (_, chunk_rects) in enumerate(chunks):
            if not chunk_rects:
                # Nothing to outline and no anchor for the label.
                continue
            color = colors[idx % len(colors)]
            for rect in chunk_rects:
                rect_annot = page.add_rect_annot(rect)
                # Register first so cleanup still sees it if styling fails.
                annots.append(rect_annot)
                rect_annot.set_colors(stroke=color)
                rect_annot.set_border(width=2)
                rect_annot.update()
            first_rect = chunk_rects[0]
            text_annot = page.add_freetext_annot(
                rect=(first_rect[2] + 5, first_rect[1], first_rect[2] + 25, first_rect[3]),
                text=str(idx + 1),
                fontsize=12,
                text_color=color
            )
            annots.append(text_annot)
        return page.get_pixmap(matrix=mat, alpha=False)
    finally:
        # The annotations exist only for this render; never leave them on
        # the page, even if an annotation call or the render raised.
        for annot in annots:
            page.delete_annot(annot)
# File uploader
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    pdfCol1, pdfCol2 = st.columns(2)
    pdf_bytes = io.BytesIO(uploaded_file.read())

    # Streamlit re-executes this script on every widget interaction, so the
    # document is opened per run and must be closed at the end of each run.
    with pymupdf.open(stream=pdf_bytes) as doc:
        page_count = len(doc)
        if page_count == 0:
            st.error("The uploaded PDF has no pages.")
            st.stop()

        with pdfCol1:
            st.header("Page Viewer")
            if page_count > 1:
                page_num = st.slider("Select page", min_value=1, max_value=page_count, value=1) - 1
            else:
                # A slider needs min_value < max_value; a single page needs
                # no selector at all.
                page_num = 0
            page = doc[page_num]

        with pdfCol2:
            st.header("PDF Metadata")
            st.json(doc.metadata)

        col1, col2 = st.columns(2)

        # Add toggle for viewing unordered chunks
        show_unordered = st.checkbox("Show Unordered Chunks (Raw Text Order)")

        # Extract only the block ordering that was asked for.
        selected_blocks = extract_text_blocks(page, sort_order=not show_unordered)
        full_text = "\n".join(b[4] for b in selected_blocks)

        st.subheader("RAW TEXT REAL")
        st.code(page.get_textpage().extractTEXT(sort=True), language="text")

        with col2:
            # NOTE(review): this heading has no content beneath it; kept as-is.
            st.subheader("Raw Text")
            st.subheader("Text Chunking")
            split_regex = st.text_input("Split regex:", DEFAULT_SPLIT_REGEX)
            chunks_per_group = st.number_input("Sentences per chunk:", min_value=1, value=3)

            try:
                chunks = chunk_text(selected_blocks, split_regex, chunks_per_group)
            except re.error as exc:
                # The pattern is free-form user input; report it instead of
                # crashing the whole app.
                st.error(f"Invalid split regex: {exc}")
                chunks = []

            st.subheader(f"Chunks ({len(chunks)})")
            if chunks:
                if len(chunks) > 1:
                    current_chunk = st.slider("Select a Chunk", min_value=1, max_value=len(chunks), value=1, step=1)
                else:
                    # Same slider constraint as for the page selector.
                    current_chunk = 1
                st.code(chunks[current_chunk - 1][0], language="text")

            st.subheader("Full Text")
            st.code(full_text, language="text")

        with col1:
            st.subheader("Page Image")
            zoom = 2.0
            viz_mode = st.radio("Visualization mode:", ["Show Chunks", "Normal"])
            if viz_mode == "Show Chunks":
                pix = highlight_chunks(page, chunks, zoom)
            else:
                mat = pymupdf.Matrix(zoom, zoom)
                pix = page.get_pixmap(matrix=mat, alpha=False)
            # Convert while the document is still open; the PIL image owns
            # its own copy of the pixels.
            img = pixmap_to_image(pix)
            st.image(img, use_container_width=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment