Skip to content

Instantly share code, notes, and snippets.

@ammmir
Created March 13, 2025 01:52
Show Gist options
  • Save ammmir/74723848040703fec4cb408d003ef815 to your computer and use it in GitHub Desktop.
Save ammmir/74723848040703fec4cb408d003ef815 to your computer and use it in GitHub Desktop.
import streamlit as st
import pymupdf
from PIL import Image
import io
import re
# Page-level setup for the viewer: wide layout plus the visible title.
# NOTE(review): set_page_config is kept ahead of every other Streamlit call;
# some Streamlit versions require it to be the first command - confirm for
# the version this app is pinned to.
st.set_page_config(page_title="PDF Debug Viewer", layout="wide")
st.title("PDF Debug Viewer")
# Default value offered in the "Split regex" text input further down.
DEFAULT_SPLIT_REGEX = r'(?<=[.!?])\s+' # Splits on whitespace that directly follows '.', '!' or '?'
def pixmap_to_image(pixmap):
    """Convert a PyMuPDF pixmap to a PIL Image.

    Args:
        pixmap: Object exposing ``width``, ``height``, ``samples`` and
            ``alpha`` (a PyMuPDF ``Pixmap``). Assumed to use an RGB
            colorspace; other colorspaces are not handled here.

    Returns:
        A PIL ``Image`` in "RGBA" mode when the pixmap carries an alpha
        channel, otherwise in "RGB" mode.
    """
    # Decoding alpha-carrying samples as plain "RGB" would mismatch the
    # buffer length, so pick the mode from the pixmap itself.
    mode = "RGBA" if pixmap.alpha else "RGB"
    return Image.frombytes(mode, (pixmap.width, pixmap.height), pixmap.samples)
def extract_text_blocks(page, sort_order=True):
    """Fetch the text blocks of a page, sorted into reading order on request.

    Args:
        page: Object whose ``get_text("blocks")`` returns a list of block
            tuples; items 0-3 are used as (x0, y0, x1, y1) and item 4 as
            the block text by the rest of this file.
        sort_order: When true, sort the blocks top-to-bottom and then
            left-to-right; when false, keep the order ``get_text`` produced.

    Returns:
        The list of block tuples obtained from ``page.get_text("blocks")``.
    """
    page_blocks = page.get_text("blocks")  # text together with its position
    if not sort_order:
        return page_blocks
    # Primary key is the top edge (y0), ties are broken by the left edge (x0).
    page_blocks.sort(key=lambda blk: (blk[1], blk[0]))
    return page_blocks
def chunk_text(blocks, split_regex, chunks_per_group):
    """Split block text into sentences and group them into chunks.

    Args:
        blocks: Iterable of block tuples; items 0-3 are the block
            coordinates and item 4 is the block text.
        split_regex: Pattern (string or compiled) handed to ``re.split`` to
            cut each block's text into sentences.
        chunks_per_group: Number of consecutive sentences joined into one
            chunk; must be a positive integer.

    Returns:
        A list of ``(chunk, rects)`` tuples. ``chunk`` is the sentences of
        the group joined with single spaces; ``rects`` holds, for every
        sentence in the group, the coordinates (``block[:4]``) of the block
        that sentence came from.

    Raises:
        re.error: If ``split_regex`` is not a valid regular expression.
    """
    splitter = re.compile(split_regex)  # compile once instead of per block
    sentences = []
    block_positions = []
    for block in blocks:
        text = block[4].strip()
        if not text:
            continue
        # re.split emits None for capturing groups that did not take part in
        # a match; drop those before stripping so such patterns do not crash.
        pieces = [piece.strip() for piece in splitter.split(text) if piece]
        pieces = [piece for piece in pieces if piece]
        sentences.extend(pieces)
        # One coordinate entry per sentence keeps both lists index-aligned.
        block_positions.extend([block[:4]] * len(pieces))
    return [
        (
            ' '.join(sentences[start:start + chunks_per_group]),
            block_positions[start:start + chunks_per_group],
        )
        for start in range(0, len(sentences), chunks_per_group)
    ]
def highlight_chunks(page, chunks, zoom=2.0):
    """Render the page with every chunk outlined in its own color.

    Rectangle annotations are drawn around each rect of each chunk, plus a
    small free-text label with the 1-based chunk number to the right of the
    chunk's first rect. The annotations are temporary: they are removed
    again before returning, even if drawing or rendering fails.

    Args:
        page: PyMuPDF page to annotate and render.
        chunks: Sequence of ``(text, rects)`` tuples as produced by
            ``chunk_text``; only ``rects`` is used here.
        zoom: Scale factor applied on both axes when rendering.

    Returns:
        The pixmap of the rendered page (no alpha channel).
    """
    mat = pymupdf.Matrix(zoom, zoom)
    # Colors are cycled, so chunk 7 reuses the color of chunk 1, and so on.
    colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1), (1, 1, 0), (1, 0, 1), (0, 1, 1)]
    annots = []
    try:
        for idx, (_, chunk_rects) in enumerate(chunks):
            if not chunk_rects:
                continue  # nothing to outline or to anchor the label to
            color = colors[idx % len(colors)]
            for rect in chunk_rects:
                rect_annot = page.add_rect_annot(rect)
                # Register right away so cleanup also covers a failure in
                # the styling calls below.
                annots.append(rect_annot)
                rect_annot.set_colors(stroke=color)
                rect_annot.set_border(width=2)
                rect_annot.update()
            first_rect = chunk_rects[0]
            # Label box: 20 units wide, starting 5 units right of the rect.
            text_annot = page.add_freetext_annot(
                rect=(first_rect[2] + 5, first_rect[1], first_rect[2] + 25, first_rect[3]),
                text=str(idx + 1),
                fontsize=12,
                text_color=color
            )
            annots.append(text_annot)
        return page.get_pixmap(matrix=mat, alpha=False)
    finally:
        # Leave the page as it was found, whatever happened above.
        for annot in annots:
            page.delete_annot(annot)
# Main UI: upload a PDF, pick a page, inspect its text/chunks and its image.
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
if uploaded_file is not None:
    pdfCol1, pdfCol2 = st.columns(2)
    pdf_bytes = io.BytesIO(uploaded_file.read())
    # Open the document in a context manager so it is closed at the end of
    # every script run instead of being left to garbage collection.
    with pymupdf.open(stream=pdf_bytes) as doc:
        with pdfCol1:
            st.header("Page Viewer")
            page_count = len(doc)
            if page_count > 1:
                # The slider is 1-based for the user; pages are 0-based.
                page_num = st.slider("Select page", min_value=1, max_value=page_count, value=1) - 1
            else:
                # NOTE(review): st.slider rejects min_value == max_value, so a
                # single-page document skips the slider - confirm against the
                # Streamlit version in use.
                page_num = 0
            page = doc[page_num]
        with pdfCol2:
            st.header("PDF Metadata")
            metadata = doc.metadata
            st.json(metadata)
        col1, col2 = st.columns(2)
        # Extract text blocks (unordered and ordered)
        unordered_blocks = extract_text_blocks(page, sort_order=False)
        ordered_blocks = extract_text_blocks(page, sort_order=True)
        unordered_text = "\n".join([b[4] for b in unordered_blocks])
        ordered_text = "\n".join([b[4] for b in ordered_blocks])
        # Add toggle for viewing unordered chunks
        show_unordered = st.checkbox("Show Unordered Chunks (Raw Text Order)")
        # Choose which version of blocks to process
        selected_blocks = unordered_blocks if show_unordered else ordered_blocks
        st.subheader("RAW TEXT REAL")
        st.code(page.get_textpage().extractTEXT(sort=True), language="text")
        with col2:
            st.subheader("Raw Text")
            st.subheader("Text Chunking")
            split_regex = st.text_input("Split regex:", DEFAULT_SPLIT_REGEX)
            chunks_per_group = st.number_input("Sentences per chunk:", min_value=1, value=3)
            try:
                chunks = chunk_text(selected_blocks, split_regex, chunks_per_group)
            except re.error as exc:
                # The pattern is free-form user input; report a bad one in
                # the UI instead of crashing the whole app.
                st.error(f"Invalid split regex: {exc}")
                chunks = []
            st.subheader(f"Chunks ({len(chunks)})")
            if chunks:
                if len(chunks) > 1:
                    current_chunk = st.slider("Select a Chunk", min_value=1, max_value=len(chunks), value=1, step=1)
                else:
                    # Same slider restriction as for pages: one chunk means
                    # there is nothing to select.
                    current_chunk = 1
                st.code(chunks[current_chunk-1][0], language="text")
            st.subheader("Full Text")
            st.code(unordered_text if show_unordered else ordered_text, language="text")
        with col1:
            st.subheader("Page Image")
            zoom = 2.0
            viz_mode = st.radio("Visualization mode:", ["Show Chunks", "Normal"])
            if viz_mode == "Show Chunks":
                pix = highlight_chunks(page, chunks, zoom)
            else:
                mat = pymupdf.Matrix(zoom, zoom)
                pix = page.get_pixmap(matrix=mat, alpha=False)
            img = pixmap_to_image(pix)
            st.image(img, use_container_width=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment