ammmir · March 13, 2025 01:52
diff --git a/pdf-debug.py b/pdf-debug.py
 import streamlit as st
 import pymupdf
 from PIL import Image
 import io
 import re

 # Page setup: wide layout, with the same text used for the page title and
 # the on-page heading.
 # NOTE(review): Streamlit documents that set_page_config should run before
 # other st.* calls - confirm for the installed version before reordering.
 st.set_page_config(page_title="PDF Debug Viewer", layout="wide")
 st.title("PDF Debug Viewer")

 # Default value offered in the "Split regex:" input further down. The
 # lookbehind keeps the '.', '!' or '?' attached to the preceding sentence
 # when the pattern is used with re.split.
 DEFAULT_SPLIT_REGEX = r'(?<=[.!?])\s+'  # Matches space after sentence-ending punctuation

 def pixmap_to_image(pixmap):
    """Build an RGB-mode PIL ``Image`` from a PyMuPDF pixmap.

    The pixmap's ``width``, ``height`` and ``samples`` attributes are passed
    to ``Image.frombytes`` unchanged.
    """
    dimensions = [pixmap.width, pixmap.height]
    raw_pixels = pixmap.samples
    return Image.frombytes("RGB", dimensions, raw_pixels)

 def extract_text_blocks(page, sort_order=True):
    """Return the page's text blocks, sorted into reading order on request.

    Args:
        page: Object whose ``get_text("blocks")`` returns a list of block
            records; items 0 and 1 of each record are used as x0 and y0.
        sort_order: When true, the list is sorted in place by (y0, x0);
            when false, it is returned in the order ``get_text`` produced.

    Returns:
        The list obtained from ``page.get_text("blocks")``.
    """
    def reading_order(block):
        # Vertical position first, horizontal position as the tie-breaker.
        return (block[1], block[0])

    page_blocks = page.get_text("blocks")
    if not sort_order:
        return page_blocks
    page_blocks.sort(key=reading_order)
    return page_blocks

 def chunk_text(blocks, split_regex, chunks_per_group):
    """Split block text into sentences and group the sentences into chunks.

    Args:
        blocks: Iterable of block records; items 0-3 are taken as the block
            rectangle and item 4 as its text.
        split_regex: Pattern handed to ``re`` to break a block's text into
            sentences.
        chunks_per_group: Number of sentences joined into one chunk. The
            value is converted with ``int()``, so ``3.0`` behaves like ``3``.

    Returns:
        A list of ``(chunk, rects)`` tuples. ``chunk`` is the sentences joined
        by a single space; ``rects`` holds one source-block rectangle per
        sentence, so a rectangle repeats when several sentences of the chunk
        come from the same block.

    Raises:
        ValueError: If ``chunks_per_group`` is smaller than 1.
        re.error: If ``split_regex`` is not a valid pattern.
    """
    group_size = int(chunks_per_group)
    if group_size < 1:
        raise ValueError("chunks_per_group must be at least 1")

    splitter = re.compile(split_regex)  # Compile once instead of once per block
    sentences = []
    block_positions = []

    for block in blocks:
        text = block[4].strip()
        if not text:
            continue
        # With capturing groups in the pattern, re.split also emits the
        # captures - including None for groups that did not participate -
        # so None has to be filtered out before calling strip().
        pieces = [s.strip() for s in splitter.split(text) if s and s.strip()]
        sentences.extend(pieces)
        block_positions.extend([block[:4]] * len(pieces))  # Keep block coordinates

    return [
        (' '.join(sentences[i:i + group_size]), block_positions[i:i + group_size])
        for i in range(0, len(sentences), group_size)
    ]

 def highlight_chunks(page, chunks, zoom=2.0):
    """Render the page with every chunk's source blocks outlined and numbered.

    Temporary annotations are added to ``page`` for the render and deleted
    again afterwards, also when adding annotations or rendering fails.

    Args:
        page: PyMuPDF page to annotate and render.
        chunks: Sequence of ``(text, rects)`` tuples; only ``rects`` is used.
        zoom: Scale factor applied on both axes of the render matrix.

    Returns:
        The pixmap returned by ``page.get_pixmap``.
    """
    mat = pymupdf.Matrix(zoom, zoom)
    colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1), (1, 1, 0), (1, 0, 1), (0, 1, 1)]

    annots = []
    try:
        # The text half of each tuple is unused; it is bound to `_text` so the
        # module-level chunk_text function is not shadowed inside this loop.
        for idx, (_text, chunk_rects) in enumerate(chunks):
            if not chunk_rects:
                continue  # Nothing to outline and nothing to anchor the label to
            color = colors[idx % len(colors)]

            drawn = set()
            for rect in chunk_rects:
                key = tuple(rect)
                if key in drawn:
                    # Several sentences of one chunk can share a block; one
                    # outline per distinct rectangle is enough.
                    continue
                drawn.add(key)
                rect_annot = page.add_rect_annot(rect)
                annots.append(rect_annot)  # Track first so cleanup always sees it
                rect_annot.set_colors(stroke=color)
                rect_annot.set_border(width=2)
                rect_annot.update()

            # Chunk number, placed just right of the chunk's first rectangle.
            first_rect = chunk_rects[0]
            text_annot = page.add_freetext_annot(
                rect=(first_rect[2] + 5, first_rect[1], first_rect[2] + 25, first_rect[3]),
                text=str(idx + 1),
                fontsize=12,
                text_color=color
            )
            annots.append(text_annot)

        pix = page.get_pixmap(matrix=mat, alpha=False)
    finally:
        for annot in annots:
            page.delete_annot(annot)

    return pix

 # File uploader
 uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

 if uploaded_file is not None:
    pdfCol1, pdfCol2 = st.columns(2)

    # getvalue() yields the complete upload regardless of the stream position;
    # filetype tells PyMuPDF how to interpret the in-memory data.
    doc = pymupdf.open(stream=uploaded_file.getvalue(), filetype="pdf")
    page_count = len(doc)
    if page_count == 0:
        doc.close()
        st.warning("This PDF has no pages.")
        st.stop()

    with pdfCol1:
        st.header("Page Viewer")
        if page_count > 1:
            page_num = st.slider("Select page", min_value=1, max_value=page_count, value=1) - 1
        else:
            # Only one page: there is nothing to choose, and a slider whose
            # minimum equals its maximum is not a usable range.
            page_num = 0
        page = doc[page_num]

    with pdfCol2:
        st.header("PDF Metadata")
        metadata = doc.metadata
        st.json(metadata)

    col1, col2 = st.columns(2)

    # Extract text blocks (unordered and ordered)
    unordered_blocks = extract_text_blocks(page, sort_order=False)
    ordered_blocks = extract_text_blocks(page, sort_order=True)

    unordered_text = "\n".join([b[4] for b in unordered_blocks])
    ordered_text = "\n".join([b[4] for b in ordered_blocks])

    # Add toggle for viewing unordered chunks
    show_unordered = st.checkbox("Show Unordered Chunks (Raw Text Order)")

    # Choose which version of blocks to process
    selected_blocks = unordered_blocks if show_unordered else ordered_blocks

    st.subheader("RAW TEXT REAL")
    st.code(page.get_textpage().extractTEXT(sort=True), language="text")

    with col2:
        st.subheader("Raw Text")
        st.subheader("Text Chunking")
        split_regex = st.text_input("Split regex:", DEFAULT_SPLIT_REGEX)
        chunks_per_group = st.number_input("Sentences per chunk:", min_value=1, value=3)

        try:
            chunks = chunk_text(selected_blocks, split_regex, chunks_per_group)
        except re.error as exc:
            # The pattern is typed by the user; report a bad one in the UI
            # instead of aborting the whole script run.
            st.error(f"Invalid split regex: {exc}")
            chunks = []
        st.subheader(f"Chunks ({len(chunks)})")

        if chunks:
            if len(chunks) > 1:
                current_chunk = st.slider("Select a Chunk", min_value=1, max_value=len(chunks), value=1, step=1)
            else:
                current_chunk = 1  # Single chunk: no selection needed
            st.code(chunks[current_chunk-1][0], language="text")

        st.subheader("Full Text")
        st.code(unordered_text if show_unordered else ordered_text, language="text")

    with col1:
        st.subheader("Page Image")
        zoom = 2.0

        viz_mode = st.radio("Visualization mode:", ["Show Chunks", "Normal"])

        if viz_mode == "Show Chunks":
            pix = highlight_chunks(page, chunks, zoom)
        else:
            mat = pymupdf.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat, alpha=False)

        img = pixmap_to_image(pix)
        st.image(img, use_container_width=True)

    # Everything needed from the document has been rendered above.
    doc.close()
	import streamlit as st
	import pymupdf
	from PIL import Image
	import io
	import re

	# Page setup: wide layout, with the same text used for the page title and
	# the on-page heading.
	# NOTE(review): Streamlit documents that set_page_config should run before
	# other st.* calls - confirm for the installed version before reordering.
	st.set_page_config(page_title="PDF Debug Viewer", layout="wide")
	st.title("PDF Debug Viewer")

	# Default value offered in the "Split regex:" input further down. The
	# lookbehind keeps the '.', '!' or '?' attached to the preceding sentence
	# when the pattern is used with re.split.
	DEFAULT_SPLIT_REGEX = r'(?<=[.!?])\s+' # Matches space after sentence-ending punctuation

	def pixmap_to_image(pixmap):
	    """Build an RGB-mode PIL ``Image`` from a PyMuPDF pixmap.

	    The pixmap's ``width``, ``height`` and ``samples`` attributes are
	    passed to ``Image.frombytes`` unchanged.
	    """
	    # Body re-indented under the def; the flattened paste was not valid Python.
	    return Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)

	def extract_text_blocks(page, sort_order=True):
	    """Return the page's text blocks, sorted into reading order on request.

	    Args:
	        page: Object whose ``get_text("blocks")`` returns a list of block
	            records; items 0 and 1 of each record are used as x0 and y0.
	        sort_order: When true, the list is sorted in place by (y0, x0);
	            when false, it is returned in the order ``get_text`` produced.

	    Returns:
	        The list obtained from ``page.get_text("blocks")``.
	    """
	    blocks = page.get_text("blocks")  # Extract text with positions
	    if sort_order:
	        blocks.sort(key=lambda b: (b[1], b[0]))  # Top-to-bottom, then left-to-right
	    return blocks

	def chunk_text(blocks, split_regex, chunks_per_group):
	    """Split block text into sentences and group the sentences into chunks.

	    Args:
	        blocks: Iterable of block records; items 0-3 are taken as the block
	            rectangle and item 4 as its text.
	        split_regex: Pattern handed to ``re`` to break a block's text into
	            sentences.
	        chunks_per_group: Number of sentences joined into one chunk. The
	            value is converted with ``int()``, so ``3.0`` behaves like ``3``.

	    Returns:
	        A list of ``(chunk, rects)`` tuples. ``chunk`` is the sentences
	        joined by a single space; ``rects`` holds one source-block rectangle
	        per sentence, so a rectangle repeats when several sentences of the
	        chunk come from the same block.

	    Raises:
	        ValueError: If ``chunks_per_group`` is smaller than 1.
	        re.error: If ``split_regex`` is not a valid pattern.
	    """
	    group_size = int(chunks_per_group)
	    if group_size < 1:
	        raise ValueError("chunks_per_group must be at least 1")

	    splitter = re.compile(split_regex)  # Compile once instead of once per block
	    sentences = []
	    block_positions = []

	    for block in blocks:
	        text = block[4].strip()
	        if not text:
	            continue
	        # With capturing groups in the pattern, re.split also emits the
	        # captures - including None for groups that did not participate -
	        # so None has to be filtered out before calling strip().
	        pieces = [s.strip() for s in splitter.split(text) if s and s.strip()]
	        sentences.extend(pieces)
	        block_positions.extend([block[:4]] * len(pieces))  # Keep block coordinates

	    return [
	        (' '.join(sentences[i:i + group_size]), block_positions[i:i + group_size])
	        for i in range(0, len(sentences), group_size)
	    ]

	def highlight_chunks(page, chunks, zoom=2.0):
	    """Render the page with every chunk's source blocks outlined and numbered.

	    Temporary annotations are added to ``page`` for the render and deleted
	    again afterwards, also when adding annotations or rendering fails.

	    Args:
	        page: PyMuPDF page to annotate and render.
	        chunks: Sequence of ``(text, rects)`` tuples; only ``rects`` is used.
	        zoom: Scale factor applied on both axes of the render matrix.

	    Returns:
	        The pixmap returned by ``page.get_pixmap``.
	    """
	    mat = pymupdf.Matrix(zoom, zoom)
	    colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1), (1, 1, 0), (1, 0, 1), (0, 1, 1)]

	    annots = []
	    try:
	        # The text half of each tuple is unused; it is bound to `_text` so
	        # the module-level chunk_text function is not shadowed in this loop.
	        for idx, (_text, chunk_rects) in enumerate(chunks):
	            if not chunk_rects:
	                continue  # Nothing to outline and nothing to anchor the label to
	            color = colors[idx % len(colors)]

	            drawn = set()
	            for rect in chunk_rects:
	                key = tuple(rect)
	                if key in drawn:
	                    # Several sentences of one chunk can share a block; one
	                    # outline per distinct rectangle is enough.
	                    continue
	                drawn.add(key)
	                rect_annot = page.add_rect_annot(rect)
	                annots.append(rect_annot)  # Track first so cleanup always sees it
	                rect_annot.set_colors(stroke=color)
	                rect_annot.set_border(width=2)
	                rect_annot.update()

	            # Chunk number, placed just right of the chunk's first rectangle.
	            first_rect = chunk_rects[0]
	            text_annot = page.add_freetext_annot(
	                rect=(first_rect[2] + 5, first_rect[1], first_rect[2] + 25, first_rect[3]),
	                text=str(idx + 1),
	                fontsize=12,
	                text_color=color
	            )
	            annots.append(text_annot)

	        pix = page.get_pixmap(matrix=mat, alpha=False)
	    finally:
	        for annot in annots:
	            page.delete_annot(annot)

	    return pix

	# File uploader
	uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

	if uploaded_file is not None:
	    pdfCol1, pdfCol2 = st.columns(2)

	    # getvalue() yields the complete upload regardless of the stream
	    # position; filetype tells PyMuPDF how to interpret the in-memory data.
	    doc = pymupdf.open(stream=uploaded_file.getvalue(), filetype="pdf")
	    page_count = len(doc)
	    if page_count == 0:
	        doc.close()
	        st.warning("This PDF has no pages.")
	        st.stop()

	    with pdfCol1:
	        st.header("Page Viewer")
	        if page_count > 1:
	            page_num = st.slider("Select page", min_value=1, max_value=page_count, value=1) - 1
	        else:
	            # Only one page: there is nothing to choose, and a slider whose
	            # minimum equals its maximum is not a usable range.
	            page_num = 0
	        page = doc[page_num]

	    with pdfCol2:
	        st.header("PDF Metadata")
	        metadata = doc.metadata
	        st.json(metadata)

	    col1, col2 = st.columns(2)

	    # Extract text blocks (unordered and ordered)
	    unordered_blocks = extract_text_blocks(page, sort_order=False)
	    ordered_blocks = extract_text_blocks(page, sort_order=True)

	    unordered_text = "\n".join([b[4] for b in unordered_blocks])
	    ordered_text = "\n".join([b[4] for b in ordered_blocks])

	    # Add toggle for viewing unordered chunks
	    show_unordered = st.checkbox("Show Unordered Chunks (Raw Text Order)")

	    # Choose which version of blocks to process
	    selected_blocks = unordered_blocks if show_unordered else ordered_blocks

	    st.subheader("RAW TEXT REAL")
	    st.code(page.get_textpage().extractTEXT(sort=True), language="text")

	    with col2:
	        st.subheader("Raw Text")
	        st.subheader("Text Chunking")
	        split_regex = st.text_input("Split regex:", DEFAULT_SPLIT_REGEX)
	        chunks_per_group = st.number_input("Sentences per chunk:", min_value=1, value=3)

	        try:
	            chunks = chunk_text(selected_blocks, split_regex, chunks_per_group)
	        except re.error as exc:
	            # The pattern is typed by the user; report a bad one in the UI
	            # instead of aborting the whole script run.
	            st.error(f"Invalid split regex: {exc}")
	            chunks = []
	        st.subheader(f"Chunks ({len(chunks)})")

	        if chunks:
	            if len(chunks) > 1:
	                current_chunk = st.slider("Select a Chunk", min_value=1, max_value=len(chunks), value=1, step=1)
	            else:
	                current_chunk = 1  # Single chunk: no selection needed
	            st.code(chunks[current_chunk-1][0], language="text")

	        st.subheader("Full Text")
	        st.code(unordered_text if show_unordered else ordered_text, language="text")

	    with col1:
	        st.subheader("Page Image")
	        zoom = 2.0

	        viz_mode = st.radio("Visualization mode:", ["Show Chunks", "Normal"])

	        if viz_mode == "Show Chunks":
	            pix = highlight_chunks(page, chunks, zoom)
	        else:
	            mat = pymupdf.Matrix(zoom, zoom)
	            pix = page.get_pixmap(matrix=mat, alpha=False)

	        img = pixmap_to_image(pix)
	        st.image(img, use_container_width=True)

	    # Everything needed from the document has been rendered above.
	    doc.close()