Skip to content

Instantly share code, notes, and snippets.

@flash42
Created March 5, 2026 14:04
Show Gist options
  • Select an option

  • Save flash42/6d3b624b7d26dcac4d5781c65c87d7a2 to your computer and use it in GitHub Desktop.

Select an option

Save flash42/6d3b624b7d26dcac4d5781c65c87d7a2 to your computer and use it in GitHub Desktop.
pdf_page_check.py
import re
class PDFMiniError(Exception):
    """Error raised by the minimal PDF reader when it cannot find or parse a structure."""
def _read_tail(fp, n=65536):
fp.seek(0, 2)
size = fp.tell()
fp.seek(max(0, size - n), 0)
return fp.read()
def _find_startxref(tail: bytes) -> int:
m = re.search(rb"startxref\s+(\d+)\s+%%EOF", tail)
if not m:
m = re.search(rb"startxref\s+(\d+)\s*", tail)
if not m:
raise PDFMiniError("Could not find startxref in file tail.")
return int(m.group(1))
def _read_at(fp, offset, n=4096):
fp.seek(offset, 0)
return fp.read(n)
def _skip_ws(b: bytes, i: int) -> int:
while i < len(b) and b[i] in b" \t\r\n\f\0":
i += 1
return i
def _parse_pdf_name(b: bytes, i: int):
i += 1
j = i
while j < len(b) and b[j] not in b" \t\r\n\f\0/[]()<>%":
j += 1
return b[i:j].decode("latin-1"), j
def _parse_pdf_int(b: bytes, i: int):
i = _skip_ws(b, i)
m = re.match(rb"[+-]?\d+", b[i:])
if not m:
raise PDFMiniError("Expected integer.")
s = m.group(0)
return int(s), i + len(s)
def _skip_opaque_value(b: bytes, i: int) -> int:
    """Return the index just past the array or string value starting at ``b[i]``.

    Handles literal strings ``(...)`` (nested parentheses and backslash
    escapes), hex strings ``<...>`` and arrays ``[...]`` (which may contain
    strings, nested arrays and dictionaries). For anything else, or when the
    closing delimiter is not in the buffer, ``i + 1`` is returned so the
    caller keeps scanning byte by byte (best effort).
    """
    n = len(b)
    opener = b[i:i + 1]

    if opener == b"(":
        depth = 0
        j = i
        while j < n:
            c = b[j]
            if c == 0x5C:  # backslash: the following byte is escaped
                j += 2
                continue
            if c == 0x28:  # '('
                depth += 1
            elif c == 0x29:  # ')'
                depth -= 1
                if depth == 0:
                    return j + 1
            j += 1
        return i + 1

    if opener == b"<":
        # Hex string; callers test for "<<" before reaching this branch.
        close = b.find(b">", i + 1)
        return close + 1 if close != -1 else i + 1

    if opener == b"[":
        j = i + 1
        while j < n:
            if b[j:j + 2] == b"<<":
                try:
                    _, j = _parse_literal_dict(b, j)
                except PDFMiniError:
                    return i + 1
                continue
            c = b[j:j + 1]
            if c == b"]":
                return j + 1
            if c in (b"[", b"(", b"<"):
                j = _skip_opaque_value(b, j)
                continue
            j += 1
        return i + 1

    return i + 1


def _parse_dict_value(b: bytes, i: int):
    """Parse the value that starts at ``b[i]`` and return ``(value, next_index)``.

    Recognised forms: indirect reference ``N G R`` -> ``("ref", N, G)``,
    integer -> ``int``, name -> ``("name", text)``, nested dictionary ->
    ``dict``. Every other value is stored as ``None``.
    """
    try:
        first, after_first = _parse_pdf_int(b, i)
    except PDFMiniError:
        pass
    else:
        # An integer may be the start of an indirect reference "N G R".
        try:
            second, k = _parse_pdf_int(b, _skip_ws(b, after_first))
        except PDFMiniError:
            return first, after_first
        k = _skip_ws(b, k)
        if b[k:k + 1] == b"R":
            return ("ref", first, second), k + 1
        return first, after_first

    if b[i:i + 1] == b"/":
        name, j = _parse_pdf_name(b, i)
        return ("name", name), j
    if b[i:i + 2] == b"<<":
        return _parse_literal_dict(b, i)
    # Arrays and strings are not interpreted, but they are skipped as a whole
    # so that a ">>" or "/Name" inside them is not mistaken for the end of
    # this dictionary or for one of its keys.
    return None, _skip_opaque_value(b, i)


def _parse_literal_dict(b: bytes, i: int):
    """Parse the ``<< ... >>`` dictionary that starts at ``b[i]``.

    Only a subset of value types is interpreted (see ``_parse_dict_value``);
    unsupported values are recorded as ``None``. Bytes between entries that do
    not start a key are skipped one at a time.

    Args:
        b: Buffer containing the dictionary.
        i: Index of the opening ``<<``.

    Returns:
        ``(mapping, next_index)`` where *mapping* maps key names (without the
        slash) to parsed values and *next_index* is just past the closing
        ``>>``.

    Raises:
        PDFMiniError: If ``b[i:i+2]`` is not ``<<`` or the buffer ends before
            the closing ``>>``.
    """
    if b[i:i + 2] != b"<<":
        raise PDFMiniError("Expected '<<' dictionary.")
    i += 2
    d = {}
    while True:
        i = _skip_ws(b, i)
        if b[i:i + 2] == b">>":
            return d, i + 2
        if i >= len(b):
            raise PDFMiniError("Unterminated dictionary in buffer.")
        if b[i] != ord("/"):
            # Not a key: leftover of a value this reader does not understand.
            i += 1
            continue
        key, i = _parse_pdf_name(b, i)
        i = _skip_ws(b, i)
        d[key], i = _parse_dict_value(b, i)
def _find_object_offset_streaming(fp, obj_num: int, gen_num: int, chunk_size=65536):
pat = re.compile(rb"\b%d\s+%d\s+obj\b" % (obj_num, gen_num))
overlap = b""
base_offset = 0
fp.seek(0, 0)
while True:
chunk = fp.read(chunk_size)
if not chunk:
return None
data = overlap + chunk
m = pat.search(data)
if m:
return base_offset - len(overlap) + m.start()
# Keep enough bytes to catch a boundary-split pattern
overlap = data[-64:]
base_offset += len(chunk)
def _read_object(fp, obj_num: int, gen_num: int, chunk_size=65536):
    """Return the body of object ``obj_num gen_num`` without reading the whole file.

    The object header is located with ``_find_object_offset_streaming`` and
    the file is then read forward chunk by chunk until the first ``endobj``.

    Args:
        fp: Seekable file-like object opened in binary mode.
        obj_num: Object number.
        gen_num: Generation number.
        chunk_size: Number of bytes read per iteration.

    Returns:
        The bytes between the ``N G obj`` header and the first ``endobj``,
        with surrounding whitespace stripped.

    Raises:
        PDFMiniError: If the object header is not found, ``endobj`` is not
            reached before end of file, or more than 2,000,000 bytes were
            read without finding ``endobj``.
    """
    obj_offset = _find_object_offset_streaming(fp, obj_num, gen_num, chunk_size=chunk_size)
    if obj_offset is None:
        raise PDFMiniError(f"Object {obj_num} {gen_num} not found.")
    fp.seek(obj_offset, 0)

    marker = b"endobj"
    # Read forward until endobj, without buffering the whole file.
    buf = bytearray()
    while True:
        chunk = fp.read(chunk_size)
        if not chunk:
            raise PDFMiniError(f"endobj not found for {obj_num} {gen_num}.")
        # Only the new bytes need searching; back up len(marker) - 1 bytes so
        # a marker split across two chunks is still seen.
        resume = max(0, len(buf) - (len(marker) - 1))
        buf += chunk
        end = buf.find(marker, resume)
        if end != -1:
            m = re.search(rb"\b%d\s+%d\s+obj\b" % (obj_num, gen_num), buf)
            if not m:
                raise PDFMiniError(f"Object header vanished for {obj_num} {gen_num}.")
            return bytes(buf[m.end():end]).strip()
        # Prevent unbounded growth if endobj is weirdly far away
        if len(buf) > 2_000_000:
            raise PDFMiniError(f"Object {obj_num} {gen_num} is too large for this minimal reader.")
def _find_trailer_offset(fp, start: int, chunk_size: int = 65536):
    """Return the absolute offset of the first ``trailer`` keyword at or after *start*, or ``None``."""
    keyword = b"trailer"
    keep = len(keyword) - 1  # bytes carried over so a split keyword is still found
    fp.seek(start, 0)
    carry = b""
    pos = start  # file offset of the first byte of the next chunk
    while True:
        chunk = fp.read(chunk_size)
        if not chunk:
            return None
        window = carry + chunk
        hit = window.find(keyword)
        if hit != -1:
            return pos - len(carry) + hit
        carry = window[-keep:]
        pos += len(chunk)


def _load_dict_object(fp, obj_num: int, gen_num: int, label: str):
    """Read object ``obj_num gen_num`` and parse the first ``<<...>>`` dictionary in its body."""
    body = _read_object(fp, obj_num, gen_num)
    start = body.find(b"<<")
    if start == -1:
        raise PDFMiniError(f"{label} object is not a dictionary in this minimal reader.")
    parsed, _ = _parse_literal_dict(body, start)
    return parsed


def pdf_page_count_minimal(path: str) -> int:
    """Return the ``/Count`` of the root page tree of a classic-xref PDF.

    Steps: read ``startxref`` from the file tail, locate the ``trailer``
    dictionary that follows the xref table, follow ``/Root`` to the catalog,
    follow ``/Pages`` to the page-tree root and return its integer ``/Count``.

    The ``trailer`` keyword is searched for by streaming forward from the xref
    offset rather than only in the first 16 KiB, so files whose xref table is
    longer than that window are handled too.

    Args:
        path: Filesystem path of the PDF file.

    Returns:
        The integer stored under ``/Count`` in the ``/Pages`` object.

    Raises:
        PDFMiniError: If the file uses an xref stream, or if any of the
            trailer, ``/Root``, ``/Pages`` or ``/Count`` entries cannot be
            found in the form this reader understands.
        OSError: If the file cannot be opened.
    """
    with open(path, "rb") as fp:
        tail = _read_tail(fp, n=65536)
        sx = _find_startxref(tail)

        head = _read_at(fp, sx, n=16384)
        if not head.lstrip().startswith(b"xref"):
            raise PDFMiniError("This looks like an xref stream or non-classic xref; minimal reader can't handle it.")

        trailer_offset = _find_trailer_offset(fp, sx)
        if trailer_offset is None:
            raise PDFMiniError("No trailer found near startxref.")
        # Re-read starting at the keyword so the dictionary is not cut off by
        # the end of the xref-table window.
        trailer_bytes = _read_at(fp, trailer_offset, n=65536)
        dpos = trailer_bytes.find(b"<<")
        if dpos == -1:
            raise PDFMiniError("Trailer dictionary '<<' not found.")
        trailer, _ = _parse_literal_dict(trailer_bytes, dpos)

        root = trailer.get("Root")
        if not (isinstance(root, tuple) and root[0] == "ref"):
            raise PDFMiniError("Trailer missing /Root indirect reference.")
        _, root_obj, root_gen = root
        root_dict = _load_dict_object(fp, root_obj, root_gen, "Root")

        pages = root_dict.get("Pages")
        if not (isinstance(pages, tuple) and pages[0] == "ref"):
            raise PDFMiniError("Root dictionary missing /Pages reference.")
        _, pages_obj, pages_gen = pages
        pages_dict = _load_dict_object(fp, pages_obj, pages_gen, "Pages")

        count = pages_dict.get("Count")
        if not isinstance(count, int):
            raise PDFMiniError("/Count not found as integer on /Pages object.")
        return count
if __name__ == "__main__":
    import sys

    # Use the first command-line argument as the PDF path when given;
    # otherwise keep the previous default of "file.pdf".
    target_path = sys.argv[1] if len(sys.argv) > 1 else "file.pdf"
    print(pdf_page_count_minimal(target_path))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment