Created
March 5, 2026 14:04
-
-
Save flash42/6d3b624b7d26dcac4d5781c65c87d7a2 to your computer and use it in GitHub Desktop.
pdf_page_check.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
class PDFMiniError(Exception):
    """Raised when this minimal PDF reader cannot locate or parse a structure."""
| def _read_tail(fp, n=65536): | |
| fp.seek(0, 2) | |
| size = fp.tell() | |
| fp.seek(max(0, size - n), 0) | |
| return fp.read() | |
| def _find_startxref(tail: bytes) -> int: | |
| m = re.search(rb"startxref\s+(\d+)\s+%%EOF", tail) | |
| if not m: | |
| m = re.search(rb"startxref\s+(\d+)\s*", tail) | |
| if not m: | |
| raise PDFMiniError("Could not find startxref in file tail.") | |
| return int(m.group(1)) | |
| def _read_at(fp, offset, n=4096): | |
| fp.seek(offset, 0) | |
| return fp.read(n) | |
| def _skip_ws(b: bytes, i: int) -> int: | |
| while i < len(b) and b[i] in b" \t\r\n\f\0": | |
| i += 1 | |
| return i | |
| def _parse_pdf_name(b: bytes, i: int): | |
| i += 1 | |
| j = i | |
| while j < len(b) and b[j] not in b" \t\r\n\f\0/[]()<>%": | |
| j += 1 | |
| return b[i:j].decode("latin-1"), j | |
| def _parse_pdf_int(b: bytes, i: int): | |
| i = _skip_ws(b, i) | |
| m = re.match(rb"[+-]?\d+", b[i:]) | |
| if not m: | |
| raise PDFMiniError("Expected integer.") | |
| s = m.group(0) | |
| return int(s), i + len(s) | |
def _parse_literal_dict(b: bytes, i: int):
    """Parse the PDF dictionary starting with '<<' at index *i* of *b*.

    Returns (dict, index_just_past_the_closing '>>'). Values are decoded as:
    indirect references -> ("ref", num, gen); integers -> int; names ->
    ("name", str); nested dictionaries -> dict (parsed recursively); anything
    unrecognized -> None. Raises PDFMiniError on a missing '<<' or an
    unterminated dictionary.
    """
    if b[i:i+2] != b"<<":
        raise PDFMiniError("Expected '<<' dictionary.")
    i += 2
    d = {}
    while True:
        i = _skip_ws(b, i)
        # '>>' ends the dictionary; checked before the length guard because
        # the slice is safely empty at end-of-buffer.
        if b[i:i+2] == b">>":
            i += 2
            return d, i
        if i >= len(b):
            raise PDFMiniError("Unterminated dictionary in buffer.")
        # Keys must be names; skip a single byte of anything else (e.g. array
        # or string content we do not model) and re-scan.
        if b[i] != ord("/"):
            i += 1
            continue
        key, i = _parse_pdf_name(b, i)
        i = _skip_ws(b, i)
        # First attempt: an indirect reference "num gen R".
        try:
            a, j = _parse_pdf_int(b, i)
            j = _skip_ws(b, j)
            bnum, k = _parse_pdf_int(b, j)
            k = _skip_ws(b, k)
            if k < len(b) and b[k:k+1] == b"R":
                d[key] = ("ref", a, bnum)
                i = k + 1
                continue
            # Two ints but no 'R': fall through and re-parse as a plain int.
        except PDFMiniError:
            pass
        # Second attempt: a plain integer value.
        try:
            val, j = _parse_pdf_int(b, i)
            d[key] = val
            i = j
            continue
        except PDFMiniError:
            pass
        # Third attempt: a name value (/Foo).
        if i < len(b) and b[i] == ord("/"):
            name, j = _parse_pdf_name(b, i)
            d[key] = ("name", name)
            i = j
            continue
        # Fourth attempt: a nested dictionary, parsed recursively.
        if b[i:i+2] == b"<<":
            nd, j = _parse_literal_dict(b, i)
            d[key] = nd
            i = j
            continue
        # Unsupported value type (array, string, boolean, ...): record the
        # key with None and advance one byte so the loop makes progress.
        d[key] = None
        i += 1
| def _find_object_offset_streaming(fp, obj_num: int, gen_num: int, chunk_size=65536): | |
| pat = re.compile(rb"\b%d\s+%d\s+obj\b" % (obj_num, gen_num)) | |
| overlap = b"" | |
| base_offset = 0 | |
| fp.seek(0, 0) | |
| while True: | |
| chunk = fp.read(chunk_size) | |
| if not chunk: | |
| return None | |
| data = overlap + chunk | |
| m = pat.search(data) | |
| if m: | |
| return base_offset - len(overlap) + m.start() | |
| # Keep enough bytes to catch a boundary-split pattern | |
| overlap = data[-64:] | |
| base_offset += len(chunk) | |
def _read_object(fp, obj_num: int, gen_num: int, chunk_size=65536):
    """Return the body bytes of indirect object "obj_num gen_num".

    The result is everything between the "N G obj" header and the first
    following "endobj" keyword, stripped of surrounding whitespace. Raises
    PDFMiniError when the object, its header, or "endobj" cannot be found,
    or when the accumulated object exceeds a 2 MB safety limit.
    """
    obj_offset = _find_object_offset_streaming(fp, obj_num, gen_num, chunk_size=chunk_size)
    if obj_offset is None:
        raise PDFMiniError(f"Object {obj_num} {gen_num} not found.")
    fp.seek(obj_offset, 0)
    header = rb"\b%d\s+%d\s+obj\b" % (obj_num, gen_num)
    # Accumulate forward from the header until "endobj" appears, without
    # ever buffering the whole file.
    buf = b""
    while True:
        chunk = fp.read(chunk_size)
        if not chunk:
            raise PDFMiniError(f"endobj not found for {obj_num} {gen_num}.")
        buf += chunk
        end = buf.find(b"endobj")
        if end != -1:
            m = re.search(header, buf)
            if not m:
                raise PDFMiniError(f"Object header vanished for {obj_num} {gen_num}.")
            return buf[m.end():end].strip()
        # Prevent unbounded growth if endobj is weirdly far away
        if len(buf) > 2_000_000:
            raise PDFMiniError(f"Object {obj_num} {gen_num} is too large for this minimal reader.")
def pdf_page_count_minimal(path: str) -> int:
    """Return the page count of the PDF at *path* using a minimal parser.

    Strategy: read the file tail to find startxref, require a classic
    (non-stream) xref table, parse the trailer dictionary for /Root, follow
    it to the catalog's /Pages node, and return that node's /Count.

    Raises PDFMiniError for xref streams, missing trailer/Root/Pages/Count,
    or any object this minimal reader cannot parse. NOTE(review): /Count on
    the root /Pages node is the total page count per the PDF spec, but this
    reader trusts the file rather than walking the page tree.
    """
    with open(path, "rb") as fp:
        tail = _read_tail(fp, n=65536)
        sx = _find_startxref(tail)
        head = _read_at(fp, sx, n=16384)
        # Classic xref tables start with the literal keyword "xref";
        # cross-reference *streams* (PDF 1.5+) start with an object header
        # instead and are out of scope here.
        if not head.lstrip().startswith(b"xref"):
            raise PDFMiniError("This looks like an xref stream or non-classic xref; minimal reader can't handle it.")
        tpos = head.find(b"trailer")
        if tpos == -1:
            raise PDFMiniError("No trailer found near startxref.")
        dpos = head.find(b"<<", tpos)
        if dpos == -1:
            raise PDFMiniError("Trailer dictionary '<<' not found.")
        trailer, _ = _parse_literal_dict(head, dpos)
        # /Root must be an indirect reference ("ref", num, gen) to the catalog.
        root = trailer.get("Root")
        if not (isinstance(root, tuple) and root[0] == "ref"):
            raise PDFMiniError("Trailer missing /Root indirect reference.")
        _, root_obj, root_gen = root
        root_obj_bytes = _read_object(fp, root_obj, root_gen)
        p = root_obj_bytes.find(b"<<")
        if p == -1:
            raise PDFMiniError("Root object is not a dictionary in this minimal reader.")
        root_dict, _ = _parse_literal_dict(root_obj_bytes, p)
        # The catalog's /Pages reference leads to the page-tree root.
        pages = root_dict.get("Pages")
        if not (isinstance(pages, tuple) and pages[0] == "ref"):
            raise PDFMiniError("Root dictionary missing /Pages reference.")
        _, pages_obj, pages_gen = pages
        pages_obj_bytes = _read_object(fp, pages_obj, pages_gen)
        pp = pages_obj_bytes.find(b"<<")
        if pp == -1:
            raise PDFMiniError("Pages object is not a dictionary in this minimal reader.")
        pages_dict, _ = _parse_literal_dict(pages_obj_bytes, pp)
        count = pages_dict.get("Count")
        if not isinstance(count, int):
            raise PDFMiniError("/Count not found as integer on /Pages object.")
        return count
if __name__ == "__main__":
    import sys

    # Generalized: accept an optional path argument; default preserves the
    # original hard-coded "file.pdf" behavior.
    target = sys.argv[1] if len(sys.argv) > 1 else "file.pdf"
    print(pdf_page_count_minimal(target))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment