Created
May 28, 2026 07:19
-
-
Save bpgergo/aa7ba57e5400d475f6361be6b4e5b047 to your computer and use it in GitHub Desktop.
Create a name index from a PDF book.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Name index generator: finds page numbers for names in a PDF book. | |
| Usage: | |
| python name_index.py <pdf_file> <names_file> [output_file] | |
| Names file format (one per line): | |
| John Smith -> searches for "John" (only the part before the first space) | |
| Forman, Milos -> searches for "Forman" (only the part before the comma) | |
| Output format: | |
| Name page1, page2, page3 | |
| """ | |
| import re | |
| import argparse | |
| import pdfplumber | |
# Sequences deleted outright from the extracted text: a hyphen directly
# followed by a line ending, and the soft hyphen. They are handled before
# REPLACE_SET so that "For-\nman" is rejoined as "Forman" and not "For man".
REMOVE_SET = [
    "-\r\n",  # hyphen + Windows line ending
    "-\n",  # hyphen + Unix line ending
    "-\r",  # hyphen + old Mac line ending
    "\xad",  # Unicode soft hyphen (U+00AD)
]

# Sequences turned into one space: line endings that separate two words.
REPLACE_SET = [
    "\r\n",  # Windows line ending
    "\r",  # old Mac line ending
    "\n",  # Unix line ending
]


def normalize_text(text):
    """Rejoin hyphenated line breaks, flatten line endings and squeeze spaces.

    Every REMOVE_SET sequence is deleted first, then every REPLACE_SET
    sequence becomes a single space, and finally each run of two or more
    spaces is reduced to one space. Other whitespace (e.g. tabs) is kept.
    """
    # One ordered substitution table; the removals must stay ahead of the
    # replacements, and "\r\n" ahead of the bare "\r" / "\n" entries.
    substitutions = [(sequence, "") for sequence in REMOVE_SET]
    substitutions += [(sequence, " ") for sequence in REPLACE_SET]
    for old, new in substitutions:
        text = text.replace(old, new)
    # The substitutions above can leave several spaces in a row.
    return re.sub(r" {2,}", " ", text)
def load_names(names_file):
    """Read the names to index from a text file, one name per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped; the remaining names are returned in file order.

    Args:
        names_file: Path of the UTF-8 encoded names file.

    Returns:
        A list of the non-empty, stripped lines.
    """
    # "utf-8-sig" behaves like UTF-8 but also drops a leading byte-order
    # mark. With plain "utf-8" the BOM would stay attached to the first name
    # and that name would never be found in the PDF text.
    with open(names_file, encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        return [name for name in stripped_lines if name]
def get_search_term(name):
    """Return the leading part of *name* that is used as the search term.

    The name is cut at its first comma when it contains one, otherwise at
    its first space; the text before the cut is returned with surrounding
    whitespace stripped. A name with neither is returned whole.

    'Forman, Milos'   -> 'Forman'
    'De Niro, Robert' -> 'De Niro'
    'John Smith'      -> 'John'
    'Madonna'         -> 'Madonna'
    """
    separator = "," if "," in name else " "
    head, _, _ = name.partition(separator)
    return head.strip()
def build_index(pdf_path, names, stop_page=None):
    """Find, for every name, the PDF pages whose text contains its search term.

    Each page's extracted text is normalized with normalize_text() and
    lower-cased, then checked for the lower-cased get_search_term() of every
    name using a plain substring test.

    Args:
        pdf_path: Path of the PDF file to scan.
        names: Names to look up. A name that occurs more than once is
            searched only once.
        stop_page: 1-based page number at which scanning stops; that page
            itself is not scanned. ``None`` (the default) scans every page.

    Returns:
        A dict mapping each distinct name to the ascending list of 1-based
        page numbers on which its search term was found (empty if none).
    """
    index = {}
    search_terms = []
    # dict.fromkeys() drops repeated names while keeping first-seen order;
    # otherwise a name listed twice would get every page appended twice.
    for name in dict.fromkeys(names):
        index[name] = []
        term = get_search_term(name).lower()
        # An empty term (e.g. from ", Milos") is a substring of every page,
        # so it is left out and the name keeps an empty page list.
        if term:
            search_terms.append((name, term))

    with pdfplumber.open(pdf_path) as pdf:
        total = len(pdf.pages)
        for page_num, page in enumerate(pdf.pages, start=1):
            if page_num == stop_page:
                print(f'Stopping processing at {stop_page}')
                break
            # Progress line, rewritten in place thanks to the trailing "\r".
            if page_num % 50 == 0 or page_num == total:
                print(f" Page {page_num}/{total}", end="\r")
            text = page.extract_text()
            if not text:
                # Nothing extractable on this page (e.g. image-only page).
                continue
            text_lower = normalize_text(text).lower()
            for name, term in search_terms:
                if term in text_lower:
                    index[name].append(page_num)
    # Move past the "\r" progress line.
    print()
    return index
def write_output(index, names, output_path):
    """Write the index to *output_path*, one "Name page1, page2, ..." line per name.

    Lines follow the order of *names*; the pages of each name are looked up
    in *index*. A name without pages is still written, followed by a space.
    The file is created or overwritten, encoded as UTF-8.
    """

    def render(entry):
        # Format one output line for a single name.
        pages_str = ", ".join(map(str, index[entry]))
        return f"{entry} {pages_str}\n"

    with open(output_path, "w", encoding="utf-8") as out:
        out.writelines(render(entry) for entry in names)
def main(argv=None):
    """Command-line entry point: parse arguments, build the index, write it.

    Accepted command lines:
        <pdf_file> <names_file>
        <pdf_file> <names_file> <output_file>
        <pdf_file> <names_file> <output_file> <stop_page>
        <pdf_file> <names_file> <stop_page>      (legacy form, see below)

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Create a name index from a PDF book.")
    parser.add_argument("pdf_file", help="Input PDF file")
    parser.add_argument("names_file", help="Text file with names, one per line")
    parser.add_argument(
        "output_file",
        nargs="?",
        default="index.txt",
        help="Output file (default: index.txt)",
    )
    # A positional without nargs is always required and ignores `default`,
    # which made the documented "<pdf_file> <names_file> [output_file]"
    # invocation fail. nargs="?" makes it truly optional; None means
    # "process the whole PDF".
    parser.add_argument(
        "stop_page",
        nargs="?",
        type=int,
        default=None,
        help="Stop processing pdf at this page (default: process every page)",
    )
    args = parser.parse_args(argv)

    output_file = args.output_file
    stop_page = args.stop_page
    if stop_page is None:
        # Legacy form: when stop_page was a required positional, a third
        # argument was taken as the stop page. Keep accepting an integer
        # there so existing command lines behave as before.
        try:
            stop_page = int(output_file)
        except ValueError:
            pass
        else:
            output_file = "index.txt"

    print(f"Loading names from '{args.names_file}'...")
    names = load_names(args.names_file)
    print(f" {len(names)} names loaded.")

    print(f"Scanning '{args.pdf_file}'... stop page: {stop_page}")
    index = build_index(args.pdf_file, names, stop_page)

    print(f"Writing output to '{output_file}'...")
    write_output(index, names, output_file)

    # Count per listed name so that found + not_found == len(names) even
    # when the names file contains the same name more than once.
    found = sum(1 for name in names if index[name])
    not_found = len(names) - found
    print(f"Done. {found}/{len(names)} names found.", end="")
    if not_found:
        print(f" ({not_found} not found — included with empty page list.)")
    else:
        print()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment