Skip to content

Instantly share code, notes, and snippets.

@bpgergo
Created May 28, 2026 07:19
Show Gist options
  • Select an option

  • Save bpgergo/aa7ba57e5400d475f6361be6b4e5b047 to your computer and use it in GitHub Desktop.

Select an option

Save bpgergo/aa7ba57e5400d475f6361be6b4e5b047 to your computer and use it in GitHub Desktop.
Create name index from a pdf book.
#!/usr/bin/env python3
"""
Name index generator: finds page numbers for names in a PDF book.
Usage:
python name_index.py <pdf_file> <names_file> [output_file] <stop_page>
Names file format (one per line):
John Smith -> searches for "John" (no comma: only the part before the first space)
Forman, Milos -> searches for "Forman" (only the part before the comma)
Output format:
Name page1, page2, page3
"""
import re
import argparse
import pdfplumber
# Sequences deleted outright from the extracted text so that a word broken
# across a line end is rejoined: "For-\nman" must become "Forman", not
# "For man".  normalize_text applies these, in list order, before REPLACE_SET
# turns the remaining line endings into spaces, so every "<hyphen> + line
# ending" pair has to be listed ahead of the bare characters it contains.
REMOVE_SET = [
    "-\r\n",     # hyphen + Windows line ending
    "-\n",       # hyphen + Unix line ending
    "-\r",       # hyphen + old Mac line ending
    "\xad\r\n",  # soft hyphen + Windows line ending
    "\xad\n",    # soft hyphen + Unix line ending
    "\xad\r",    # soft hyphen + old Mac line ending
    "\xad",      # Unicode soft hyphen (U+00AD) anywhere else
]
# Line endings that are still present after REMOVE_SET has been applied
# separate words, so each one is swapped for a single space.  Order: Windows
# ("\r\n") first, then old Mac ("\r"), then Unix ("\n"), so that a Windows
# line ending is consumed as one unit.
REPLACE_SET = ["\r\n", "\r", "\n"]
def normalize_text(text):
    """Rejoin hyphen-split words and flatten line endings in extracted text.

    Every sequence in REMOVE_SET is deleted first, then every sequence in
    REPLACE_SET is turned into a single space.  The order matters: a hyphen
    followed by a line ending has to vanish as a unit before the bare line
    ending would otherwise become a space.  Finally, runs of two or more
    spaces are collapsed into one.

    Args:
        text: The string to clean up.

    Returns:
        The cleaned-up string.  Only the space character is collapsed; tabs
        and other whitespace are left untouched.
    """
    for seq in REMOVE_SET:
        text = text.replace(seq, "")
    for seq in REPLACE_SET:
        text = text.replace(seq, " ")
    # The replacements above can leave runs of spaces behind ("a \n b").
    return re.sub(r" {2,}", " ", text)
def load_names(names_file):
    """Read the names to index from a text file, one name per line.

    Leading and trailing whitespace is stripped from every line, and lines
    that are empty afterwards are skipped.  Order and duplicates are kept.

    Args:
        names_file: Path of a UTF-8 encoded text file (with or without BOM).

    Returns:
        A list of the non-empty, stripped lines in file order.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but also swallows a leading
    # byte-order mark.  With plain "utf-8" the BOM would stay glued to the
    # first name (str.strip() does not remove U+FEFF), so it could never match.
    with open(names_file, encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        return [name for name in stripped_lines if name]
def get_search_term(name):
    """Return the part of *name* that is actually searched for in the book.

    Only the leading component of the name is used:
        'Forman, Milos' -> 'Forman'   (text before the first comma)
        'John Smith'    -> 'John'     (no comma: text before the first space)
        'Madonna'       -> 'Madonna'  (no comma or space: the whole name)

    Args:
        name: One name entry, e.g. a line from the names file.

    Returns:
        The leading component with surrounding whitespace stripped.  This is
        an empty string when nothing precedes the separator (e.g. ', Milos').
    """
    # A comma takes precedence: "de la Cruz, Juan" keeps its inner spaces.
    separator = "," if "," in name else " "
    head, _, _ = name.partition(separator)
    return head.strip()
def build_index(pdf_path, names, stop_page):
    """Map every name to the PDF pages on which its search term occurs.

    Each page's text is extracted, normalized and lower-cased, then tested
    for every search term with a plain, case-insensitive substring match
    (so the term 'forman' also matches inside 'performance').

    Args:
        pdf_path: Path of the PDF to scan.
        names: Iterable of name strings; the search term of each one is
            derived with get_search_term.
        stop_page: 1-based page number at which scanning stops.  The stop
            page itself is not scanned; a value that never equals a page
            number (0, negative, beyond the last page) scans the whole file.

    Returns:
        A dict mapping each name to an ascending list of 1-based page
        numbers.  Names that were never found map to an empty list.
    """
    # Keyed by name so that a name listed twice is searched only once;
    # otherwise every hit would be appended once per duplicate ("12, 12").
    terms_by_name = {name: get_search_term(name).lower() for name in names}
    # An empty term (e.g. from ', Milos') is a substring of every string and
    # would "match" every page, so such names are left with an empty list.
    search_terms = [(name, term) for name, term in terms_by_name.items() if term]
    index = {name: [] for name in terms_by_name}
    with pdfplumber.open(pdf_path) as pdf:
        total = len(pdf.pages)
        for page_num, page in enumerate(pdf.pages, start=1):
            if page_num == stop_page:
                print(f'Stopping processing at {stop_page}')
                break
            if page_num % 50 == 0 or page_num == total:
                # "\r" keeps the progress on one console line; flush because
                # no newline is written that would push the text out.
                print(f" Page {page_num}/{total}", end="\r", flush=True)
            text = page.extract_text()
            if not text:
                continue  # nothing extractable on this page
            text_lower = normalize_text(text).lower()
            for name, term in search_terms:
                if term in text_lower:
                    index[name].append(page_num)
    # Move past the "\r" progress line.
    print()
    return index
def write_output(index, names, output_path):
    """Write the index as UTF-8 text, one line per name.

    Each line is the name, a single space, and the page numbers joined by
    ', '.  A name without pages still gets its line (the name followed by
    the space), so the output has one line per entry in *names*, in order.

    Args:
        index: Dict mapping each name to its list of page numbers.
        names: Names in the order they should be written.
        output_path: File to create or overwrite.

    Raises:
        KeyError: If a name in *names* is missing from *index*.
    """
    with open(output_path, "w", encoding="utf-8") as f:
        for name in names:
            pages_str = ", ".join(map(str, index[name]))
            f.write(f"{name} {pages_str}\n")
def main():
    """Command-line entry point: parse arguments, scan the PDF, write the index.

    Positional arguments, in order: pdf_file, names_file, an optional
    output_file (default 'index.txt') and a required integer stop_page.
    Progress and a found/not-found summary are printed to stdout.
    """
    parser = argparse.ArgumentParser(description="Create a name index from a PDF book.")
    parser.add_argument("pdf_file", help="Input PDF file")
    parser.add_argument("names_file", help="Text file with names, one per line")
    parser.add_argument(
        "output_file",
        nargs="?",
        default="index.txt",
        help="Output file (default: index.txt)",
    )
    # stop_page is a required positional, so argparse never applies a default
    # to it.  type=int turns a non-numeric value into a normal usage error
    # instead of a ValueError traceback after the names have been loaded.
    parser.add_argument("stop_page", type=int, help="Stop processing pdf at this page")
    args = parser.parse_args()

    print(f"Loading names from '{args.names_file}'...")
    names = load_names(args.names_file)
    print(f" {len(names)} names loaded.")

    stop_page = args.stop_page
    print(f"Scanning '{args.pdf_file}'... stop page: {stop_page}")
    index = build_index(args.pdf_file, names, stop_page)

    print(f"Writing output to '{args.output_file}'...")
    write_output(index, names, args.output_file)

    # Count per entry of `names` (not per dict key) so the tally agrees with
    # len(names) even when the names file lists a name more than once.
    found = sum(1 for name in names if index[name])
    not_found = len(names) - found
    print(f"Done. {found}/{len(names)} names found.", end="")
    if not_found:
        print(f" ({not_found} not found — included with empty page list.)")
    else:
        print()
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment