bpgergo · May 28, 2026 07:19
diff --git a/name_index.py b/name_index.py
 #!/usr/bin/env python3
 """
 Name index generator: finds page numbers for names in a PDF book.

 Usage:
    python name_index.py <pdf_file> <names_file> [output_file]

 Names file format (one per line):
    John Smith          -> searches for "John" (only the first word)
    Forman, Milos       -> searches for "Forman" (only the part before the comma)

 Output format:
    Name   page1, page2, page3
 """

 import re
 import argparse
 import pdfplumber


 # Sequences removed from extracted text (e.g. soft hyphens, hyphen+newline).
 # Applied before the replace set so that "For-\nman" becomes "Forman",
 # not "For man".  Longer sequences come before the shorter ones they
 # contain, so a line-break hyphen is removed together with its line ending.
 REMOVE_SET = [
    "-\r\n",      # hyphen + Windows line ending
    "-\n",        # hyphen + Unix line ending
    "-\r",        # hyphen + old Mac line ending
    "\xad\r\n",   # soft hyphen + Windows line ending
    "\xad\n",     # soft hyphen + Unix line ending
    "\xad\r",     # soft hyphen + old Mac line ending
    "\xad",       # Unicode soft hyphen (U+00AD) in the middle of a line
 ]

 # Sequences replaced with a single space (e.g. line endings that split words).
 REPLACE_SET = [
    "\r\n",    # Windows line ending
    "\r",      # old Mac line ending
    "\n",      # Unix line ending
 ]


 def normalize_text(text):
    """Flatten extracted page text so names can be found across line breaks.

    Every sequence in REMOVE_SET is deleted first, which re-joins words that
    were hyphenated at a line end (with a regular or a soft hyphen).  Every
    sequence in REPLACE_SET is then turned into a space, and runs of two or
    more spaces are collapsed into one.

    Args:
        text: Raw text of one page.

    Returns:
        The normalized text; letter case is left untouched.
    """
    for seq in REMOVE_SET:
        text = text.replace(seq, "")
    for seq in REPLACE_SET:
        text = text.replace(seq, " ")
    # A line ending next to an existing space leaves a run of spaces behind.
    return re.sub(r" {2,}", " ", text)


 def load_names(names_file):
    """Read the names to index from *names_file*.

    The file is decoded as UTF-8.  A leading byte-order mark is discarded:
    str.strip() does not remove it, so it would otherwise stay attached to
    the first name and keep that name from ever matching.

    Args:
        names_file: Path of a text file with one name per line.

    Returns:
        The non-blank lines in file order, each stripped of surrounding
        whitespace.
    """
    # "utf-8-sig" also reads plain UTF-8; it only drops a BOM when present.
    with open(names_file, encoding="utf-8-sig") as f:
        stripped = (line.strip() for line in f)
        return [name for name in stripped if name]


 def get_search_term(name):
    """Return the part of *name* that is looked up in the page text.

    'Forman, Milos'    ->  'Forman'   (everything before the first comma)
    'De Niro, Robert'  ->  'De Niro'
    'John Smith'       ->  'John'     (first whitespace-separated word)

    Surrounding whitespace is ignored.  An empty string is returned when
    there is nothing to search for (blank name, or nothing before the comma).
    """
    if "," in name:
        return name.split(",", 1)[0].strip()
    # split() without a separator skips leading whitespace and treats tabs
    # like spaces, so " John Smith" yields "John" instead of an empty term.
    words = name.split()
    return words[0] if words else ""
 

 def build_index(pdf_path, names, stop_page=None):
    """Find, for every name, the pages of the PDF whose text contains it.

    The search term of each name (see get_search_term) is lower-cased and
    tested as a plain substring of the normalized, lower-cased page text.

    Args:
        pdf_path: Path of the PDF file, opened with pdfplumber.
        names: Names to look up.  A name listed more than once is searched
            only once.
        stop_page: 1-based page number at which scanning stops; that page
            and all following pages are not scanned.  None (the default)
            scans the whole document.

    Returns:
        dict mapping each name to the list of 1-based page numbers, in
        ascending order, on which it was found (empty if never found).
    """
    index = {name: [] for name in names}

    # Walk the dict keys instead of `names`: a repeated name would otherwise
    # have every page number appended once per repetition.
    search_terms = []
    for name in index:
        term = get_search_term(name).lower()
        # An empty term is a substring of any text and would hit every page.
        if term:
            search_terms.append((name, term))

    with pdfplumber.open(pdf_path) as pdf:
        total = len(pdf.pages)
        for page_num, page in enumerate(pdf.pages, start=1):
            if page_num == stop_page:
                print(f'Stopping processing at {stop_page}')
                break
            # Progress is reported every 50 pages and on the last page; the
            # carriage return makes each report overwrite the previous one.
            if page_num % 50 == 0 or page_num == total:
                print(f"  Page {page_num}/{total}", end="\r")
            text = page.extract_text()
            if not text:
                continue
            text_lower = normalize_text(text).lower()
            for name, term in search_terms:
                if term in text_lower:
                    index[name].append(page_num)

    # Move past the progress line, which was printed without a newline.
    print()
    return index


 def write_output(index, names, output_path):
    """Write the finished index to *output_path* as UTF-8 text.

    One line is written per entry of *names*, in that order: the name, three
    spaces, then the page numbers from index[name] joined by ", ".  A name
    with no pages still gets a line, with nothing after the spaces.
    """
    with open(output_path, "w", encoding="utf-8") as out:
        out.writelines(
            f"{name}   {', '.join(map(str, index[name]))}\n" for name in names
        )


 def main():
    """Command-line entry point: parse arguments, build the index, write it.

    Positional arguments, in order: pdf_file, names_file, optional
    output_file (default "index.txt") and optional stop_page.  Without a
    stop page the whole PDF is scanned.  Progress and a final summary are
    printed to standard output.
    """
    parser = argparse.ArgumentParser(description="Create a name index from a PDF book.")
    parser.add_argument("pdf_file", help="Input PDF file")
    parser.add_argument("names_file", help="Text file with names, one per line")
    parser.add_argument(
        "output_file",
        nargs="?",
        default="index.txt",
        help="Output file (default: index.txt)",
    )
    # argparse only honours `default` on a positional when nargs is "?" or
    # "*"; without nargs the argument is mandatory.  type=int turns a
    # non-numeric value into a usage error instead of a traceback.
    parser.add_argument(
        "stop_page",
        nargs="?",
        type=int,
        default=None,
        help="Stop processing pdf at this page (default: scan the whole file)",
    )
    args = parser.parse_args()

    print(f"Loading names from '{args.names_file}'...")
    names = load_names(args.names_file)
    print(f"  {len(names)} names loaded.")

    stop_page = args.stop_page
    print(f"Scanning '{args.pdf_file}'...  stop page: {stop_page}")
    index = build_index(args.pdf_file, names, stop_page)

    print(f"Writing output to '{args.output_file}'...")
    write_output(index, names, args.output_file)

    # Count over `names` rather than the dict values so the two figures stay
    # consistent with len(names) even when a name is listed more than once.
    found = sum(1 for name in names if index[name])
    not_found = len(names) - found
    summary = f"Done. {found}/{len(names)} names found."
    if not_found:
        summary += f" ({not_found} not found — included with empty page list.)"
    print(summary)

 # Run the command-line interface only when this file is executed as a
 # script; importing it as a module must not start a scan.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	Name index generator: finds page numbers for names in a PDF book.

	Usage:
	python name_index.py <pdf_file> <names_file> [output_file]

	Names file format (one per line):
	John Smith -> searches for "John" (only the first word)
	Forman, Milos -> searches for "Forman" (only the part before the comma)

	Output format:
	Name page1, page2, page3
	"""

	import re
	import argparse
	import pdfplumber


	# Sequences removed from extracted text (e.g. soft hyphens, hyphen+newline).
	# Applied before the replace set so that "For-\nman" becomes "Forman",
	# not "For man".  Longer sequences come before the shorter ones they
	# contain, so a line-break hyphen is removed together with its line ending.
	REMOVE_SET = [
	    "-\r\n",      # hyphen + Windows line ending
	    "-\n",        # hyphen + Unix line ending
	    "-\r",        # hyphen + old Mac line ending
	    "\xad\r\n",   # soft hyphen + Windows line ending
	    "\xad\n",     # soft hyphen + Unix line ending
	    "\xad\r",     # soft hyphen + old Mac line ending
	    "\xad",       # Unicode soft hyphen (U+00AD) in the middle of a line
	]

	# Sequences replaced with a single space (e.g. line endings that split words).
	REPLACE_SET = [
	    "\r\n",    # Windows line ending
	    "\r",      # old Mac line ending
	    "\n",      # Unix line ending
	]


	def normalize_text(text):
	    """Flatten extracted page text so names can be found across line breaks.

	    Every sequence in REMOVE_SET is deleted first, which re-joins words that
	    were hyphenated at a line end (with a regular or a soft hyphen).  Every
	    sequence in REPLACE_SET is then turned into a space, and runs of two or
	    more spaces are collapsed into one.

	    Args:
	        text: Raw text of one page.

	    Returns:
	        The normalized text; letter case is left untouched.
	    """
	    for seq in REMOVE_SET:
	        text = text.replace(seq, "")
	    for seq in REPLACE_SET:
	        text = text.replace(seq, " ")
	    # A line ending next to an existing space leaves a run of spaces behind.
	    return re.sub(r" {2,}", " ", text)


	def load_names(names_file):
	    """Read the names to index from *names_file*.

	    The file is decoded as UTF-8.  A leading byte-order mark is discarded:
	    str.strip() does not remove it, so it would otherwise stay attached to
	    the first name and keep that name from ever matching.

	    Args:
	        names_file: Path of a text file with one name per line.

	    Returns:
	        The non-blank lines in file order, each stripped of surrounding
	        whitespace.
	    """
	    # "utf-8-sig" also reads plain UTF-8; it only drops a BOM when present.
	    with open(names_file, encoding="utf-8-sig") as f:
	        stripped = (line.strip() for line in f)
	        return [name for name in stripped if name]


	def get_search_term(name):
	    """Return the part of *name* that is looked up in the page text.

	    'Forman, Milos'    ->  'Forman'   (everything before the first comma)
	    'De Niro, Robert'  ->  'De Niro'
	    'John Smith'       ->  'John'     (first whitespace-separated word)

	    Surrounding whitespace is ignored.  An empty string is returned when
	    there is nothing to search for (blank name, or nothing before the comma).
	    """
	    if "," in name:
	        return name.split(",", 1)[0].strip()
	    # split() without a separator skips leading whitespace and treats tabs
	    # like spaces, so " John Smith" yields "John" instead of an empty term.
	    words = name.split()
	    return words[0] if words else ""


	def build_index(pdf_path, names, stop_page=None):
	    """Find, for every name, the pages of the PDF whose text contains it.

	    The search term of each name (see get_search_term) is lower-cased and
	    tested as a plain substring of the normalized, lower-cased page text.

	    Args:
	        pdf_path: Path of the PDF file, opened with pdfplumber.
	        names: Names to look up.  A name listed more than once is searched
	            only once.
	        stop_page: 1-based page number at which scanning stops; that page
	            and all following pages are not scanned.  None (the default)
	            scans the whole document.

	    Returns:
	        dict mapping each name to the list of 1-based page numbers, in
	        ascending order, on which it was found (empty if never found).
	    """
	    index = {name: [] for name in names}

	    # Walk the dict keys instead of `names`: a repeated name would otherwise
	    # have every page number appended once per repetition.
	    search_terms = []
	    for name in index:
	        term = get_search_term(name).lower()
	        # An empty term is a substring of any text and would hit every page.
	        if term:
	            search_terms.append((name, term))

	    with pdfplumber.open(pdf_path) as pdf:
	        total = len(pdf.pages)
	        for page_num, page in enumerate(pdf.pages, start=1):
	            if page_num == stop_page:
	                print(f'Stopping processing at {stop_page}')
	                break
	            # Progress is reported every 50 pages and on the last page; the
	            # carriage return makes each report overwrite the previous one.
	            if page_num % 50 == 0 or page_num == total:
	                print(f" Page {page_num}/{total}", end="\r")
	            text = page.extract_text()
	            if not text:
	                continue
	            text_lower = normalize_text(text).lower()
	            for name, term in search_terms:
	                if term in text_lower:
	                    index[name].append(page_num)

	    # Move past the progress line, which was printed without a newline.
	    print()
	    return index


	def write_output(index, names, output_path):
	    """Write the finished index to *output_path* as UTF-8 text.

	    One line is written per entry of *names*, in that order: the name, a
	    space, then the page numbers from index[name] joined by ", ".  A name
	    with no pages still gets a line, with nothing after the space.
	    """
	    with open(output_path, "w", encoding="utf-8") as f:
	        for name in names:
	            pages_str = ", ".join(str(p) for p in index[name])
	            f.write(f"{name} {pages_str}\n")


	def main():
	    """Command-line entry point: parse arguments, build the index, write it.

	    Positional arguments, in order: pdf_file, names_file, optional
	    output_file (default "index.txt") and optional stop_page.  Without a
	    stop page the whole PDF is scanned.  Progress and a final summary are
	    printed to standard output.
	    """
	    parser = argparse.ArgumentParser(description="Create a name index from a PDF book.")
	    parser.add_argument("pdf_file", help="Input PDF file")
	    parser.add_argument("names_file", help="Text file with names, one per line")
	    parser.add_argument(
	        "output_file",
	        nargs="?",
	        default="index.txt",
	        help="Output file (default: index.txt)",
	    )
	    # argparse only honours `default` on a positional when nargs is "?" or
	    # "*"; without nargs the argument is mandatory.  type=int turns a
	    # non-numeric value into a usage error instead of a traceback.
	    parser.add_argument(
	        "stop_page",
	        nargs="?",
	        type=int,
	        default=None,
	        help="Stop processing pdf at this page (default: scan the whole file)",
	    )
	    args = parser.parse_args()

	    print(f"Loading names from '{args.names_file}'...")
	    names = load_names(args.names_file)
	    print(f" {len(names)} names loaded.")

	    stop_page = args.stop_page
	    print(f"Scanning '{args.pdf_file}'... stop page: {stop_page}")
	    index = build_index(args.pdf_file, names, stop_page)

	    print(f"Writing output to '{args.output_file}'...")
	    write_output(index, names, args.output_file)

	    # Count over `names` rather than the dict values so the two figures stay
	    # consistent with len(names) even when a name is listed more than once.
	    found = sum(1 for name in names if index[name])
	    not_found = len(names) - found
	    summary = f"Done. {found}/{len(names)} names found."
	    if not_found:
	        summary += f" ({not_found} not found — included with empty page list.)"
	    print(summary)


	# Run the command-line interface only when this file is executed as a
	# script; importing it as a module must not start a scan.
	if __name__ == "__main__":
	    main()
No results found