OliPassey · October 31, 2024 12:27
diff --git a/docScanner.py b/docScanner.py
 """
 File Naming Automation Script

 This script automatically renames PDF and Word files in a specified directory based on 
 their contents. It reads the first few lines of each file, attempting to identify a 
 relevant title or key phrase, and renames the file accordingly. For PDFs, the script 
 also includes OCR to handle scanned documents that lack embedded text.

 ## Requirements
 1. Python 3.x
 2. Install the required Python libraries:
   - `pymupdf` for reading PDF text
   - `pytesseract` and `Pillow` for OCR on scanned PDFs
   - `python-docx` for reading Word documents

   Install dependencies with:
   pip install pymupdf python-docx pytesseract pillow
   
   
 3. Install Tesseract OCR:
 - **Windows**: Download from https://github.com/tesseract-ocr/tesseract/wiki and add 
  the installation path to your system PATH.
 - **macOS**: Install with `brew install tesseract`
 - **Linux** (Ubuntu/Debian): Install with `sudo apt install tesseract-ocr`

 4. (Optional) Set the `pytesseract` path if Tesseract is installed in a non-standard location:
 pytesseract.pytesseract.tesseract_cmd = r"/path/to/tesseract"

 How to Use

    Place your PDF and Word (.docx) files in the directory named by source_dir (default: ./docs).
    Run the script, which will read the contents of each file and rename it based on its extracted title or first line.
    The script automatically avoids name collisions by appending a numerical suffix to filenames if duplicates are detected.

 python docScanner.py
 """

 import os
 import fitz  # PyMuPDF for reading PDFs
 import pytesseract
 from PIL import Image
 from io import BytesIO
 from docx import Document
 import re

 # Directory scanned for PDF/Word files. The path is relative, so it is
 # resolved against the current working directory when the script runs.
 source_dir = './docs'

 # Function to extract text from PDF, including OCR for scanned pages
 def extract_text_from_pdf(file_path):
    """Return the text of the first pages of a PDF, using OCR where needed.

    At most the first three pages are read.  A page with an embedded text
    layer contributes that text directly; a page without one is rasterised
    and passed through Tesseract OCR.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated page text.  If an error occurs, a message is
        printed and whatever text was collected before the failure is
        returned (possibly an empty string).
    """
    # PyMuPDF rasterises at 72 dpi by default, which is too coarse for
    # reliable OCR; scale the page so the image is roughly 300 dpi.
    ocr_zoom = 300 / 72
    chunks = []
    try:
        with fitz.open(file_path) as pdf:
            for page_num in range(min(3, pdf.page_count)):
                page = pdf[page_num]
                page_text = page.get_text()
                if not page_text.strip():
                    # No embedded text: treat the page as a scan and OCR it.
                    pix = page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
                    img = Image.open(BytesIO(pix.tobytes("png")))
                    page_text = pytesseract.image_to_string(img)
                chunks.append(page_text)
    except Exception as e:
        # Best effort: one unreadable file must not stop the whole batch.
        print(f"Error reading PDF {file_path}: {e}")
    return "".join(chunks)

 # Function to extract text from Word document
 def extract_text_from_word(file_path):
    """Return the text of a Word (.docx) document, one paragraph per line.

    Paragraphs are separated by newlines rather than spaces so that a
    heading paragraph stays on its own line and can be told apart from the
    body text that follows it.  Paragraphs containing only whitespace are
    dropped so the first line of the result is real content.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The non-blank paragraph texts joined with newlines.  If an error
        occurs, a message is printed and whatever was collected before the
        failure is returned (possibly an empty string).
    """
    lines = []
    try:
        doc = Document(file_path)
        for paragraph in doc.paragraphs:
            if paragraph.text.strip():
                lines.append(paragraph.text)
    except Exception as e:
        # Best effort: one unreadable file must not stop the whole batch.
        print(f"Error reading Word file {file_path}: {e}")
    return "\n".join(lines)

 # Function to generate a filename based on content
 def generate_filename(text):
    """Derive a filesystem-friendly base name from extracted document text.

    The text is split on newlines and periods, and the first piece that
    still contains word characters is used.  Runs of non-word characters
    are collapsed to a single underscore, the result is capped at 50
    characters, and leading/trailing underscores are trimmed.

    Args:
        text: Text extracted from a document.

    Returns:
        The sanitised name (without extension), or "" when the text holds
        no usable characters.
    """
    # Extracted text often starts with blank lines or stray punctuation;
    # taking the first piece blindly would give an empty name and cause the
    # file to be skipped, so keep looking until a piece yields something.
    for piece in re.split(r'\n|\.', text):
        candidate = re.sub(r'\W+', '_', piece)[:50].strip("_")
        if candidate:
            return candidate
    return ""

 # Main function to rename files
 def rename_files():
    """Rename every PDF/DOCX file in ``source_dir`` after its own content.

    For each ``.pdf`` or ``.docx`` entry the text is extracted, turned into
    a base name with ``generate_filename`` and the file is renamed, keeping
    its original extension.  If the target name is taken by another file, a
    numeric suffix (``_1``, ``_2``, ...) is appended.

    Files that yield no usable name are left untouched, as are files that
    already carry their generated name, so running the script repeatedly
    does not shuffle names back and forth.  A failed rename is reported and
    the remaining files are still processed.
    """
    for filename in os.listdir(source_dir):
        file_path = os.path.join(source_dir, filename)
        if filename.lower().endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
        elif filename.lower().endswith('.docx'):
            text = extract_text_from_word(file_path)
        else:
            continue  # Neither PDF nor DOCX

        base_name = generate_filename(text)
        if not base_name:
            continue  # Nothing usable to name the file after

        extension = os.path.splitext(filename)[1]
        new_name = base_name + extension
        counter = 1

        # Look for a free name.  Reaching the file's own current name counts
        # as free: otherwise the file would collide with itself and pick up
        # a needless suffix on every run.
        while new_name != filename and os.path.exists(os.path.join(source_dir, new_name)):
            new_name = f"{base_name}_{counter}{extension}"
            counter += 1

        if new_name == filename:
            continue  # Already named correctly

        try:
            os.rename(file_path, os.path.join(source_dir, new_name))
        except OSError as e:
            # Report and carry on, like the extract_* helpers do.
            print(f"Error renaming '{filename}': {e}")
            continue
        print(f"Renamed '{filename}' to '{new_name}'")

 # Entry point: this call sits at module level, so the renaming pass starts
 # as soon as the file is executed (or imported).
 rename_files()
	"""
	File Naming Automation Script

	This script automatically renames PDF and Word files in a specified directory based on
	their contents. It reads the first few lines of each file, attempting to identify a
	relevant title or key phrase, and renames the file accordingly. For PDFs, the script
	also includes OCR to handle scanned documents that lack embedded text.

	## Requirements
	1. Python 3.x
	2. Install the required Python libraries:
	- `pymupdf` for reading PDF text
	- `pytesseract` and `Pillow` for OCR on scanned PDFs
	- `python-docx` for reading Word documents

	Install dependencies with:
	pip install pymupdf python-docx pytesseract pillow


	3. Install Tesseract OCR:
	- Windows: Download from https://github.com/tesseract-ocr/tesseract/wiki and add
	the installation path to your system PATH.
	- macOS: Install with `brew install tesseract`
	- Linux (Ubuntu/Debian): Install with `sudo apt install tesseract-ocr`

	4. (Optional) Set the `pytesseract` path if Tesseract is installed in a non-standard location:
	pytesseract.pytesseract.tesseract_cmd = r"/path/to/tesseract"

	How to Use

	Place your PDF and Word (.docx) files in the directory named by source_dir (default: ./docs).
	Run the script, which will read the contents of each file and rename it based on its extracted title or first line.
	The script automatically avoids name collisions by appending a numerical suffix to filenames if duplicates are detected.

	python docScanner.py
	"""

	import os
	import fitz # PyMuPDF for reading PDFs
	import pytesseract
	from PIL import Image
	from io import BytesIO
	from docx import Document
	import re

	# Directory scanned for PDF/Word files. The path is relative, so it is
	# resolved against the current working directory when the script runs.
	source_dir = './docs'

	# Function to extract text from PDF, including OCR for scanned pages
	def extract_text_from_pdf(file_path):
	    """Return the text of the first pages of a PDF, using OCR where needed.

	    At most the first three pages are read.  A page with an embedded text
	    layer contributes that text directly; a page without one is rasterised
	    and passed through Tesseract OCR.

	    Args:
	        file_path: Path of the PDF file to read.

	    Returns:
	        The concatenated page text.  If an error occurs, a message is
	        printed and whatever text was collected before the failure is
	        returned (possibly an empty string).
	    """
	    # PyMuPDF rasterises at 72 dpi by default, which is too coarse for
	    # reliable OCR; scale the page so the image is roughly 300 dpi.
	    ocr_zoom = 300 / 72
	    chunks = []
	    try:
	        with fitz.open(file_path) as pdf:
	            for page_num in range(min(3, pdf.page_count)):
	                page = pdf[page_num]
	                page_text = page.get_text()
	                if not page_text.strip():
	                    # No embedded text: treat the page as a scan and OCR it.
	                    pix = page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
	                    img = Image.open(BytesIO(pix.tobytes("png")))
	                    page_text = pytesseract.image_to_string(img)
	                chunks.append(page_text)
	    except Exception as e:
	        # Best effort: one unreadable file must not stop the whole batch.
	        print(f"Error reading PDF {file_path}: {e}")
	    return "".join(chunks)

	# Function to extract text from Word document
	def extract_text_from_word(file_path):
	    """Return the text of a Word (.docx) document, one paragraph per line.

	    Paragraphs are separated by newlines rather than spaces so that a
	    heading paragraph stays on its own line and can be told apart from the
	    body text that follows it.  Paragraphs containing only whitespace are
	    dropped so the first line of the result is real content.

	    Args:
	        file_path: Path of the .docx file to read.

	    Returns:
	        The non-blank paragraph texts joined with newlines.  If an error
	        occurs, a message is printed and whatever was collected before the
	        failure is returned (possibly an empty string).
	    """
	    lines = []
	    try:
	        doc = Document(file_path)
	        for paragraph in doc.paragraphs:
	            if paragraph.text.strip():
	                lines.append(paragraph.text)
	    except Exception as e:
	        # Best effort: one unreadable file must not stop the whole batch.
	        print(f"Error reading Word file {file_path}: {e}")
	    return "\n".join(lines)

	# Function to generate a filename based on content
	def generate_filename(text):
	    """Derive a filesystem-friendly base name from extracted document text.

	    The text is split on newlines and periods, and the first piece that
	    still contains word characters is used.  Runs of non-word characters
	    are collapsed to a single underscore, the result is capped at 50
	    characters, and leading/trailing underscores are trimmed.

	    Args:
	        text: Text extracted from a document.

	    Returns:
	        The sanitised name (without extension), or "" when the text holds
	        no usable characters.
	    """
	    # The pipe must stay unescaped: the pattern means "a newline OR a
	    # period".  With an escaped pipe it would only match the literal
	    # sequence newline, "|", any character, and never split ordinary text.
	    #
	    # Extracted text often starts with blank lines or stray punctuation, so
	    # keep looking until a piece yields a non-empty name instead of taking
	    # the first piece blindly.
	    for piece in re.split(r'\n|\.', text):
	        candidate = re.sub(r'\W+', '_', piece)[:50].strip("_")
	        if candidate:
	            return candidate
	    return ""

	# Main function to rename files
	def rename_files():
	    """Rename every PDF/DOCX file in ``source_dir`` after its own content.

	    For each ``.pdf`` or ``.docx`` entry the text is extracted, turned into
	    a base name with ``generate_filename`` and the file is renamed, keeping
	    its original extension.  If the target name is taken by another file, a
	    numeric suffix (``_1``, ``_2``, ...) is appended.

	    Files that yield no usable name are left untouched, as are files that
	    already carry their generated name, so running the script repeatedly
	    does not shuffle names back and forth.  A failed rename is reported and
	    the remaining files are still processed.
	    """
	    for filename in os.listdir(source_dir):
	        file_path = os.path.join(source_dir, filename)
	        if filename.lower().endswith('.pdf'):
	            text = extract_text_from_pdf(file_path)
	        elif filename.lower().endswith('.docx'):
	            text = extract_text_from_word(file_path)
	        else:
	            continue  # Neither PDF nor DOCX

	        base_name = generate_filename(text)
	        if not base_name:
	            continue  # Nothing usable to name the file after

	        extension = os.path.splitext(filename)[1]
	        new_name = base_name + extension
	        counter = 1

	        # Look for a free name.  Reaching the file's own current name counts
	        # as free: otherwise the file would collide with itself and pick up
	        # a needless suffix on every run.
	        while new_name != filename and os.path.exists(os.path.join(source_dir, new_name)):
	            new_name = f"{base_name}_{counter}{extension}"
	            counter += 1

	        if new_name == filename:
	            continue  # Already named correctly

	        try:
	            os.rename(file_path, os.path.join(source_dir, new_name))
	        except OSError as e:
	            # Report and carry on, like the extract_* helpers do.
	            print(f"Error renaming '{filename}': {e}")
	            continue
	        print(f"Renamed '{filename}' to '{new_name}'")

	# Entry point: this call sits at module level, so the renaming pass starts
	# as soon as the file is executed (or imported).
	rename_files()