Skip to content

Instantly share code, notes, and snippets.

@OliPassey
Created October 31, 2024 12:27
Show Gist options
  • Save OliPassey/9b0de9d8cad20cfd19f73333985afebb to your computer and use it in GitHub Desktop.
Save OliPassey/9b0de9d8cad20cfd19f73333985afebb to your computer and use it in GitHub Desktop.
PDF & Document Scanner
"""
File Naming Automation Script
This script automatically renames PDF and Word (.docx) files in a specified directory based on
their contents. It extracts the text of each file (the first three pages of a PDF, or the
paragraphs of a Word document), takes the first line or sentence as a title, and renames the
file accordingly. For PDFs, pages that lack embedded text (e.g. scans) are run through OCR.
## Requirements
1. Python 3.x
2. Install the required Python libraries:
- `pymupdf` for reading PDF text
- `pytesseract` and `Pillow` for OCR on scanned PDFs
- `python-docx` for reading Word documents
Install dependencies with:
pip install pymupdf python-docx pytesseract pillow
3. Install Tesseract OCR:
- **Windows**: Download from https://github.com/tesseract-ocr/tesseract/wiki and add
the installation path to your system PATH.
- **macOS**: Install with `brew install tesseract`
- **Linux** (Ubuntu/Debian): Install with `sudo apt install tesseract-ocr`
4. (Optional) Set the `pytesseract` path if Tesseract is installed in a non-standard location:
pytesseract.pytesseract.tesseract_cmd = r"/path/to/tesseract"
## How to Use
Place your PDF and Word files in the specified source_dir directory.
Run the script, which will read the contents of each file and rename it based on its extracted title or first line.
The script automatically avoids name collisions by appending a numerical suffix to filenames if duplicates are detected.
python docScanner.py
"""
import os
import fitz # PyMuPDF for reading PDFs
import pytesseract
from PIL import Image
from io import BytesIO
from docx import Document
import re
source_dir = './docs' # Directory scanned for .pdf/.docx files; relative to the current working directory
# Function to extract text from PDF, including OCR for scanned pages
def extract_text_from_pdf(file_path, max_pages=3, ocr_dpi=300):
    """Return the text of the first pages of a PDF, using OCR where needed.

    Each of the first ``max_pages`` pages is read with PyMuPDF. If a page has
    no embedded text (e.g. it is a scan), the page is rasterised and passed
    to Tesseract instead.

    Args:
        file_path: Path of the PDF file to read.
        max_pages: Maximum number of leading pages to inspect.
        ocr_dpi: Resolution used when rasterising a page for OCR. PyMuPDF
            renders at 72 dpi by default, which is too coarse for reliable
            character recognition, so a higher value is used here.

    Returns:
        The concatenated page texts. If an error occurs, a message is
        printed and whatever text was collected before the error is
        returned (possibly an empty string).
    """
    chunks = []
    try:
        with fitz.open(file_path) as pdf:
            # PDF user space is 72 units per inch, so scaling by ocr_dpi / 72
            # renders the page at the requested resolution.
            zoom = ocr_dpi / 72
            for page_num in range(min(max_pages, pdf.page_count)):
                page = pdf[page_num]
                page_text = page.get_text()
                if page_text.strip():
                    # Embedded text is available; no OCR needed for this page.
                    chunks.append(page_text)
                    continue
                # No embedded text: rasterise the page and run OCR on the image.
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                img = Image.open(BytesIO(pix.tobytes("png")))
                chunks.append(pytesseract.image_to_string(img))
    except Exception as e:
        # Best effort: report the problem and keep the text gathered so far.
        print(f"Error reading PDF {file_path}: {e}")
    return "".join(chunks)
# Function to extract text from Word document
def extract_text_from_word(file_path):
    """Return the paragraph text of a Word (.docx) document.

    Non-blank paragraphs are joined with newlines so that paragraph
    boundaries survive; this lets a title paragraph be recognised as its
    own line instead of running into the body text that follows it.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The document's non-blank paragraphs separated by newlines, or an
        empty string if the file could not be read (a message is printed).
    """
    try:
        doc = Document(file_path)
        paragraphs = [paragraph.text for paragraph in doc.paragraphs]
    except Exception as e:
        # Best effort: report the problem and let the caller skip the file.
        print(f"Error reading Word file {file_path}: {e}")
        return ""
    # Blank paragraphs are dropped so the text starts with real content.
    return "\n".join(p for p in paragraphs if p.strip())
# Function to generate a filename based on content
def generate_filename(text):
    """Derive a filesystem-friendly base name from the start of ``text``.

    The text is split on line breaks and full stops. The first segment that
    still contains word characters is used: runs of non-word characters are
    replaced by a single underscore, the result is cut to 50 characters,
    and leading/trailing underscores are removed.

    Segments that are blank or consist only of punctuation are skipped, so
    leading blank lines (common in PDF and OCR output) do not produce an
    empty name.

    Args:
        text: Extracted document text.

    Returns:
        The generated base name without extension, or an empty string if no
        segment of ``text`` yields a usable name.
    """
    for segment in re.split(r'\n|\.', text):
        candidate = re.sub(r'\W+', '_', segment)[:50].strip("_")
        if candidate:
            return candidate
    return ""
# Main function to rename files
def rename_files(directory=None):
    """Rename each PDF and DOCX file in a directory after its extracted title.

    For every ``.pdf`` or ``.docx`` file (extension matched case-insensitively)
    the text is extracted, a base name is generated from it, and the file is
    renamed to ``<base name><original extension>``. If that name is taken by
    another file, ``_1``, ``_2``, ... is appended until a free name is found.

    Files are skipped when no name can be generated, or when the file already
    carries the name it would be given (so running the script again does not
    keep renaming the same files). A failed rename is reported and the
    remaining files are still processed.

    Args:
        directory: Directory to process. Defaults to the module-level
            ``source_dir`` when omitted.
    """
    if directory is None:
        directory = source_dir

    # Sorted so that numeric suffixes are assigned in a reproducible order.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        lowered = filename.lower()
        if lowered.endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
        elif lowered.endswith('.docx'):
            text = extract_text_from_word(file_path)
        else:
            continue  # Skip files that are not PDFs or DOCX

        base_name = generate_filename(text)
        if not base_name:
            continue  # Skip if no name could be generated

        extension = os.path.splitext(filename)[1]
        new_name = base_name + extension
        counter = 1
        # Look for a free name, but stop if the candidate is the file itself:
        # otherwise an already well-named file would "collide" with itself
        # and receive a pointless numeric suffix.
        while new_name != filename and os.path.exists(os.path.join(directory, new_name)):
            new_name = f"{base_name}_{counter}{extension}"
            counter += 1
        if new_name == filename:
            continue  # Already has the intended name

        try:
            os.rename(file_path, os.path.join(directory, new_name))
        except OSError as e:
            # Report and carry on so one bad file does not abort the batch.
            print(f"Error renaming '{filename}': {e}")
            continue
        print(f"Renamed '{filename}' to '{new_name}'")
# Run the renaming function on the default source directory.
# NOTE(review): this call is not guarded by `if __name__ == "__main__":`, so it
# also runs whenever this module is imported.
rename_files()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment