Created
October 31, 2024 12:27
-
-
Save OliPassey/9b0de9d8cad20cfd19f73333985afebb to your computer and use it in GitHub Desktop.
PDF & Document Scanner
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
File Naming Automation Script

This script automatically renames PDF and Word files in a specified directory based on
their contents. It reads the first few lines of each file, attempting to identify a
relevant title or key phrase, and renames the file accordingly. For PDFs, the script
also includes OCR to handle scanned documents that lack embedded text.

## Requirements

1. Python 3.x
2. Install the required Python libraries:
   - `pymupdf` for reading PDF text
   - `pytesseract` and `Pillow` for OCR on scanned PDFs
   - `python-docx` for reading Word documents

   Install dependencies with:

       pip install pymupdf python-docx pytesseract pillow

3. Install Tesseract OCR:
   - **Windows**: Download from https://github.com/tesseract-ocr/tesseract/wiki and add
     the installation path to your system PATH.
   - **macOS**: Install with `brew install tesseract`
   - **Linux** (Ubuntu/Debian): Install with `sudo apt install tesseract-ocr`
4. (Optional) Set the `pytesseract` path if Tesseract is installed in a non-standard location:

       pytesseract.pytesseract.tesseract_cmd = r"/path/to/tesseract"

## How to Use

1. Place your PDF and Word (`.docx`) files in the directory specified by `source_dir`.
2. Run the script, which will read the contents of each file and rename it based on its
   extracted title or first line.
3. The script automatically avoids name collisions by appending a numerical suffix to
   filenames if duplicates are detected.

Run it with:

    python docScanner.py
""" | |
import os | |
import fitz # PyMuPDF for reading PDFs | |
import pytesseract | |
from PIL import Image | |
from io import BytesIO | |
from docx import Document | |
import re | |
source_dir = './docs'  # Directory whose .pdf/.docx files rename_files() scans and renames
def extract_text_from_pdf(file_path, max_pages=3, ocr_dpi=300):
    """Return the text of the leading pages of a PDF, using OCR where needed.

    Pages that carry an embedded text layer are read directly. Pages
    without one (typically scans) are rendered to an image and passed to
    Tesseract through ``pytesseract``.

    Args:
        file_path: Path of the PDF file to read.
        max_pages: Maximum number of leading pages to inspect.
        ocr_dpi: Resolution, in dots per inch, at which a page is
            rasterised before OCR.

    Returns:
        The concatenated text of the inspected pages. If reading fails
        part-way, the text collected before the failure is returned (an
        empty string if nothing was read); the error is printed, not raised.
    """
    chunks = []
    try:
        with fitz.open(file_path) as pdf:
            # PDF user space is 72 units per inch, so scaling by this factor
            # renders the page at roughly ``ocr_dpi`` instead of the default
            # 72 dpi, which is too coarse for reliable OCR.
            zoom = ocr_dpi / 72
            for page_num in range(min(max_pages, pdf.page_count)):
                page = pdf[page_num]
                page_text = page.get_text()
                if page_text.strip():
                    # The page has an embedded text layer; no OCR needed.
                    chunks.append(page_text)
                    continue
                # No embedded text: rasterise the page and run OCR on it.
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                with Image.open(BytesIO(pix.tobytes("png"))) as img:
                    chunks.append(pytesseract.image_to_string(img))
    except Exception as e:
        # Deliberately broad: one unreadable file (corrupt PDF, missing
        # Tesseract binary, ...) must not abort a batch run.
        print(f"Error reading PDF {file_path}: {e}")
    return "".join(chunks)
def extract_text_from_word(file_path):
    """Return the text of a Word (.docx) document, one paragraph per line.

    Paragraphs are separated by newlines so that paragraph boundaries
    survive and a caller splitting on line breaks can isolate the title
    paragraph. Blank paragraphs are skipped so the result starts with
    real content.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The non-blank paragraphs joined by ``"\\n"``. If reading fails, the
        paragraphs collected before the failure are returned (an empty
        string if nothing was read); the error is printed, not raised.
    """
    paragraphs = []
    try:
        doc = Document(file_path)
        for paragraph in doc.paragraphs:
            paragraph_text = paragraph.text
            if paragraph_text.strip():
                paragraphs.append(paragraph_text)
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort a batch run.
        print(f"Error reading Word file {file_path}: {e}")
    return "\n".join(paragraphs)
def generate_filename(text, max_length=50):
    """Derive a filesystem-friendly base name from the start of ``text``.

    The text is split on newlines and periods. The first segment that
    still contains word characters is used: every run of non-word
    characters becomes a single underscore, the result is cut to
    ``max_length`` characters, and leading/trailing underscores are removed.

    Args:
        text: Extracted document text.
        max_length: Maximum length of the generated name before
            surrounding underscores are stripped.

    Returns:
        The generated base name (without extension), or an empty string
        when no segment of ``text`` contains a usable word character.
    """
    for segment in re.split(r'\n|\.', text):
        candidate = re.sub(r'\W+', '_', segment)[:max_length].strip("_")
        # Skip blank or punctuation-only segments (e.g. a leading empty
        # line) instead of giving up on the whole document.
        if candidate:
            return candidate
    return ""
def rename_files(directory=None):
    """Rename every PDF and .docx file in a directory after its content.

    For each regular file ending in ``.pdf`` or ``.docx`` (case-insensitive)
    the text is extracted, a base name is generated from it, and the file
    is renamed in place, keeping its original extension. If the target
    name is taken by another file, ``_1``, ``_2``, ... is appended until a
    free name is found. Files for which no name can be generated, and
    files that already carry their generated name, are left untouched.

    Args:
        directory: Directory to process. Defaults to the module-level
            ``source_dir``.

    Raises:
        OSError: If ``directory`` cannot be listed. Failures to rename an
            individual file are printed and do not stop the run.
    """
    if directory is None:
        directory = source_dir

    # Sorted so that suffix assignment is deterministic across runs.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        lowered = filename.lower()
        is_pdf = lowered.endswith('.pdf')
        is_docx = lowered.endswith('.docx')
        if not (is_pdf or is_docx):
            continue  # Skip files that are not PDFs or DOCX
        if not os.path.isfile(file_path):
            continue  # Skip directories that merely look like documents

        if is_pdf:
            text = extract_text_from_pdf(file_path)
        else:
            text = extract_text_from_word(file_path)

        base_name = generate_filename(text)
        if not base_name:
            continue  # Skip if no name could be generated

        extension = os.path.splitext(filename)[1]
        new_name = base_name + extension
        counter = 1
        while True:
            new_file_path = os.path.join(directory, new_name)
            if not os.path.exists(new_file_path):
                break
            # A "collision" with the file itself is not a collision;
            # without this check a correctly named file would be renamed
            # to "<name>_1" and flip back on the next run.
            if os.path.samefile(new_file_path, file_path):
                break
            new_name = f"{base_name}_{counter}{extension}"
            counter += 1

        if new_name == filename:
            continue  # Already carries its content-derived name

        try:
            os.rename(file_path, new_file_path)
        except OSError as e:
            # Keep going: one locked or read-only file should not stop
            # the rest of the batch.
            print(f"Error renaming '{filename}': {e}")
            continue
        print(f"Renamed '{filename}' to '{new_name}'")
# Run the renaming only when executed as a script, so that importing this
# module (e.g. to reuse the extraction helpers) does not rename any files.
if __name__ == "__main__":
    rename_files()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment