Created
November 21, 2024 21:32
-
-
Save Ademking/2d605cb6dd8c8d79a9360ee9ac0126ab to your computer and use it in GitHub Desktop.
Convert a list of PDF documents to text files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

from pypdf import PdfReader

# Convert every ``*.pdf`` file in the current directory into a UTF-8 ``.txt``
# file with the same base name, written next to the original PDF.

# Get all PDF files in the current directory. Only regular files qualify:
# a directory that happens to be named ``something.pdf`` cannot be opened
# as a PDF.
pdf_files = [
    f for f in os.listdir('.')
    if f.endswith('.pdf') and os.path.isfile(f)
]

for pdf_file in pdf_files:
    # Create a PDF reader object
    reader = PdfReader(pdf_file)

    # Extract text from all pages. Joining once avoids growing a string with
    # ``+=`` page by page; ``or ''`` is a defensive guard so a page that
    # yields no text object cannot break the join.
    text = ''.join(page.extract_text() or '' for page in reader.pages)

    # Save the extracted text to a .txt file with the same name. Only the
    # trailing extension is swapped: str.replace() would rewrite every
    # '.pdf' in the name (e.g. 'a.pdf.v2.pdf' -> 'a.txt.v2.txt').
    txt_file = pdf_file[:-len('.pdf')] + '.txt'
    with open(txt_file, 'w', encoding='utf-8') as f:
        f.write(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment