Skip to content

Instantly share code, notes, and snippets.

@saeed9321
Created September 16, 2024 09:48
Show Gist options
  • Save saeed9321/bef2bdda8c7b93d7128ccb62861a23d4 to your computer and use it in GitHub Desktop.
Save saeed9321/bef2bdda8c7b93d7128ccb62861a23d4 to your computer and use it in GitHub Desktop.
Text extraction from PDF using 3 different methods
import os
import tempfile

import easyocr
import fitz
import pypdf
import pytesseract
# Per-user install location of the Tesseract binary on Windows
# (<home>\AppData\Local\Programs\Tesseract-OCR\tesseract.exe). The home
# directory is resolved at runtime instead of relying on a "{USER_NAME}"
# placeholder, which is never substituted inside a plain raw string.
_TESSERACT_EXE = os.path.join(
    os.path.expanduser("~"),
    "AppData",
    "Local",
    "Programs",
    "Tesseract-OCR",
    "tesseract.exe",
)

# Only override pytesseract's command when that binary really exists, so
# other setups keep pytesseract's default command lookup.
if os.path.isfile(_TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_EXE

# PDF document processed when the script is run.
FILE_PATH = "2024-V4.pdf"
def readPdf(pdf_path):
    """Print the text extracted from the first page of a PDF.

    The text is read with pypdf's ``extract_text``; no OCR is involved.
    Any failure (unreadable file, document without pages, ...) is reported
    on stdout together with its cause instead of being raised.

    Args:
        pdf_path: Path of the PDF document to read.
    """
    try:
        content = pypdf.PdfReader(pdf_path)
        first_page = content.pages[0]
        print(first_page.extract_text())
    # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
    # and SystemExit still propagate; the cause is included in the message
    # so the failure can actually be diagnosed.
    except Exception as exc:
        print(f"There was a problem in reading PDF file: {exc}")
def ocrPdf(pdf_path, method):
    """Run OCR over every page of a PDF and print the recognized text.

    Each page is rendered to a PNG inside a private temporary directory,
    passed to the selected OCR engine in page order, and the combined text
    of all pages is printed to stdout.

    Args:
        pdf_path: Path of the PDF document to process.
        method: OCR backend to use, either ``"easyocr"`` or
            ``"pytesseract"``. Any other value prints a message and
            returns before any rendering is done.
    """
    if method not in ("easyocr", "pytesseract"):
        print(f"Unsupported OCR method: {method!r}")
        return

    # Build the EasyOCR reader only when it will be used, so the
    # pytesseract path does not pay for constructing it.
    reader = easyocr.Reader(["en"], True) if method == "easyocr" else None

    pieces = []
    # A private temporary directory keeps the page images apart from any
    # unrelated files in the working directory and is removed even when
    # rendering or OCR raises.
    with tempfile.TemporaryDirectory(prefix="ocr-image-test-") as workdir:
        with fitz.open(pdf_path) as pdf:
            # Each page is recognized right after it is rendered, which
            # keeps the output in page order (a directory listing gives no
            # such guarantee).
            for index, page in enumerate(pdf.pages()):
                image_path = os.path.join(
                    workdir, f"ocr-image-test-{index}.png"
                )
                page.get_pixmap().pil_save(image_path)

                if method == "easyocr":
                    content = reader.readtext(image_path, detail=False)
                    pieces.append(f' {" ".join(content)}')
                else:
                    pieces.append(pytesseract.image_to_string(image_path))

    print("".join(pieces))
def main(pdf_file):
    """Extract and print the text of ``pdf_file`` using EasyOCR.

    When the path does not exist, a message is printed and nothing else
    happens.

    Args:
        pdf_file: Path of the PDF document to process.
    """
    # Alternative extraction methods available in this file:
    # readPdf(pdf_file) and ocrPdf(pdf_file, "pytesseract").
    if os.path.exists(pdf_file):
        ocrPdf(pdf_file, "easyocr")
    else:
        print("CV file does not exist")
# Run the extraction only when executed as a script, so importing this
# module does not trigger an OCR run as a side effect.
if __name__ == "__main__":
    main(FILE_PATH)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment