Created
February 7, 2021 19:17
-
-
Save KoStard/ccc41211615ea89b172c182976660861 to your computer and use it in GitHub Desktop.
OCR Armenian text from PDF and generate PDF from results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
OCR Armenian text from a PDF and generate a PDF from the results.

Usage:
    python3 convert.py input_file.pdf output_file.pdf
"""
import pdf2image | |
import io | |
from PyPDF2 import PdfFileReader, PdfFileWriter | |
import sys | |
try: | |
from PIL import Image | |
except ImportError: | |
import Image | |
import pytesseract | |
def pdf_to_img(pdf_file_path):
    """Render the PDF at ``pdf_file_path`` into page images.

    Delegates to ``pdf2image.convert_from_path`` with its default settings
    and hands back that call's result unchanged (the caller iterates over
    it, one item per page).
    """
    rendered_pages = pdf2image.convert_from_path(pdf_file_path)
    return rendered_pages
def ocr_core(file, lang='hye'):
    """Run Tesseract OCR on one page image and return the result as PDF bytes.

    Args:
        file: The page image to recognise; passed straight through to
            ``pytesseract.image_to_pdf_or_hocr``.
        lang: Tesseract language code to use for recognition. Defaults to
            ``'hye'`` (Armenian), which was previously hard-coded.

    Returns:
        The bytes of a PDF document produced by Tesseract for this image.
    """
    # extension='pdf' is pytesseract's default; spelled out so the output
    # format does not silently depend on the library default.
    return pytesseract.image_to_pdf_or_hocr(file, lang=lang, extension='pdf')
def print_pages(pdf_file_path, output_stream):
    """OCR every page of a PDF and write the merged result to a stream.

    Args:
        pdf_file_path: Path of the input PDF to rasterise and recognise.
        output_stream: Writable binary stream that receives the combined PDF.

    Progress is reported on stdout; page numbers in the messages are
    zero-based.
    """
    print("Getting images")
    page_images = pdf_to_img(pdf_file_path)
    merged = PdfFileWriter()
    for index, page_image in enumerate(page_images):
        print(f"Processing page {index}")
        # Each OCR call yields a standalone PDF; only its first page is taken.
        single_page_pdf = PdfFileReader(io.BytesIO(ocr_core(page_image)))
        merged.addPage(single_page_pdf.getPage(0))
    # Written once, after all pages have been collected.
    merged.write(output_stream)
def main(argv=None):
    """Command-line entry point: OCR ``argv[1]`` and write the PDF to ``argv[2]``.

    Args:
        argv: Full argument vector including the program name. Defaults to
            ``sys.argv`` when omitted.

    Returns:
        0 on success, 2 when the input or output path is missing.
    """
    args = sys.argv if argv is None else argv
    # Fewer than two paths used to surface as a bare IndexError; report usage instead.
    if len(args) < 3:
        prog = args[0] if args else 'convert.py'
        print(f"usage: python3 {prog} input_file.pdf output_file.pdf", file=sys.stderr)
        return 2
    with open(args[2], 'wb') as f:
        print_pages(args[1], f)
    return 0


# Guarded so that importing this module does not start a conversion.
if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment