Skip to content

Instantly share code, notes, and snippets.

@Su-s
Created December 13, 2018 12:13
Show Gist options
  • Save Su-s/ea53d8ce43fe85665e5ee38f5b1f26bf to your computer and use it in GitHub Desktop.
Save Su-s/ea53d8ce43fe85665e5ee38f5b1f26bf to your computer and use it in GitHub Desktop.
import io
from PIL import Image
from fpdf import FPDF
from wand.image import Image as wi
import cv2
import numpy as np
import tesserocr as tr
import os
# Script: render each page of INPUT_PDF to an image, draw a green rectangle
# around every text line Tesseract detects, and bundle the annotated pages
# into OUTPUT_PDF.
INPUT_PDF = "abc.pdf"
OUTPUT_PDF = "processed.pdf"
RENDER_DPI = 300

api = tr.PyTessBaseAPI()
try:
    # Rasterise every page of the PDF into an in-memory JPEG blob. The Wand
    # images are closed as soon as their blobs have been extracted.
    imageBlobs = []
    with wi(filename=INPUT_PDF, resolution=RENDER_DPI) as source_pdf:
        with source_pdf.convert('jpeg') as pdfImage:
            for img in pdfImage.sequence:
                with wi(image=img) as imgPage:
                    imageBlobs.append(imgPage.make_blob('jpeg'))

    # Files written by this run, in page order. Only these are embedded in
    # the output PDF and removed afterwards, so unrelated PNGs that happen
    # to live in the working directory are never touched.
    page_files = []
    for count, imgBlob in enumerate(imageBlobs):
        page = Image.open(io.BytesIO(imgBlob))
        api.SetImage(page)
        boxes = api.GetComponentImages(tr.RIL.TEXTLINE, True)

        # PIL yields RGB (or L/CMYK) data while OpenCV writes arrays as BGR;
        # normalise to 3-channel RGB first, then swap the channel order so
        # the saved page keeps its original colours.
        cv_img = cv2.cvtColor(np.array(page.convert('RGB')),
                              cv2.COLOR_RGB2BGR)
        for _, box, _, _ in boxes:
            x, y, w, h = box['x'], box['y'], box['w'], box['h']
            cv2.rectangle(cv_img, (x, y), (x + w, y + h), color=(0, 255, 0))

        fname = 'result' + str(count) + '.png'
        # cv2.imwrite reports failure through its return value, not an
        # exception, so check it explicitly.
        if not cv2.imwrite(fname, cv_img):
            raise OSError('could not write ' + fname)
        page_files.append(fname)

    # Assemble the annotated pages into a single PDF, one image per page.
    out_pdf = FPDF()
    for fname in page_files:
        out_pdf.add_page()
        out_pdf.image(fname, 3, 3, 204)
    out_pdf.output(OUTPUT_PDF)

    # Remove the intermediate PNGs only once the PDF has been written.
    for fname in page_files:
        os.remove(fname)
except Exception as e:
    # Top-level boundary of the script: report the failure and fall through
    # to the Tesseract cleanup below.
    print(e)
finally:
    api.End()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment