Skip to content

Instantly share code, notes, and snippets.

@llermaly
Created April 13, 2021 05:17
Show Gist options
  • Save llermaly/f6aa0a351b74b98901e0fdd7e4fc7dae to your computer and use it in GitHub Desktop.
Save llermaly/f6aa0a351b74b98901e0fdd7e4fc7dae to your computer and use it in GitHub Desktop.
import csv
import glob
import json
import os
import re
import sys
from datetime import datetime

import cv2
import numpy as np
import pytesseract
# Test an image set with all Tesseract page segmentation modes
# Put your testing files under the images folder
# Define the columns you want to test and then the regexes to grab those fields
# Output is a CSV file
# Page segmentation modes:
# 0 Orientation and script detection (OSD) only.
# 1 Automatic page segmentation with OSD.
# 2 Automatic page segmentation, but no OSD, or OCR. (not implemented)
# 3 Fully automatic page segmentation, but no OSD. (Default)
# 4 Assume a single column of text of variable sizes.
# 5 Assume a single uniform block of vertically aligned text.
# 6 Assume a single uniform block of text.
# 7 Treat the image as a single text line.
# 8 Treat the image as a single word.
# 9 Treat the image as a single word in a circle.
# 10 Treat the image as a single character.
# 11 Sparse text. Find as much text as possible in no particular order.
# 12 Sparse text with OSD.
# 13 Raw line. Treat the image as a single text line,
# bypassing hacks that are Tesseract-specific.
def getHeaders():
    """Build the header row of the output CSV.

    Returns:
        list[str]: ``'file'`` followed by an ``invoice_psm_<n>`` /
        ``verification_psm_<n>`` pair for every page segmentation mode
        from 0 to 13 except 2.
    """
    headers = ['file']
    for n in range(14):
        # skip mode 2 because it does not support OCR
        if n == 2:
            continue
        # here you can add more fields to extract
        headers.append('invoice_psm_' + str(n))
        headers.append('verification_psm_' + str(n))
    return headers
def getRow(file):
    """Run OCR on one image with every supported PSM and extract the fields.

    The image is converted to grayscale and binarized with a fixed threshold
    before being passed to Tesseract once per page segmentation mode.

    Args:
        file: Path of the image to process.

    Returns:
        list[str]: The image's base file name followed by, for each page
        segmentation mode from 0 to 13 except 2, the first invoice match and
        the first verification match in the OCR text ("" when there is no
        match). The column order matches getHeaders().

    Raises:
        ValueError: If OpenCV cannot read ``file`` as an image.
    """
    # basename copes with any directory depth and with the platform's path
    # separator, unlike splitting on "/" and taking the second component
    name = os.path.basename(file)

    image = cv2.imread(file)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising
        raise ValueError("Could not read image file: " + file)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # fixed-threshold binarization: pixels above 128 become 255, the rest 0
    _, im_bw = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY)

    # here you can add more fields to extract
    invoice_re = re.compile(r"[0-9]{8,12}")
    # NOTE(review): the lookaheads only require a letter and a digit somewhere
    # from the match start to the end of the line, not necessarily inside the
    # 12 matched characters -- confirm this is intended
    verification_re = re.compile(r"(?=.*[A-Za-z])(?=.*\d)[A-Za-z\d]{12}")

    columns = [name]
    for n in range(14):
        # skip mode 2 because it does not support OCR
        if n == 2:
            continue
        # mode 0 is OSD only, so no --psm flag is passed for it and
        # Tesseract falls back to its default segmentation mode
        psm = "--psm " + str(n) if n > 0 else ""
        config_str = "-l eng " + psm
        print(str(datetime.now()) + " Trying extraction of file " +
              name + " with " + config_str)
        text = pytesseract.image_to_string(im_bw, config=config_str)
        invoice = invoice_re.search(text)
        verification = verification_re.search(text)
        columns.append(invoice.group() if invoice else "")
        columns.append(verification.group() if verification else "")
    return columns
def start():
    """Process every images/*.jpg file and write the results to a CSV file.

    The CSV is created in the current working directory and named
    ``<timestamp> folios_check.csv``. It contains the getHeaders() row
    followed by one getRow() row per image. Start and end times are printed.
    """
    print("start: " + str(datetime.now()))
    # str(datetime.now()) contains ":" which is not allowed in Windows file
    # names, so build the timestamp from filesystem-safe characters only
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    with open(stamp + ' folios_check.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(getHeaders())
        # glob yields files in arbitrary order; sort for reproducible output
        for filepath in sorted(glob.iglob('images/*.jpg')):
            writer.writerow(getRow(filepath))
    print("end: " + str(datetime.now()))
# Run the benchmark only when executed as a script, so that importing this
# module does not trigger the whole OCR job as a side effect.
if __name__ == "__main__":
    start()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment