Created
April 13, 2021 05:17
-
-
Save llermaly/f6aa0a351b74b98901e0fdd7e4fc7dae to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import cv2 | |
import pytesseract | |
import re | |
import json | |
import numpy as np | |
import glob | |
import csv | |
from datetime import datetime | |
# Test a set of images against every Tesseract page segmentation mode (PSM).
# Put your test files in the "images" folder.
# Define the columns you want to test, then the regex that extracts each field.
# The output is a CSV file.
# Page segmentation modes: | |
# 0 Orientation and script detection (OSD) only. | |
# 1 Automatic page segmentation with OSD. | |
# 2 Automatic page segmentation, but no OSD, or OCR. (not implemented) | |
# 3 Fully automatic page segmentation, but no OSD. (Default) | |
# 4 Assume a single column of text of variable sizes. | |
# 5 Assume a single uniform block of vertically aligned text. | |
# 6 Assume a single uniform block of text. | |
# 7 Treat the image as a single text line. | |
# 8 Treat the image as a single word. | |
# 9 Treat the image as a single word in a circle. | |
# 10 Treat the image as a single character. | |
# 11 Sparse text. Find as much text as possible in no particular order. | |
# 12 Sparse text with OSD. | |
# 13 Raw line. Treat the image as a single text line, | |
# bypassing hacks that are Tesseract-specific. | |
def getHeaders():
    """Build the CSV header row.

    The first column is ``file``; it is followed by one
    ``invoice_psm_<n>`` / ``verification_psm_<n>`` pair for every page
    segmentation mode from 0 to 13, except mode 2.
    """
    # Mode 2 is left out because it does not perform OCR.
    psm_modes = (mode for mode in range(14) if mode != 2)
    # To extract more fields, add their column prefixes here.
    field_prefixes = ('invoice_psm_', 'verification_psm_')
    return ['file'] + [
        prefix + str(mode)
        for mode in psm_modes
        for prefix in field_prefixes
    ]
# Patterns are compiled once here instead of on every OCR pass.
# Invoice number: the first run of 8 to 12 consecutive digits.
_INVOICE_RE = re.compile(r"[0-9]{8,12}")
# Verification code: 12 alphanumeric characters containing at least one
# letter and one digit. The lookaheads are bounded to those 12 characters;
# an unbounded ".*" would let a letter or digit found later on the same
# line satisfy the check.
_VERIFICATION_RE = re.compile(
    r"(?=[A-Za-z\d]{0,11}[A-Za-z])(?=[A-Za-z\d]{0,11}\d)[A-Za-z\d]{12}"
)


def getRow(file):
    """Run OCR on one image with every usable PSM and build its CSV row.

    The image is converted to grayscale and binarized with a fixed
    threshold of 128 before being passed to Tesseract.

    Args:
        file: Path of the image to process.

    Returns:
        A list whose first item is the base name of ``file``, followed by
        the extracted invoice number and verification code (an empty
        string when nothing matched) for each page segmentation mode
        from 0 to 13, except mode 2.

    Raises:
        ValueError: If OpenCV cannot read ``file`` as an image.
    """
    # Function-scope import so this block does not depend on the
    # file-level import list.
    import os

    # basename works for any directory depth and for the platform's own
    # path separator, unlike splitting on "/" and taking index 1.
    file_name = os.path.basename(file)

    image = cv2.imread(file)
    if image is None:
        # imread signals failure by returning None; fail here with a clear
        # message instead of an obscure error inside cvtColor.
        raise ValueError("Could not read image file: " + file)

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, im_bw = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY)

    columns = [file_name]
    for n in range(14):
        # Skip mode 2 because it does not perform OCR.
        if n == 2:
            continue
        # Mode 0 is OSD only, so its column is produced by running
        # Tesseract without --psm (i.e. with its default mode).
        psm = "--psm " + str(n) if n > 0 else ""
        config_str = "-l eng " + psm
        print(f"{datetime.now()} Trying extraction of file "
              f"{file_name} with {config_str}")
        text = pytesseract.image_to_string(im_bw, config=config_str)
        # To extract more fields, add their patterns here (and the
        # matching column names in getHeaders).
        for pattern in (_INVOICE_RE, _VERIFICATION_RE):
            match = pattern.search(text)
            columns.append(match.group() if match else "")
    return columns
def start():
    """Run the OCR comparison over ``images/*.jpg`` and write a CSV report.

    The report is created in the current working directory and named
    ``<timestamp> folios_check.csv``. It contains the header row from
    ``getHeaders()`` followed by one ``getRow()`` row per image. Start and
    end times are printed to stdout.
    """
    print("start: " + str(datetime.now()))
    # str(datetime.now()) contains ':' characters, which are not allowed in
    # Windows file names, so they are replaced to keep the name portable.
    timestamp = str(datetime.now()).replace(":", "-")
    with open(timestamp + ' folios_check.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(getHeaders())
        # glob returns matches in arbitrary order; sort them so the rows of
        # the report come out in a reproducible order.
        for filepath in sorted(glob.glob('images/*.jpg')):
            writer.writerow(getRow(filepath))
    print("end: " + str(datetime.now()))
# Run the batch only when this file is executed as a script, so that
# importing it (e.g. to reuse getRow) does not start a full OCR run.
if __name__ == "__main__":
    start()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment