llermaly · April 13, 2021 05:17
diff --git a/tesseract-rank.py b/tesseract-rank.py
 import sys
 import cv2
 import pytesseract
 import re
 import json
 import numpy as np
 import glob
 import csv
 from datetime import datetime

 # Test image set with all tesseract algorithms
 # Put your test images under the images folder
 # Define the columns you want to test and the regex that extracts each field
 # Output is a CSV file

 # Page segmentation modes:
 #   0    Orientation and script detection (OSD) only.
 #   1    Automatic page segmentation with OSD.
 #   2    Automatic page segmentation, but no OSD, or OCR. (not implemented)
 #   3    Fully automatic page segmentation, but no OSD. (Default)
 #   4    Assume a single column of text of variable sizes.
 #   5    Assume a single uniform block of vertically aligned text.
 #   6    Assume a single uniform block of text.
 #   7    Treat the image as a single text line.
 #   8    Treat the image as a single word.
 #   9    Treat the image as a single word in a circle.
 #  10    Treat the image as a single character.
 #  11    Sparse text. Find as much text as possible in no particular order.
 #  12    Sparse text with OSD.
 #  13    Raw line. Treat the image as a single text line,
 #        bypassing hacks that are Tesseract-specific.


 def getHeaders():
     """Build the CSV header row.

     Returns:
         A list that starts with 'file' and is followed by an
         'invoice_psm_<n>' / 'verification_psm_<n>' pair for every page
         segmentation mode from 0 to 13, with mode 2 left out.
     """
     # mode 2 is left out because it does not perform OCR
     modes = [mode for mode in range(14) if mode != 2]

     names = ['file']
     for mode in modes:
         suffix = str(mode)
         # here you can add more fields to extract
         names.extend(['invoice_psm_' + suffix,
                       'verification_psm_' + suffix])
     return names


 def getRow(file):
     """Run OCR on one image with every usable page segmentation mode.

     The image is converted to grayscale, binarised with a fixed threshold
     of 128 and then passed to Tesseract once per page segmentation mode
     (PSM). Two fields are pulled out of each OCR result with regular
     expressions.

     Args:
         file: Path of the image to process.

     Returns:
         A list whose first item is the base name of the image file,
         followed by an (invoice, verification) pair of strings for each
         PSM, in the same order as the columns built by getHeaders().
         A field that was not found is an empty string.

     Raises:
         OSError: If OpenCV could not load the image.
     """
     import os  # local import so the block does not depend on file-level imports

     # basename works for any directory depth (and for the platform's own
     # separator), where split("/")[1] only worked for "dir/name" paths
     file_name = os.path.basename(file)
     columns = [file_name]

     image = cv2.imread(file)
     if image is None:
         # cv2.imread reports failure by returning None instead of raising
         raise OSError("Could not read image file: " + file)
     gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
     _, im_bw = cv2.threshold(gray_image, 128, 255, cv2.THRESH_BINARY)

     # here you can add more fields to extract
     invoice_re = re.compile(r"[0-9]{8,12}")
     # 12 alphanumeric characters holding at least one letter and one digit.
     # The lookaheads are limited to those 12 characters; an unbounded ".*"
     # would accept an all-digit run followed by a letter later in the line.
     verification_re = re.compile(
         r"(?=[A-Za-z\d]{0,11}[A-Za-z])(?=[A-Za-z\d]{0,11}\d)[A-Za-z\d]{12}")

     # skip mode 2 because it does not perform OCR
     for n in [x for x in range(14) if x != 2]:
         # mode 0 is orientation/script detection only, so that slot is run
         # without a --psm flag (Tesseract's default mode) instead
         psm = "--psm "+str(n) if n > 0 else ""
         config_str = "-l eng "+psm

         print(str(datetime.now()) + " Trying extraction of file " +
               file_name + " with " + config_str)
         text = pytesseract.image_to_string(im_bw, config=config_str)

         invoice = invoice_re.search(text)
         verification = verification_re.search(text)

         columns.append(invoice.group() if invoice else "")
         columns.append(verification.group() if verification else "")
     return columns


 def start():
     """Run the OCR comparison over images/*.jpg and write the results to CSV.

     A CSV file named '<timestamp> folios_check.csv' is created in the
     current working directory. It contains the header row from
     getHeaders() followed by one getRow() row per image. Start and end
     times are printed to stdout.
     """
     print("start: "+str(datetime.now()))
     # str(datetime.now()) contains ':' which is not allowed in Windows file
     # names, so the time part is written with '-' separators instead
     timestamp = datetime.now().strftime('%Y-%m-%d %H-%M-%S.%f')
     with open(timestamp+' folios_check.csv', 'w', newline='') as csv_file:
         writer = csv.writer(csv_file)
         writer.writerow(getHeaders())
         # glob yields files in arbitrary order; sort for a reproducible CSV
         for filepath in sorted(glob.iglob('images/*.jpg')):
             writer.writerow(getRow(filepath))
     print("end: "+str(datetime.now()))


 # Script entry point: the whole run is triggered as soon as this module is
 # executed (or imported), since the call is not guarded by __name__.
 start()
	import sys
	import cv2
	import pytesseract
	import re
	import json
	import numpy as np
	import glob
	import csv
	from datetime import datetime

	# Test image set with all tesseract algorithms
	# Put your test images under the images folder
	# Define the columns you want to test and the regex that extracts each field
	# Output is a CSV file

	# Page segmentation modes:
	# 0 Orientation and script detection (OSD) only.
	# 1 Automatic page segmentation with OSD.
	# 2 Automatic page segmentation, but no OSD, or OCR. (not implemented)
	# 3 Fully automatic page segmentation, but no OSD. (Default)
	# 4 Assume a single column of text of variable sizes.
	# 5 Assume a single uniform block of vertically aligned text.
	# 6 Assume a single uniform block of text.
	# 7 Treat the image as a single text line.
	# 8 Treat the image as a single word.
	# 9 Treat the image as a single word in a circle.
	# 10 Treat the image as a single character.
	# 11 Sparse text. Find as much text as possible in no particular order.
	# 12 Sparse text with OSD.
	# 13 Raw line. Treat the image as a single text line,
	# bypassing hacks that are Tesseract-specific.


	def getHeaders():
	    """Build the CSV header row.

	    Returns:
	        A list that starts with 'file' and is followed by an
	        'invoice_psm_<n>' / 'verification_psm_<n>' pair for every page
	        segmentation mode from 0 to 13, with mode 2 left out.
	    """
	    headers = ['file']
	    # skip mode 2 because it does not perform OCR
	    for n in [x for x in range(14) if x != 2]:
	        # here you can add more fields to extract
	        headers.append('invoice_psm_'+str(n))
	        headers.append('verification_psm_'+str(n))
	    return headers


	def getRow(file):
	    """Run OCR on one image with every usable page segmentation mode.

	    The image is converted to grayscale, binarised with a fixed threshold
	    of 128 and then passed to Tesseract once per page segmentation mode
	    (PSM). Two fields are pulled out of each OCR result with regular
	    expressions.

	    Args:
	        file: Path of the image to process.

	    Returns:
	        A list whose first item is the base name of the image file,
	        followed by an (invoice, verification) pair of strings for each
	        PSM, in the same order as the columns built by getHeaders().
	        A field that was not found is an empty string.

	    Raises:
	        OSError: If OpenCV could not load the image.
	    """
	    import os  # local import so the block does not depend on file-level imports

	    # basename works for any directory depth (and for the platform's own
	    # separator), where split("/")[1] only worked for "dir/name" paths
	    file_name = os.path.basename(file)
	    columns = [file_name]

	    image = cv2.imread(file)
	    if image is None:
	        # cv2.imread reports failure by returning None instead of raising
	        raise OSError("Could not read image file: " + file)
	    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
	    _, im_bw = cv2.threshold(gray_image, 128, 255, cv2.THRESH_BINARY)

	    # here you can add more fields to extract
	    invoice_re = re.compile(r"[0-9]{8,12}")
	    # 12 alphanumeric characters holding at least one letter and one digit.
	    # The previous pattern "(?=.[A-Za-z])(?=.\d)" required the second
	    # character to be a letter and a digit at once, so it never matched.
	    # The lookaheads are limited to the 12 matched characters.
	    verification_re = re.compile(
	        r"(?=[A-Za-z\d]{0,11}[A-Za-z])(?=[A-Za-z\d]{0,11}\d)[A-Za-z\d]{12}")

	    # skip mode 2 because it does not perform OCR
	    for n in [x for x in range(14) if x != 2]:
	        # mode 0 is orientation/script detection only, so that slot is run
	        # without a --psm flag (Tesseract's default mode) instead
	        psm = "--psm "+str(n) if n > 0 else ""
	        config_str = "-l eng "+psm

	        print(str(datetime.now()) + " Trying extraction of file " +
	              file_name + " with " + config_str)
	        text = pytesseract.image_to_string(im_bw, config=config_str)

	        invoice = invoice_re.search(text)
	        verification = verification_re.search(text)

	        columns.append(invoice.group() if invoice else "")
	        columns.append(verification.group() if verification else "")
	    return columns


	def start():
	    """Run the OCR comparison over images/*.jpg and write the results to CSV.

	    A CSV file named '<timestamp> folios_check.csv' is created in the
	    current working directory. It contains the header row from
	    getHeaders() followed by one getRow() row per image. Start and end
	    times are printed to stdout.
	    """
	    print("start: "+str(datetime.now()))
	    # str(datetime.now()) contains ':' which is not allowed in Windows file
	    # names, so the time part is written with '-' separators instead
	    timestamp = datetime.now().strftime('%Y-%m-%d %H-%M-%S.%f')
	    with open(timestamp+' folios_check.csv', 'w', newline='') as csv_file:
	        writer = csv.writer(csv_file)
	        writer.writerow(getHeaders())
	        # glob yields files in arbitrary order; sort for a reproducible CSV
	        for filepath in sorted(glob.iglob('images/*.jpg')):
	            writer.writerow(getRow(filepath))
	    print("end: "+str(datetime.now()))


	# Script entry point: the whole run is triggered as soon as this module is
	# executed (or imported), since the call is not guarded by __name__.
	start()