dmgig · September 21, 2021 01:27
diff --git a/ocr.py b/ocr.py
 #!/usr/bin/python3

 import os
 import sys
 from datetime import datetime
 import pytz
 import logging
 import ohhocr
 import getopt
 import argparse

 # Timezone used to stamp the log file name.
 estTZ = pytz.timezone('US/Eastern')

 # Directory (relative to the current working directory) that receives log files.
 LOGSDIR = '_logs'

 if not os.path.exists(f'./{LOGSDIR}'):
     os.mkdir(f'./{LOGSDIR}')

 # One log file per run, named after the Eastern-time start timestamp.
 logging.basicConfig(
     filename=f'./{LOGSDIR}/{datetime.now(estTZ):%Y%m%d%H%M%S}.log',
     level=logging.DEBUG,
 )


 # CONF
 # Defaults; -t/--tesschunk and -s/--sleeptime on the command line override them.
 TESSCHUNK = 4
 TEXTCHUNK = 6
 CONVCHUNK = 6
 SLEEPTIME = 0.05

 try:
     # `args` receives whatever is left after the options (the path to process).
     opts, args = getopt.getopt(sys.argv[1:], "ht:s:", ["tesschunk=", "sleeptime="])
 except getopt.GetoptError:
     print('python3 bin/ocr.py --sleeptime 0.05 --tesschunk 4')
     sys.exit(2)
 for opt, arg in opts:
     if opt == '-h':
         print('python3 bin/ocr.py --sleeptime 0.05 --tesschunk 4')
         sys.exit()
     try:
         # getopt hands every value over as a string; convert here so the
         # numbers can be used as a chunk size and a sleep duration.
         if opt in ("-t", "--tesschunk"):
             TESSCHUNK = int(arg)
         elif opt in ("-s", "--sleeptime"):
             SLEEPTIME = float(arg)
     except ValueError:
         # Non-numeric option value: treat it like any other usage error.
         print('python3 bin/ocr.py --sleeptime 0.05 --tesschunk 4')
         sys.exit(2)

 # Use the first positional argument left over by getopt as the path.
 # Indexing sys.argv[1] directly raised IndexError when no argument was given
 # and picked up an option flag (e.g. "--tesschunk") when options came first.
 if args and args[0]:
     PATH = args[0]
     logging.info(PATH)
     ohhocr.main(PATH, TESSCHUNK, SLEEPTIME)
 else:
     error = "No path provided."
     logging.error(error)
     print(error)
     sys.exit(1)
diff --git a/ohhocr.py b/ohhocr.py
 #!/usr/bin/python3

 import os
 import sys
 import getopt
 import subprocess
 from datetime import datetime
 import time
 import pytesseract
 import argparse
 import cv2
 import shutil
 import logging
 import glob
 import logging
 import math
 import calendar
 from subprocess import Popen
 from PIL import Image
 from time import sleep
 import re
 import pytz
 from PyPDF2 import PdfFileReader
 import pikepdf
 import getopt

 # Timezone used when printing wall-clock timestamps.
 estTZ = pytz.timezone('US/Eastern')

 # Progress bookkeeping: the total is set by prepFileDirs(), the counter is
 # advanced by prepPdf(), and timeStart is the module load time in epoch seconds.
 documentsTotal = 0
 documentCounter = 0
 timeStart = calendar.timegm(time.gmtime())

 # FUNCS

 # Names of the working sub-directories used inside each processed directory.
 LOGSDIR = '_logs'
 IMGSDIR = '_imgs'
 TEXTDIR = '_text'
 DONEDIR = '_done'

 def setUpWorkingDirectory(dir):
     """Create the image, text and done sub-directories of *dir* when absent."""
     for subdir in (IMGSDIR, TEXTDIR, DONEDIR):
         target = dir + '/' + subdir
         if not os.path.exists(target):
             os.mkdir(target)

 def doDirectory(dir):
     """Process the entries of *dir* through prepFileDirs, passing 'pdf' as the type."""
     prepFileDirs(dir, type='pdf')

 def doSingleFile(file):
     """Run prepPdf on one file, using the working sub-directories beside it."""
     parent = os.path.dirname(file)
     imgs, text, done = (parent + '/' + sub for sub in (IMGSDIR, TEXTDIR, DONEDIR))
     prepPdf(file, parent, imgs, text, done)

 def moveEsFiles(dir):
     """Sync the text output of *dir* into its mirror under ``es/``.

     The destination is *dir* with every ``docs_originals/`` replaced by
     ``es/``; it is created if needed.  rsync is invoked with an argument
     list rather than a shell string, so paths containing spaces or shell
     metacharacters are passed through intact.
     """
     print('moving dir '+dir)
     es_dir = dir.replace('docs_originals/', 'es/')
     os.makedirs(es_dir, exist_ok=True)
     # Trailing slash on the source: copy the directory's contents, not the
     # directory itself.
     subprocess.call(['rsync', '-pruv', dir + '/' + TEXTDIR + '/', es_dir])

 def moveDoneFiles(dir):
     """Move everything in each ``_done`` directory under *dir* up one level.

     Covers ``dir/_done`` and every nested ``_done`` directory.  Paths are
     shell-quoted so names with spaces or metacharacters survive; the ``/*``
     glob is left unquoted on purpose so the shell still expands it.
     """
     import shlex

     def _moveUp(doneDir):
         os.system('mv -v ' + shlex.quote(doneDir) + '/* ' + shlex.quote(doneDir + '/../'))

     doneDirs = glob.glob(dir+'/**/_done', recursive=True)
     for doneDir in doneDirs:
         print("------------------"+doneDir)
         _moveUp(doneDir)

     # The recursive glob normally already matched dir/_done ("**" matches
     # zero directories); only fall back to the explicit path when it did not,
     # so the same directory is not processed twice.
     topDone = dir + '/_done'
     if os.path.normpath(topDone) not in {os.path.normpath(d) for d in doneDirs}:
         _moveUp(topDone)

 def cleanUp(dir):
     """Delete the ``_imgs``, ``_text`` and ``_done`` directories in and below *dir*.

     The three top-level directories are removed first (and must exist);
     any remaining nested ones are then found with a recursive glob.
     """
     working = ('_imgs', '_text', '_done')
     for name in working:
         shutil.rmtree(dir + '/' + name)
     for name in working:
         for leftover in glob.glob(dir + '/**/' + name, recursive=True):
             shutil.rmtree(leftover)

 # time conv https://stackoverflow.com/questions/4048651/python-function-to-convert-seconds-into-minutes-hours-and-days/4048773
 def dhmsTime(q):
     """Format a duration of *q* seconds as days, hours, minutes and seconds."""
     wholeDays, leftover = divmod(q, 86400)
     wholeHours, leftover = divmod(leftover, 3600)
     wholeMinutes, seconds = divmod(leftover, 60)
     return "%i days, %i hours, %i minutes, %i seconds" % (wholeDays, wholeHours, wholeMinutes, seconds)

 # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
 def chunks(l, n):
     """Yield consecutive slices of *l*, each at most *n* items long."""
     yield from (l[start:start + n] for start in range(0, len(l), n))

 # loop through files in dir
 def listFiles(dir):
     """Print *dir*, a separator line, then the base name of each regular ``*.pdf`` file directly inside it."""
     print(dir)
     print("=========================")
     pdfNames = (
         os.path.basename(candidate)
         for candidate in glob.glob(dir+'/*.pdf')
         if os.path.isfile(candidate)
     )
     for name in pdfNames:
         print(name)

 def prepFileDirs(dir,type):
     """Process every entry directly inside *dir*, recursing into sub-directories.

     Regular files are handed to prepPdf together with *dir*'s working
     sub-directories.  Any other entry, except the working sub-directories
     themselves, gets its own working directories, is processed recursively
     and has its text output synced by moveEsFiles.

     Parameters:
         dir  - directory whose entries are processed
         type - accepted for interface compatibility; currently unused
                (entries are not filtered by extension)
     """
     global documentsTotal

     # natural_sort returns a new list; the result has to be kept, otherwise
     # the entries stay in glob's arbitrary order.
     entries = natural_sort(glob.glob(dir+'/*'))

     # Count only real files and accumulate, so recursive calls extend the
     # total instead of overwriting it and directories are not counted.
     documentsTotal += sum(1 for entry in entries if os.path.isfile(entry))

     for entry in entries:
         if os.path.isfile(entry):
             prepPdf(entry, os.path.dirname(entry), dir+'/'+IMGSDIR, dir+'/'+TEXTDIR, dir+'/'+DONEDIR)
         elif os.path.basename(entry) not in (IMGSDIR, TEXTDIR, DONEDIR):
             setUpWorkingDirectory(entry)
             doDirectory(entry)
             moveEsFiles(entry)

 def appendToFilename(filePath, append):
     """Return *filePath* with *append* inserted just before the file extension.

     The directory part is kept as given.  A bare file name stays relative
     (it previously came back prefixed with '/'), and a file directly under
     the root no longer gains a doubled slash.
     """
     dirName, baseName = os.path.split(filePath)
     stem, extension = os.path.splitext(baseName)
     return os.path.join(dirName, stem + append + extension)

 def prepPdf(pdfPath, dirName, imgPath, textPath, dirDone):
  """Render one PDF to page images, OCR them with tesseract, and file the PDF as done.

  Parameters:
      pdfPath  - path of the PDF to process
      dirName  - directory whose IMGSDIR sub-directory is emptied afterwards
      imgPath  - directory Ghostscript writes the page PNGs into
      textPath - directory tesseract writes its output into
      dirDone  - directory the processed PDF is moved to

  Side effects: runs the external `gs` and `tesseract` programs, increments
  the module-level documentCounter, prints progress and timing to stdout and
  logs the average seconds per page image.
  """
  global documentsTotal
  global documentCounter
  global timeStart
  global SLEEPTIME
  global TESSCHUNK

  print(makeLayout('=', len(pdfPath)))
  print(pdfPath)
  print(makeLayout('=', len(pdfPath)))
  # convert pdf to png
  # with open(pdfPath, "rb") as pdf_file:
    # pdf = pikepdf.open(pdf_file)
    # pdf.save(pdf)
    # pdf_reader = PdfFileReader(pdf_file, False)
  print(f"Ghostscript running...")
  # One grayscale 300x300 PNG per page, named <pdf stem>_<page>.png in imgPath.
  # The return code of gs is not checked.
  subprocess.call(['gs', '-dNOPAUSE', '-dBATCH', '-dTextAlphaBits=4', '-dGraphicsAlphaBits=4', '-sDEVICE=pnggray', '-r300x300', '-dNumRenderingThreads=4', '-dBufferSpace=2000000000', '-sCompression=none', '-sOutputFile='+imgPath+'/'+os.path.splitext(os.path.basename(pdfPath))[0]+'_%d.png', pdfPath ])

  # Timers start after rendering, so the per-document figures cover OCR only.
  timeStartDoc = calendar.timegm(time.gmtime());
  millitimeStart = time.time()
  # Picks up every PNG in imgPath, not only the ones produced for this PDF.
  imgFiles = glob.glob(imgPath+'/*.png')
  imgFilesCnt = len(imgFiles)

  # remove gray backgrounds
 #   imgFilesChunks = chunks(imgFiles, CONVCHUNK)
 #   i=0
 #   chunksSize = math.ceil(imgFilesCnt / CONVCHUNK);
 #   for chunk in imgFilesChunks:
 #     print("Chunk %d of %d" % (i+1,chunksSize))
 #     ps = {}
 #     for file in chunk:
 #       baseName = os.path.basename(file)
 #       p = subprocess.Popen(['convert', file, '-set', 'colorspace', 'gray', '-contrast-stretch', '4x80%', file], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 #       ps[p.pid] = p
 #     print("Waiting for %d convert processes..." % len(ps))
 #     while ps:
 #       pid, status = os.wait()
 #       if pid in ps:
 #         del ps[pid]
 #         print("Waiting for %d convert processes..." % len(ps))
 #     i=i+1

  # image enhancement/restoration
 #   imgFiles = glob.glob(imgPath+'/*.png')
 #   imgFilesChunks = chunks(imgFiles, TEXTCHUNK)
 #   i=0
 #   chunksSize = math.ceil(imgFilesCnt / TEXTCHUNK)
 #   for chunk in imgFilesChunks:
 #     print("Chunk %d of %d" % (i+1,chunksSize))
 #     ps = {}
 #     for file in chunk:
 #       baseName = os.path.basename(file)
 # #       p = subprocess.Popen(['textcleaner', '-u', '-T', '-p', '20', file, file], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 #       p = subprocess.Popen(['textcleaner', '-g', '-e', 'none', '-f', '10', '-o', '5', file, 'x'+file], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 #       ps[p.pid] = p
 #       sleep(0.05)
 #     print("Waiting for %d textcleaner processes..." % len(ps))
 #     while ps:
 #       pid, status = os.wait()
 #       if pid in ps:
 #         del ps[pid]
 #         print("Waiting for %d textcleaner processes..." % len(ps))
 #     i=i+1

  # tesseract (TESSCHUNK processes at a time)
  # https://stackoverflow.com/questions/3194018/wait-the-end-of-subprocesses-with-multiple-parallel-jobs
  imgFiles = glob.glob(imgPath+'/*.png')
  imgFilesChunks = chunks(imgFiles, TESSCHUNK)
  # i counts finished tesseract processes and drives the progress bar.
  i=0
  # chunksSize is computed but not used below.
  chunksSize = math.ceil(imgFilesCnt / TESSCHUNK)
  print(makeLayout('=', len(pdfPath)))
  print(pdfPath)
  print(makeLayout('=', len(pdfPath)))
  for chunk in imgFilesChunks:
    # pid -> Popen for the processes of the current batch that are still running.
    ps = {}
    for file in chunk:
      baseName = os.path.basename(file)
      # Output base is textPath/<image stem>; tesseract's own output is discarded.
      p = subprocess.Popen(['tesseract', file, textPath+'/'+os.path.splitext(baseName)[0], '-v', '-l', 'eng'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
      ps[p.pid] = p
      # Pause between launches.
      sleep(SLEEPTIME)
    # Drain the whole batch before starting the next one.
    while ps:
        # os.wait() returns when any child exits; pids that do not belong to
        # this batch are ignored.
        pid, status = os.wait()
        if pid in ps:
            i = i+1
            printProgressBar(i, len(imgFiles), prefix = 'Tesseract running, ', suffix = 'complete')
            del ps[pid]

  # Empty the image directory by deleting and recreating it.
  shutil.rmtree(dirName+'/'+IMGSDIR)
  os.mkdir(dirName+'/'+IMGSDIR)

  # move finished pdf
  shutil.move(pdfPath, dirDone+'/');
  documentCounter = documentCounter + 1;
  curTime = calendar.timegm(time.gmtime());
  print(datetime.now(estTZ).strftime("%Y-%m-%d %H:%M:%S"))
  # Elapsed time since module load, then elapsed time for this document.
  runningTimeString = dhmsTime(curTime - timeStart)
  print(runningTimeString)
  documentTimeString = dhmsTime(curTime - timeStartDoc)
  print(documentTimeString)
  # Log the average seconds per page image; skipped when there were no images.
  if len(imgFiles):
    logging.info(float(time.time() - millitimeStart) / len(imgFiles))
  print("Completed %d documents in %s" % (documentCounter, runningTimeString))
  print("Documents remaining: %s" % (documentsTotal - documentCounter))
  print("Last doc took %s\n\n" % documentTimeString)

 def natural_sort(l):
     """Return a new list of the strings in *l*, sorted case-insensitively with digit runs compared as numbers."""
     def _token(text):
         # Digit runs compare numerically, everything else case-insensitively.
         if text.isdigit():
             return int(text)
         return text.lower()

     def _sortKey(item):
         return [_token(piece) for piece in re.split('([0-9]+)', item)]

     return sorted(l, key=_sortKey)

 def makeLayout(char, n):
     """Return *char* repeated *n* times.

     A zero or negative *n* gives an empty string; the earlier countdown loop
     never terminated for negative values.
     """
     return char * max(n, 0)

 # Print iterations progress
 # https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
 def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
     """
     Draw a one-line text progress bar; call repeatedly from a loop.
     @params:
         iteration   - Required  : current iteration (Int)
         total       - Required  : total iterations (Int)
         prefix      - Optional  : text printed before the bar (Str)
         suffix      - Optional  : text printed after the percentage (Str)
         decimals    - Optional  : number of decimals shown in the percentage (Int)
         length      - Optional  : width of the bar in characters (Int)
         fill        - Optional  : character used for the completed part (Str)
         printEnd    - Optional  : string printed after the line, a carriage return by default (Str)
     """
     fraction = iteration / float(total)
     percent = f"{100 * fraction:.{decimals}f}"
     filled = int(length * iteration // total)
     bar = fill * filled + '-' * (length - filled)
     print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
     # Finish with a newline once the last iteration is reached.
     if iteration == total:
         print()

 # WORK

 # Defaults read by prepPdf(); main() replaces them with the caller's values.
 TESSCHUNK=4
 SLEEPTIME=0.05

 def main(PATH, iTESSCHUNK, iSLEEPTIME):
     """Run the OCR pipeline on PATH, which is either a directory or a single file.

     Parameters:
         PATH       - directory to process, or path of a single file
         iTESSCHUNK - number of tesseract processes per batch (int or numeric string)
         iSLEEPTIME - pause in seconds between tesseract launches (float or numeric string)

     Raises:
         ValueError - when the numeric arguments cannot be converted or
                      iTESSCHUNK is smaller than 1
         SystemExit - with status 1 when PATH does not exist
     """
     # Without the global declaration the assignments only created locals and
     # prepPdf() kept using the module defaults.
     global TESSCHUNK, SLEEPTIME
     TESSCHUNK = int(iTESSCHUNK)
     SLEEPTIME = float(iSLEEPTIME)
     if TESSCHUNK < 1:
         raise ValueError("tesschunk must be at least 1")

     if not os.path.exists(PATH):
         error = "Path does not exist."
         logging.error(error)
         print(error)
         sys.exit(1)

     if os.path.isdir(PATH):
         WORKINGDIR = PATH
         setUpWorkingDirectory(WORKINGDIR)
         listFiles(WORKINGDIR)
         doDirectory(WORKINGDIR)
     else:
         # A bare file name has an empty dirname; fall back to the current
         # directory so the working directories are not created under "/".
         WORKINGDIR = os.path.dirname(PATH) or os.curdir
         setUpWorkingDirectory(WORKINGDIR)
         # Pass the path with its directory part so the callee derives the
         # same working directory.
         doSingleFile(os.path.join(WORKINGDIR, os.path.basename(PATH)))
     moveDoneFiles(WORKINGDIR)
     cleanUp(WORKINGDIR)
	#!/usr/bin/python3

	import os
	import sys
	from datetime import datetime
	import pytz
	import logging
	import ohhocr
	import getopt
	import argparse

	# Timezone used to stamp the log file name.
	estTZ = pytz.timezone('US/Eastern')

	LOGSDIR = '_logs'

	if not os.path.exists('./'+LOGSDIR):
	    os.mkdir('./'+LOGSDIR)

	# One log file per run, named after the Eastern-time start timestamp.
	logging.basicConfig(filename='./'+LOGSDIR+'/'+datetime.now(estTZ).strftime("%Y%m%d%H%M%S")+'.log', level=logging.DEBUG)


	# CONF
	# Defaults; -t/--tesschunk and -s/--sleeptime on the command line override them.
	TESSCHUNK = 4
	TEXTCHUNK = 6
	CONVCHUNK = 6
	SLEEPTIME = 0.05

	try:
	    # `args` receives whatever is left after the options (the path to process).
	    opts, args = getopt.getopt(sys.argv[1:], "ht:s:", ["tesschunk=", "sleeptime="])
	except getopt.GetoptError:
	    print('python3 bin/ocr.py --sleeptime 0.05 --tesschunk 4')
	    sys.exit(2)
	for opt, arg in opts:
	    if opt == '-h':
	        print('python3 bin/ocr.py --sleeptime 0.05 --tesschunk 4')
	        sys.exit()
	    try:
	        # getopt hands every value over as a string; convert before use.
	        if opt in ("-t", "--tesschunk"):
	            TESSCHUNK = int(arg)
	        elif opt in ("-s", "--sleeptime"):
	            SLEEPTIME = float(arg)
	    except ValueError:
	        # Non-numeric option value: treat it like any other usage error.
	        print('python3 bin/ocr.py --sleeptime 0.05 --tesschunk 4')
	        sys.exit(2)

	# Use the first positional argument left over by getopt as the path;
	# sys.argv[1] raised IndexError when absent and could be an option flag.
	if args and args[0]:
	    PATH = args[0]
	    logging.info(PATH)
	    ohhocr.main(PATH, TESSCHUNK, SLEEPTIME)
	else:
	    error = "No path provided."
	    logging.error(error)
	    print(error)
	    sys.exit(1)