Skip to content

Instantly share code, notes, and snippets.

@naytseyd
Created June 11, 2023 15:36
Show Gist options
  • Save naytseyd/c1968ce476238805af006710878cd8ce to your computer and use it in GitHub Desktop.
Save naytseyd/c1968ce476238805af006710878cd8ce to your computer and use it in GitHub Desktop.
from os import listdir, path, remove
from re import findall
from threading import Lock, Thread
import pytesseract
from PIL import Image
class ImageProcessor:
    """OCR every image in a folder and collect matching tokens in one file.

    One thread is started per image. Tokens matched by the filter pattern
    are appended to ``output_file``, one per line, and the source image is
    then deleted. Images that yield no match are left in place and
    reported on stdout.
    """

    def __init__(self, image_folder, output_file):
        """Store the configuration and create the output-file lock.

        Args:
            image_folder: Directory that is scanned for image files.
            output_file: Path of the text file that matches are appended to.
        """
        self.image_folder = image_folder
        self.output_file = output_file
        # Serialises appends to ``output_file`` across worker threads.
        self.lock = Lock()

    def process_image(self, image_path):
        """Return the text Tesseract recognises in the image at ``image_path``.

        Args:
            image_path: Path of the image file to run OCR on.

        Returns:
            The recognised text, using the English language model.
        """
        # Close the image as soon as OCR is done: the caller may delete the
        # file right afterwards, and a lingering open handle leaks a file
        # descriptor and can make that deletion fail (e.g. on Windows).
        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image, lang='eng')

    def process_image_thread(self, image_path):
        """OCR one image, record its matches and delete it on success.

        A match is either a token made of ``64`` followed by exactly 22
        ASCII letters/digits, or a standalone run of exactly 7 digits.

        Args:
            image_path: Path of the image file to process.
        """
        text = self.process_image(image_path)
        filtered_text = findall(r'\b64[A-Za-z0-9]{22}\b|\b\d{7}\b', text)
        if not filtered_text:
            # Nothing useful was recognised; keep the image for inspection.
            print(f'Text not found - {image_path}')
            return

        # Only one thread at a time may append, so that the lines written
        # for one image are not interleaved with another image's lines.
        with self.lock:
            with open(self.output_file, 'a') as f:
                f.writelines(f'{item}\n' for item in filtered_text)
        # The image is deleted only after its matches have been written.
        remove(image_path)

    def process_images(self):
        """Process every PNG/JPEG file in ``image_folder`` concurrently.

        Files are selected by a case-sensitive check for the ``.png``,
        ``.jpg`` or ``.jpeg`` suffix. The method returns once all worker
        threads have finished.
        """
        image_files = [
            f
            for f in listdir(self.image_folder)
            if f.endswith(('.png', '.jpg', '.jpeg'))
        ]

        threads = []
        for image_file in image_files:
            image_path = path.join(self.image_folder, image_file)
            thread = Thread(target=self.process_image_thread, args=(image_path,))
            thread.start()
            threads.append(thread)

        # Wait for every worker before announcing completion.
        for thread in threads:
            thread.join()

        print(f'Output file generated - {self.output_file}')
if __name__ == '__main__':
    # Script entry point: run OCR over the configured folder and append
    # the matched tokens to the output file.
    source_dir, results_path = '/path/to/folder', 'ocr_output.txt'
    ImageProcessor(source_dir, results_path).process_images()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment