Created
June 11, 2023 15:36
-
-
Save naytseyd/c1968ce476238805af006710878cd8ce to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from concurrent.futures import ThreadPoolExecutor
from os import listdir, path, remove
from re import findall
from threading import Lock, Thread

import pytesseract
from PIL import Image
class ImageProcessor:
    """OCR the images in a folder and append matching tokens to a text file.

    Each image whose file name ends in ``.png``, ``.jpg`` or ``.jpeg`` is run
    through Tesseract. Every token matching ``_TOKEN_PATTERN`` is appended to
    ``output_file`` on its own line, and an image that produced at least one
    token is deleted afterwards. Images without a match are left in place.

    Args:
        image_folder: Directory that is scanned (non-recursively) for images.
        output_file: Path of the text file the tokens are appended to.
        max_workers: Upper bound on concurrently processed images. ``None``
            lets ``ThreadPoolExecutor`` pick its default.
    """

    # "64" followed by 22 ASCII alphanumerics, or a standalone 7-digit number.
    _TOKEN_PATTERN = r'\b64[A-Za-z0-9]{22}\b|\b\d{7}\b'
    _IMAGE_SUFFIXES = ('.png', '.jpg', '.jpeg')

    def __init__(self, image_folder, output_file, max_workers=None):
        self.image_folder = image_folder
        self.output_file = output_file
        self.max_workers = max_workers
        # Serializes appends to the shared output file across worker threads.
        self.lock = Lock()

    def process_image(self, image_path):
        """Return the English OCR text of the image at ``image_path``."""
        # Close the image handle deterministically: the caller may delete the
        # file right after this returns.
        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image, lang='eng')

    def process_image_thread(self, image_path):
        """OCR one image, append its tokens to the output file, then delete it.

        If no token is found, a notice is printed and the image is kept.
        """
        text = self.process_image(image_path)
        filtered_text = findall(self._TOKEN_PATTERN, text)
        if not filtered_text:
            print(f'Text not found - {image_path}')
            return

        with self.lock:
            # Explicit encoding: ``\d`` can match non-ASCII digits, which a
            # platform-default codec may be unable to encode.
            with open(self.output_file, 'a', encoding='utf-8') as f:
                f.writelines(f'{item}\n' for item in filtered_text)
        # Only reached once the tokens were written successfully.
        remove(image_path)

    def process_images(self):
        """Process every image in ``image_folder`` using a bounded thread pool.

        A failure on one image is reported and does not stop the others.
        """
        image_paths = [
            path.join(self.image_folder, name)
            for name in listdir(self.image_folder)
            if name.endswith(self._IMAGE_SUFFIXES)
        ]

        # A bounded pool avoids starting one thread (and one OCR call) per
        # image at the same time when the folder is large.
        with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
            futures = {
                pool.submit(self.process_image_thread, image_path): image_path
                for image_path in image_paths
            }

        # Leaving the ``with`` block waits for all workers; surface failures
        # instead of letting the futures swallow them.
        for future, image_path in futures.items():
            error = future.exception()
            if error is not None:
                print(f'Failed to process - {image_path}: {error!r}')

        print(f'Output file generated - {self.output_file}')
if __name__ == '__main__':
    # Script entry point. '/path/to/folder' is a placeholder for the image
    # directory; extracted tokens are appended to 'ocr_output.txt'.
    processor = ImageProcessor(
        image_folder='/path/to/folder',
        output_file='ocr_output.txt',
    )
    processor.process_images()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment