shion24hub · November 22, 2023 13:19
diff --git a/main.py b/main.py
 from ocre import OCRElaborator

 if __name__ == '__main__':
    # Script entry point: OCR the two sample images and print the combined,
    # cleaned-up text.
    elaborator = OCRElaborator(['test.png', 'test2.png'])
    result = elaborator.run()
    print(result)
diff --git a/ocre.py b/ocre.py
 '''
 Use Tesseract as an OCR engine.
 '''

 import pytesseract


 class OCRElaborator:
    '''Run Tesseract OCR over a list of images and tidy up the recognised text.

    For every image path the text is extracted with ``pytesseract``, runs of
    consecutive newlines are collapsed, words hyphenated across a line break
    are re-joined and the remaining newlines are turned into spaces. The
    per-image results are finally joined with a single space.
    '''

    # TODO: allow users to specify which methods are applied.
    available_methods = [
        'delete_duplicated_CR',
        'cancel_hyphenation',
    ]

    def __init__(self, img_paths: list[str]) -> None:
        '''Remember the image paths that ``run`` will process.

        Args:
            img_paths: Paths of the images, in the order their text should
                appear in the final output.
        '''
        self.img_paths = img_paths

    @staticmethod
    def __ocr(img_path: str) -> str:
        '''Return the English text Tesseract recognises in ``img_path``.'''
        return pytesseract.image_to_string(img_path, lang='eng')

    @staticmethod
    def __delete_duplicated_CR(sentence: str) -> str:
        '''Collapse every run of consecutive newlines into a single newline.

        Args:
            sentence: Raw text to clean.

        Returns:
            The text in which no two newlines are adjacent.
        '''
        kept = []
        # Start with no predecessor: indexing with ``i - 1`` at ``i == 0``
        # would wrap around to the last character and wrongly drop a leading
        # newline whenever the text also ends with a newline.
        previous = ''
        for char in sentence:
            if not (char == '\n' and previous == '\n'):
                kept.append(char)
            previous = char
        return ''.join(kept)

    @staticmethod
    def __cancel_hyphenation(sentence: str) -> str:
        '''Re-join hyphenated line breaks and turn other newlines into spaces.

        A hyphen immediately followed by a newline is removed together with
        the newline; every remaining newline becomes a single space.

        Args:
            sentence: Text whose line breaks should be flattened.

        Returns:
            The text without any newline characters.
        '''
        # Order matters: hyphenated breaks must be removed before the
        # remaining newlines are replaced. Working on adjacent pairs only also
        # avoids treating the last character as the predecessor of the first.
        return sentence.replace('-\n', '').replace('\n', ' ')

    @staticmethod
    def __link(sentences: list[str]) -> str:
        '''Join the per-image texts with a single space between them.'''
        return ' '.join(sentences)

    def run(self) -> str:
        '''OCR every image, clean each result and return the combined text.

        Returns:
            The cleaned texts of all images, in input order, separated by a
            single space.
        '''
        processed = []
        for img_path in self.img_paths:
            text = self.__ocr(img_path)
            text = self.__delete_duplicated_CR(text)
            text = self.__cancel_hyphenation(text)
            processed.append(text)

        return self.__link(processed)
	from ocre import OCRElaborator

	if __name__ == '__main__':
	    # Script entry point: OCR the two sample images and print the combined,
	    # cleaned-up text. The guard body is indented again so that it parses.
	    img_paths = ['test.png', 'test2.png']
	    elaborator = OCRElaborator(img_paths)
	    print(elaborator.run())
	'''
	Use Tesseract as an OCR engine.
	'''

	import pytesseract


	class OCRElaborator:
	    '''Run Tesseract OCR over a list of images and tidy up the recognised text.

	    For every image path the text is extracted with ``pytesseract``, runs of
	    consecutive newlines are collapsed, words hyphenated across a line break
	    are re-joined and the remaining newlines are turned into spaces. The
	    per-image results are finally joined with a single space.
	    '''

	    # TODO: allow users to specify which methods are applied.
	    available_methods = [
	        'delete_duplicated_CR',
	        'cancel_hyphenation',
	    ]

	    def __init__(self, img_paths: list[str]) -> None:
	        '''Remember the image paths that ``run`` will process.

	        Args:
	            img_paths: Paths of the images, in the order their text should
	                appear in the final output.
	        '''
	        self.img_paths = img_paths

	    @staticmethod
	    def __ocr(img_path: str) -> str:
	        '''Return the English text Tesseract recognises in ``img_path``.'''
	        return pytesseract.image_to_string(img_path, lang='eng')

	    @staticmethod
	    def __delete_duplicated_CR(sentence: str) -> str:
	        '''Collapse every run of consecutive newlines into a single newline.

	        Args:
	            sentence: Raw text to clean.

	        Returns:
	            The text in which no two newlines are adjacent.
	        '''
	        kept = []
	        # Start with no predecessor: indexing with ``i - 1`` at ``i == 0``
	        # would wrap around to the last character and wrongly drop a leading
	        # newline whenever the text also ends with a newline.
	        previous = ''
	        for char in sentence:
	            if not (char == '\n' and previous == '\n'):
	                kept.append(char)
	            previous = char
	        return ''.join(kept)

	    @staticmethod
	    def __cancel_hyphenation(sentence: str) -> str:
	        '''Re-join hyphenated line breaks and turn other newlines into spaces.

	        A hyphen immediately followed by a newline is removed together with
	        the newline; every remaining newline becomes a single space.

	        Args:
	            sentence: Text whose line breaks should be flattened.

	        Returns:
	            The text without any newline characters.
	        '''
	        # Order matters: hyphenated breaks must be removed before the
	        # remaining newlines are replaced. Working on adjacent pairs only also
	        # avoids treating the last character as the predecessor of the first.
	        return sentence.replace('-\n', '').replace('\n', ' ')

	    @staticmethod
	    def __link(sentences: list[str]) -> str:
	        '''Join the per-image texts with a single space between them.'''
	        return ' '.join(sentences)

	    def run(self) -> str:
	        '''OCR every image, clean each result and return the combined text.

	        Returns:
	            The cleaned texts of all images, in input order, separated by a
	            single space.
	        '''
	        processed = []
	        for img_path in self.img_paths:
	            text = self.__ocr(img_path)
	            text = self.__delete_duplicated_CR(text)
	            text = self.__cancel_hyphenation(text)
	            processed.append(text)

	        return self.__link(processed)