Luxter77 · May 3, 2025 01:04
diff --git a/ruido_de_mate.py b/ruido_de_mate.py
 #!/usr/bin/env python3

 from genericpath import isfile
 from pdf2image import convert_from_path, pdfinfo_from_path
 import collections, sys, os


 from tqdm.auto import tqdm, trange
 from glob import glob

 try:
    from PIL import Image
 except ImportError:
    import Image
 import pytesseract

 # Prompt for the input path.  Exit quietly when the prompt is interrupted
 # (Ctrl-C) or stdin is closed (EOF) instead of dumping a traceback.
 try:
     pdffile = input("Pdf file to scan: ")
 except (KeyboardInterrupt, EOFError):
     sys.exit()

 # Derive the output name from the file extension only.  Lower-casing the
 # whole path and replacing every 'pdf' substring also rewrote directory
 # names (e.g. 'pdfs/Book.pdf' -> 'txts/book.txt'), which could point the
 # output at a directory that does not exist.
 if pdffile.lower().endswith('.pdf'):
     # '*' is dropped so a glob pattern does not leak into the file name.
     outf = (pdffile[:-len('.pdf')] + '.txt').replace('*', '')
 else:
     outf = pdffile + ('' if 'txt' in pdffile else '.txt')
 out = open(outf, "w", encoding='utf-8')

 # ``out`` is used by the functions below for the whole run, so it cannot
 # live in a ``with`` block here; make sure it is flushed and closed at exit.
 import atexit
 atexit.register(out.close)

 from scipy.ndimage import interpolation as inter

 import numpy as np
 import cv2

 # Read the page count from the PDF metadata.  If that raises for any
 # reason, treat the input as a glob pattern and count the matching files.
 try:
     n_pages = pdfinfo_from_path(pdffile, userpw=None, poppler_path=None)['Pages']
 except Exception:
     n_pages = len(glob(pdffile))

 # Echo the run configuration before any OCR work starts.
 print(f'Total pages: {n_pages}')
 print(f'In file: {pdffile}')
 print(f'Out file: {outf}')

 def delineate(s: str) -> str:
     """Re-flow hard-wrapped text into paragraphs.

     Consecutive non-blank lines are joined with a single space.  A line
     ending in ``-`` or ``—`` is treated as a word split across lines: the
     trailing dash is dropped and the next line is appended without a space.
     A run of blank (whitespace-only) lines between two pieces of text
     becomes one paragraph break (three newlines); leading and trailing
     blank lines are ignored.

     Args:
         s: Text using LF or CRLF line endings.

     Returns:
         The re-flowed text.  When the last non-blank line is an ordinary
         (non-dash) line, the result ends with a single space.
     """
     text = ''
     pending_break = False
     for line in s.replace('\r\n', '\n').split('\n'):
         if not line.strip():
             # Only remember the break here.  Emitting it immediately and
             # falling through to the joining code would strip it again.
             pending_break = bool(text)
             continue
         if pending_break:
             text = text.rstrip() + '\n\n\n'
             pending_break = False
         if line.endswith(('-', '—')):
             # Hyphenated line break: glue the word halves together.
             text = (text + line[:-1]).strip()
         else:
             text = (text + line).strip() + ' '
     return text

 def correct_skew(image, delta=1, limit=5):
     """Estimate the skew of a page image and rotate the image to undo it.

     The image is converted to grayscale and binarised (inverted Otsu
     threshold).  Each candidate angle in
     ``np.arange(-limit, limit + delta, delta)`` is scored by rotating the
     binary image and summing the squared differences between adjacent row
     sums; the highest-scoring angle wins.

     Args:
         image: Array accepted by ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``
             (presumably a BGR ndarray -- confirm against callers).
         delta: Step between candidate angles.
         limit: Largest absolute candidate angle.

     Returns:
         ``(best_angle, corrected)`` where ``corrected`` is ``image`` rotated
         by ``best_angle`` about its centre, same width and height.
     """
     def row_profile_score(binary, angle):
         # Nearest-neighbour rotation (order=0) without changing the shape.
         rotated = inter.rotate(binary, angle, reshape=False, order=0)
         row_sums = np.sum(rotated, axis=1, dtype=float)
         return np.sum(np.diff(row_sums) ** 2, dtype=float)

     grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
     _, binary = cv2.threshold(
         grayscale, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
     )

     candidates = np.arange(-limit, limit + delta, delta)
     scores = [row_profile_score(binary, angle) for angle in candidates]
     # argmax returns the first maximum, like list.index(max(...)).
     best_angle = candidates[np.argmax(scores)]

     height, width = image.shape[:2]
     rotation = cv2.getRotationMatrix2D((width // 2, height // 2), best_angle, 1.0)
     corrected = cv2.warpAffine(
         image,
         rotation,
         (width, height),
         flags=cv2.INTER_CUBIC,
         borderMode=cv2.BORDER_REPLICATE,
     )

     return best_angle, corrected

 def print_pages(pdf_file):
     """OCR every page of a PDF, writing the text to ``out`` and the console.

     Pages are rasterised one at a time (one ``convert_from_path`` call per
     page), deskewed with ``correct_skew`` and passed to Tesseract.  Relies
     on the module-level ``n_pages`` and ``out``.

     Args:
         pdf_file: Path of the PDF to process.
     """
     # Label the progress bar with the file actually being processed (the
     # parameter), not the module-level prompt value.
     for page_n in trange(1, n_pages + 1, 1, desc=f'Processing book: {pdf_file}'):
         (page,) = convert_from_path(pdf_file, first_page=page_n, last_page=page_n)
         # pdf2image yields PIL images, but correct_skew needs a NumPy array
         # (it calls cv2 functions and reads ``image.shape``), so convert
         # first.  The array keeps PIL's channel order, and the deskewed
         # result is handed to pytesseract in that same order.
         _, deskewed = correct_skew(np.array(page))
         d = pytesseract.image_to_string(deskewed)
         out.write(d)
         tqdm.write(d)

 def print_images(first_page=1, last_page=603):
     """OCR numbered JPEG pages found in the current directory.

     For every n from ``first_page`` to ``last_page`` (inclusive) the file
     ``<n>.jpg`` is OCR'd if it exists; missing numbers are skipped.  The
     recognised text, followed by a ``-- page n --`` marker, is written to
     the module-level ``out`` file and echoed through ``tqdm.write``.

     Args:
         first_page: First page number to look for.
         last_page: Last page number to look for.  The defaults reproduce
             the previously hard-coded range 1..603.
     """
     for page_n in range(first_page, last_page + 1):
         page_path = f'{page_n}.jpg'
         if not isfile(page_path):
             continue
         with Image.open(page_path) as image:
             d = (pytesseract.image_to_string(image)) + f'\n\n-- page {page_n} --\n\n'
             out.write(d)
             tqdm.write(d)

 # Script entry point: OCR the numbered JPEG pages.  print_pages() is defined
 # above but is not invoked here.
 print_images()
	#!/usr/bin/env python3

	from genericpath import isfile
	from pdf2image import convert_from_path, pdfinfo_from_path
	import collections, sys, os


	from tqdm.auto import tqdm, trange
	from glob import glob

	try:
	from PIL import Image
	except ImportError:
	import Image
	import pytesseract

	# Prompt for the input path.  Exit quietly when the prompt is interrupted
	# (Ctrl-C) or stdin is closed (EOF) instead of dumping a traceback.
	try:
	    pdffile = input("Pdf file to scan: ")
	except (KeyboardInterrupt, EOFError):
	    sys.exit()

	# Derive the output name from the file extension only.  Lower-casing the
	# whole path and replacing every 'pdf' substring also rewrote directory
	# names (e.g. 'pdfs/Book.pdf' -> 'txts/book.txt'), which could point the
	# output at a directory that does not exist.
	if pdffile.lower().endswith('.pdf'):
	    # '*' is dropped so a glob pattern does not leak into the file name.
	    outf = (pdffile[:-len('.pdf')] + '.txt').replace('*', '')
	else:
	    outf = pdffile + ('' if 'txt' in pdffile else '.txt')
	out = open(outf, "w", encoding='utf-8')

	# ``out`` is used by the functions below for the whole run, so it cannot
	# live in a ``with`` block here; make sure it is flushed and closed at exit.
	import atexit
	atexit.register(out.close)

	from scipy.ndimage import interpolation as inter

	import numpy as np
	import cv2

	# Read the page count from the PDF metadata.  If that raises for any
	# reason, treat the input as a glob pattern and count the matching files.
	try:
	    n_pages = pdfinfo_from_path(pdffile, userpw=None, poppler_path=None)['Pages']
	except Exception:
	    n_pages = len(glob(pdffile))

	# Echo the run configuration before any OCR work starts.
	print(f'Total pages: {n_pages}')
	print(f'In file: {pdffile}')
	print(f'Out file: {outf}')

	def delineate(s: str) -> str:
	    """Re-flow hard-wrapped text into paragraphs.

	    Consecutive non-blank lines are joined with a single space.  A line
	    ending in ``-`` or ``—`` is treated as a word split across lines: the
	    trailing dash is dropped and the next line is appended without a space.
	    A run of blank (whitespace-only) lines between two pieces of text
	    becomes one paragraph break (three newlines); leading and trailing
	    blank lines are ignored.

	    Args:
	        s: Text using LF or CRLF line endings.

	    Returns:
	        The re-flowed text.  When the last non-blank line is an ordinary
	        (non-dash) line, the result ends with a single space.
	    """
	    text = ''
	    pending_break = False
	    for line in s.replace('\r\n', '\n').split('\n'):
	        if not line.strip():
	            # Only remember the break here.  Emitting it immediately and
	            # falling through to the joining code would strip it again.
	            pending_break = bool(text)
	            continue
	        if pending_break:
	            text = text.rstrip() + '\n\n\n'
	            pending_break = False
	        if line.endswith(('-', '—')):
	            # Hyphenated line break: glue the word halves together.
	            text = (text + line[:-1]).strip()
	        else:
	            text = (text + line).strip() + ' '
	    return text

	def correct_skew(image, delta=1, limit=5):
	    """Estimate the skew of a page image and rotate the image to undo it.

	    The image is converted to grayscale and binarised (inverted Otsu
	    threshold).  Each candidate angle in
	    ``np.arange(-limit, limit + delta, delta)`` is scored by rotating the
	    binary image and summing the squared differences between adjacent row
	    sums; the highest-scoring angle wins.

	    Args:
	        image: Array accepted by ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``
	            (presumably a BGR ndarray -- confirm against callers).
	        delta: Step between candidate angles.
	        limit: Largest absolute candidate angle.

	    Returns:
	        ``(best_angle, corrected)`` where ``corrected`` is ``image`` rotated
	        by ``best_angle`` about its centre, same width and height.
	    """
	    def row_profile_score(binary, angle):
	        # Nearest-neighbour rotation (order=0) without changing the shape.
	        rotated = inter.rotate(binary, angle, reshape=False, order=0)
	        row_sums = np.sum(rotated, axis=1, dtype=float)
	        return np.sum(np.diff(row_sums) ** 2, dtype=float)

	    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
	    _, binary = cv2.threshold(
	        grayscale, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
	    )

	    candidates = np.arange(-limit, limit + delta, delta)
	    scores = [row_profile_score(binary, angle) for angle in candidates]
	    # argmax returns the first maximum, like list.index(max(...)).
	    best_angle = candidates[np.argmax(scores)]

	    height, width = image.shape[:2]
	    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), best_angle, 1.0)
	    corrected = cv2.warpAffine(
	        image,
	        rotation,
	        (width, height),
	        flags=cv2.INTER_CUBIC,
	        borderMode=cv2.BORDER_REPLICATE,
	    )

	    return best_angle, corrected

	def print_pages(pdf_file):
	    """OCR every page of a PDF, writing the text to ``out`` and the console.

	    Pages are rasterised one at a time (one ``convert_from_path`` call per
	    page), deskewed with ``correct_skew`` and passed to Tesseract.  Relies
	    on the module-level ``n_pages`` and ``out``.

	    Args:
	        pdf_file: Path of the PDF to process.
	    """
	    # Label the progress bar with the file actually being processed (the
	    # parameter), not the module-level prompt value.
	    for page_n in trange(1, n_pages + 1, 1, desc=f'Processing book: {pdf_file}'):
	        (page,) = convert_from_path(pdf_file, first_page=page_n, last_page=page_n)
	        # pdf2image yields PIL images, but correct_skew needs a NumPy array
	        # (it calls cv2 functions and reads ``image.shape``), so convert
	        # first.  The array keeps PIL's channel order, and the deskewed
	        # result is handed to pytesseract in that same order.
	        _, deskewed = correct_skew(np.array(page))
	        d = pytesseract.image_to_string(deskewed)
	        out.write(d)
	        tqdm.write(d)

	def print_images(first_page=1, last_page=603):
	    """OCR numbered JPEG pages found in the current directory.

	    For every n from ``first_page`` to ``last_page`` (inclusive) the file
	    ``<n>.jpg`` is OCR'd if it exists; missing numbers are skipped.  The
	    recognised text, followed by a ``-- page n --`` marker, is written to
	    the module-level ``out`` file and echoed through ``tqdm.write``.

	    Args:
	        first_page: First page number to look for.
	        last_page: Last page number to look for.  The defaults reproduce
	            the previously hard-coded range 1..603.
	    """
	    for page_n in range(first_page, last_page + 1):
	        page_path = f'{page_n}.jpg'
	        if not isfile(page_path):
	            continue
	        with Image.open(page_path) as image:
	            d = (pytesseract.image_to_string(image)) + f'\n\n-- page {page_n} --\n\n'
	            out.write(d)
	            tqdm.write(d)

	# Script entry point: OCR the numbered JPEG pages.  print_pages() is defined
	# above but is not invoked here.
	print_images()