Last active
May 3, 2025 01:04
-
-
Save Luxter77/06c6183d596b6a9d631a43f8a1b8233d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from genericpath import isfile | |
from pdf2image import convert_from_path, pdfinfo_from_path | |
import collections, sys, os | |
from tqdm.auto import tqdm, trange | |
from glob import glob | |
try: | |
from PIL import Image | |
except ImportError: | |
import Image | |
import pytesseract | |
# Ask for the input path. Judging by the glob() fallback used later for the
# page count, this may be a PDF file or a glob pattern. Ctrl-C and Ctrl-D
# (EOF) both exit quietly instead of printing a traceback.
try:
    pdffile = input("Pdf file to scan: ")
except (KeyboardInterrupt, EOFError):
    sys.exit()

# Derive the output name from the *extension* only. Replacing every "pdf"
# substring of the lowercased path (as was done before) also rewrote
# directory names and changed the case of the whole path.
_base, _ext = os.path.splitext(pdffile)
if _ext.lower() == '.pdf':
    outf = _base + '.txt'
else:
    # Always append the suffix so the output can never be the input file
    # itself (which would be truncated by the "w" open below).
    outf = pdffile + '.txt'
# Glob wildcards are not valid/desirable in a real file name.
outf = outf.replace('*', '')

# Kept open at module level: print_pages() and print_images() write to it.
out = open(outf, "w", encoding='utf-8')
from scipy.ndimage import interpolation as inter | |
import numpy as np | |
import cv2 | |
# Work out how many pages there are. The PDF metadata is tried first; if that
# fails for any reason the input is treated as a glob pattern and the number
# of matching paths is used instead.
try:
    _pdf_info = pdfinfo_from_path(pdffile, userpw=None, poppler_path=None)
    n_pages = _pdf_info['Pages']
except Exception:
    n_pages = len(glob(pdffile))

# Short banner summarising the run.
for _label, _value in (
    ('Total pages:', n_pages),
    ('In file:', pdffile),
    ('Out file:', outf),
):
    print(_label, _value)
def delineate(s: str) -> str:
    """Re-flow hard-wrapped text into paragraphs.

    Lines inside a paragraph are joined with a single space. A line that
    ends with ``-`` or ``—`` has that character removed and is glued
    directly to the next line (undoing end-of-line hyphenation). One or
    more blank lines end the current paragraph; paragraphs are separated
    by ``'\\n\\n\\n'`` in the result.

    Args:
        s: Text with ``\\n`` or ``\\r\\n`` line endings.

    Returns:
        The re-flowed text, without leading or trailing whitespace.
    """
    paragraphs = []
    current = []  # pieces of the paragraph being assembled

    def flush():
        # Close the paragraph in progress, if it has any content.
        joined = ''.join(current).strip()
        if joined:
            paragraphs.append(joined)
        current.clear()

    for line in s.replace('\r\n', '\n').split('\n'):
        text = line.strip()
        if not text:
            # Blank line: paragraph boundary. This must skip the joining
            # logic below, otherwise the break is lost again.
            flush()
            continue
        if text.endswith(('-', '—')):
            # Hyphenated line end: drop the dash, add no separating space.
            current.append(text[:-1])
        else:
            current.append(text + ' ')
    flush()

    return '\n\n\n'.join(paragraphs)
def correct_skew(image, delta=1, limit=5):
    """Estimate the skew angle of *image* and return a rotated copy.

    Every angle from ``-limit`` to ``limit`` in steps of ``delta`` is tried
    on a binarised version of the image. Each candidate is scored by the sum
    of squared differences between adjacent row sums of the rotated binary
    image; the highest-scoring angle wins.

    Args:
        image: Array accepted by ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``
            (presumably a 3-channel BGR image -- confirm against callers).
        delta: Step between candidate angles.
        limit: Largest absolute angle tried.

    Returns:
        ``(best_angle, corrected)``: the chosen angle and *image* rotated by
        it about its centre, same size, with replicated borders.
    """
    def determine_score(arr, angle):
        # Rotate without resizing, then compare neighbouring row sums.
        rotated = inter.rotate(arr, angle, reshape=False, order=0)
        row_sums = np.sum(rotated, axis=1, dtype=float)
        row_deltas = row_sums[1:] - row_sums[:-1]
        return row_sums, np.sum(row_deltas ** 2, dtype=float)

    # Inverted Otsu binarisation of the grayscale image.
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    candidates = np.arange(-limit, limit + delta, delta)
    scores = [determine_score(binary, candidate)[1] for candidate in candidates]
    best_angle = candidates[scores.index(max(scores))]

    # Rotate the original (not the binarised) image about its centre.
    height, width = image.shape[:2]
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), best_angle, 1.0)
    corrected = cv2.warpAffine(
        image,
        rotation,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )
    return best_angle, corrected
def print_pages(pdf_file):
    """OCR every page of *pdf_file*, deskewing each page first.

    Pages are rendered one at a time, straightened with ``correct_skew``,
    passed to Tesseract, and the recognised text is written to the
    module-level ``out`` file and echoed via ``tqdm.write``.

    Uses the module-level ``n_pages`` as the page count, so it is only
    correct when *pdf_file* is the file that count was computed for.

    Args:
        pdf_file: Path of the PDF to process.
    """
    for page_n in trange(1, n_pages + 1, 1, desc=f'Processing book: {pdf_file}'):
        (page,) = convert_from_path(pdf_file, first_page=page_n, last_page=page_n)
        # convert_from_path yields PIL images, but correct_skew uses cv2 calls
        # and `.shape`, which need a NumPy array in OpenCV's BGR channel order.
        bgr = cv2.cvtColor(np.asarray(page.convert('RGB')), cv2.COLOR_RGB2BGR)
        _, deskewed = correct_skew(bgr)
        # Hand the OCR engine RGB data again.
        text = pytesseract.image_to_string(cv2.cvtColor(deskewed, cv2.COLOR_BGR2RGB))
        out.write(text)
        tqdm.write(text)
def print_images(first_page=1, last_page=603):
    """OCR numbered page images ``<n>.jpg`` from the current directory.

    For each ``n`` in ``first_page..last_page`` (inclusive) whose file
    ``n.jpg`` exists, the recognised text followed by a ``-- page n --``
    marker is written to the module-level ``out`` file and echoed via
    ``tqdm.write``. Missing page numbers are skipped silently.

    Args:
        first_page: First page number to look for.
        last_page: Last page number to look for (inclusive). The default
            matches the previously hard-coded range.
    """
    for page_n in range(first_page, last_page + 1):
        image_path = f'{page_n}.jpg'
        if not isfile(image_path):
            continue
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
        d = text + f'\n\n-- page {page_n} --\n\n'
        out.write(d)
        tqdm.write(d)
# Script entry: OCR the numbered page images, then make sure the output file
# is flushed and closed even if OCR fails part-way through.
try:
    print_images()
finally:
    out.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment