Skip to content

Instantly share code, notes, and snippets.

@Luxter77
Last active May 3, 2025 01:04
Show Gist options
  • Save Luxter77/06c6183d596b6a9d631a43f8a1b8233d to your computer and use it in GitHub Desktop.
Save Luxter77/06c6183d596b6a9d631a43f8a1b8233d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from genericpath import isfile
from pdf2image import convert_from_path, pdfinfo_from_path
import collections, sys, os
from tqdm.auto import tqdm, trange
from glob import glob
try:
from PIL import Image
except ImportError:
import Image
import pytesseract
# Ask for the input path; Ctrl-C at the prompt exits quietly instead of
# dumping a traceback.
try:
    pdffile = input("Pdf file to scan: ")
except KeyboardInterrupt:
    sys.exit()

# Derive the text output path from the input path.  Only a trailing ".pdf"
# extension is swapped for ".txt"; the rest of the path keeps its case and any
# "pdf" substrings in directory names.  Any other input (for example a glob
# pattern such as "*.jpg", or a name that already ends in ".txt") gets ".txt"
# appended, so the output path can never be the input file itself.
_root, _ext = os.path.splitext(pdffile)
outf = (_root if _ext.lower() == '.pdf' else pdffile) + '.txt'
# Glob wildcards are not usable in a real file name.
outf = outf.replace('*', '')

# Shared output handle used by the OCR functions defined later in this file;
# it stays open for the lifetime of the script.
out = open(outf, "w", encoding='utf-8')
from scipy.ndimage import interpolation as inter
import numpy as np
import cv2
# Work out how many pages there are.  A readable PDF reports its own page
# count; if pdfinfo fails for any reason, the input is treated as a glob
# pattern and the number of matching files is used instead.
try:
    pdf_info = pdfinfo_from_path(pdffile, userpw=None, poppler_path=None)
    n_pages = pdf_info['Pages']
except Exception:
    matching_files = glob(pdffile)
    n_pages = len(matching_files)

# Echo the run parameters before any OCR work starts.
print('Total pages:', n_pages)
print('In file:', pdffile)
print('Out file:', outf)
def delineate(s: str) -> str:
    """Re-flow hard-wrapped text into paragraphs.

    Consecutive non-blank lines are joined with a single space.  A line that
    ends in ``-`` or ``—`` is treated as a word split across lines: the dash
    is dropped and the next line is glued on without a space.  One or more
    blank lines between text become a paragraph break (``'\\n\\n\\n'``);
    blank lines before the first or after the last text line are ignored.

    Args:
        s: Text with ``\\n`` or ``\\r\\n`` line endings.

    Returns:
        The re-flowed text.  A final line that is not hyphenated is followed
        by a single trailing space; empty or all-blank input gives ``''``.
    """
    parts = []
    pending_break = False
    for line in s.replace('\r\n', '\n').split('\n'):
        text = line.strip()
        if not text:
            # Remember the break but only emit it once more text follows, so
            # runs of blank lines collapse and nothing trails the last paragraph.
            pending_break = bool(parts)
            continue
        if pending_break:
            # Drop the joining space left after the previous paragraph's last line.
            parts[-1] = parts[-1].rstrip()
            parts.append('\n\n\n')
            pending_break = False
        if text.endswith(('-', '—')):
            parts.append(text[:-1])
        else:
            parts.append(text + ' ')
    return ''.join(parts)
def correct_skew(image, delta=1, limit=5):
    """Estimate and undo a small rotation (skew) of a scanned page.

    Candidate angles from ``-limit`` to ``+limit`` in steps of ``delta`` are
    tried on a binarised copy of the page.  Each candidate is scored by how
    sharply the row-wise pixel sums change from one row to the next; the
    highest-scoring angle is then applied to the original image.

    Args:
        image: NumPy image array, either multi-channel (converted to gray with
            ``cv2.COLOR_BGR2GRAY``) or already single-channel (2-D).
        delta: Step between candidate angles.
        limit: Largest absolute candidate angle.

    Returns:
        ``(best_angle, corrected)``: the chosen angle and the rotated image,
        rendered at the same width and height as the input.
    """
    # Imported from the public namespace; ``scipy.ndimage.interpolation`` is a
    # deprecated alias of it.
    from scipy.ndimage import rotate

    def determine_score(arr, angle):
        # Sum of squared differences between neighbouring row sums.
        rotated = rotate(arr, angle, reshape=False, order=0)
        row_sums = np.sum(rotated, axis=1, dtype=float)
        return np.sum((row_sums[1:] - row_sums[:-1]) ** 2, dtype=float)

    # A 2-D array is already grayscale; the BGR->gray conversion would reject it.
    if image.ndim == 2:
        gray = image
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    angles = np.arange(-limit, limit + delta, delta)
    scores = [determine_score(thresh, angle) for angle in angles]
    # argmax picks the first maximum, i.e. the smallest angle among ties.
    best_angle = angles[int(np.argmax(scores))]

    h, w = image.shape[:2]
    center = (w // 2, h // 2)
    # Hand OpenCV a plain Python float rather than a NumPy scalar.
    M = cv2.getRotationMatrix2D(center, float(best_angle), 1.0)
    corrected = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC,
                               borderMode=cv2.BORDER_REPLICATE)
    return best_angle, corrected
def print_pages(pdf_file):
    """OCR every page of ``pdf_file``, writing the text to ``out`` and the console.

    Pages are rasterised one at a time (``first_page == last_page``), deskewed
    with ``correct_skew`` and then passed to Tesseract.  The number of pages
    comes from the module-level ``n_pages``.

    Args:
        pdf_file: Path of the PDF to read.
    """
    for page_n in trange(1, n_pages + 1, 1, desc=f'Processing book: {pdf_file}'):
        (page,) = convert_from_path(pdf_file, first_page=page_n, last_page=page_n)
        # pdf2image yields PIL images, while correct_skew works on OpenCV-style
        # BGR arrays, so convert before deskewing ...
        bgr = cv2.cvtColor(np.array(page.convert('RGB')), cv2.COLOR_RGB2BGR)
        _, deskewed = correct_skew(bgr)
        # ... and back to RGB, the channel order pytesseract assumes for arrays.
        text = pytesseract.image_to_string(cv2.cvtColor(deskewed, cv2.COLOR_BGR2RGB))
        out.write(text)
        tqdm.write(text)
def print_images(first_page=1, last_page=603):
    """OCR numbered JPEG pages (``1.jpg``, ``2.jpg``, ...) given by relative path.

    Each existing ``<n>.jpg`` in the inclusive range is run through Tesseract;
    the recognised text, followed by a ``-- page <n> --`` marker, is written
    to ``out`` and echoed through ``tqdm.write``.  Missing page numbers are
    skipped silently.

    Args:
        first_page: First page number to look for.
        last_page: Last page number to look for (inclusive).  The defaults
            reproduce the previously hard-coded range of 1 to 603.
    """
    for page_n in range(first_page, last_page + 1):
        image_path = f'{page_n}.jpg'
        if not isfile(image_path):
            continue
        # The image file is closed as soon as Tesseract has read it.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
        text += f'\n\n-- page {page_n} --\n\n'
        out.write(text)
        tqdm.write(text)
# Script entry point: OCR the numbered JPEG pages (1.jpg, 2.jpg, ...) addressed
# by relative path.  print_pages() (the PDF route) is defined above but is not
# called here.
print_images()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment