Skip to content

Instantly share code, notes, and snippets.

@mnyrop
Created November 8, 2017 18:48
Show Gist options
  • Save mnyrop/f2d5be09c421260782ea5d5b06750401 to your computer and use it in GitHub Desktop.
Save mnyrop/f2d5be09c421260782ea5d5b06750401 to your computer and use it in GitHub Desktop.
from os import path
from glob import glob
from tqdm import tqdm
import time
from pyPdf import PdfFileWriter, PdfFileReader
def find_ext(dr, ext):
    """Return the paths of files in directory *dr* whose extension is *ext*.

    Args:
        dr: Directory to search (not recursive).
        ext: File extension without the leading dot, e.g. ``"pdf"``.

    Returns:
        A list of matching paths, each prefixed with *dr*, in sorted order.
        The list is empty when nothing matches.
    """
    pattern = path.join(dr, "*.{}".format(ext))
    # glob() yields matches in arbitrary (filesystem-dependent) order; sort so
    # that callers process files in a reproducible sequence.
    return sorted(glob(pattern))
# Collect every PDF that sits directly inside the "pdfs" directory.
files = find_ext(dr="pdfs", ext="pdf")
def split(pdfs):
    """Split each PDF in *pdfs* into one single-page PDF per page.

    Every page is written to ``split-pdfs/<stem>-p<i>.pdf``, where ``<stem>``
    is the input file name without its directory or ``.pdf`` extension and
    ``<i>`` is the zero-based page index. The output directory is created if
    it does not exist yet. Progress is reported through ``tqdm``.

    Args:
        pdfs: Iterable of paths to the PDF files to split.
    """
    import os  # local import: the module itself only imports ``os.path``

    out_dir = "split-pdfs"
    if not path.isdir(out_dir):
        os.makedirs(out_dir)

    for item in tqdm(pdfs):
        # Derive the stem from the path components instead of str.replace(),
        # which would also strip "pdfs/" or ".pdf" occurring mid-name.
        stem = path.splitext(path.basename(item))[0]

        # Keep the source open while pages are written (the reader pulls page
        # data from this stream), and close it deterministically afterwards.
        with open(item, "rb") as source:
            inputpdf = PdfFileReader(source)
            # range() instead of xrange() so the loop runs on Python 2 and 3.
            for i in range(inputpdf.numPages):
                output = PdfFileWriter()
                output.addPage(inputpdf.getPage(i))
                outpath = path.join(out_dir, "{}-p{}.pdf".format(stem, i))
                with open(outpath, "wb") as outputStream:
                    output.write(outputStream)

        # NOTE(review): the original indentation was lost; the pause is
        # assumed to run once per input PDF -- confirm.
        time.sleep(.1)
# OCR every .tif file in the current directory with tesseract, collect the
# resulting .txt files in ${TXT_DIR}, then move that directory one level up.
TXT_DIR="texts"

# -p: no error if the directory already exists.
mkdir -p "${TXT_DIR}"

for TIF in *.tif; do
    # When nothing matches, the glob stays as the literal "*.tif"; skip it.
    [ -e "${TIF}" ] || continue

    TXT_NAME=$(basename "${TIF}" .tif)
    TXT_FILE="${TXT_NAME}.txt"

    # tesseract appends ".txt" to the output base name it is given.
    # Quote every expansion so names containing spaces survive, and only
    # report success when both the OCR run and the move succeeded.
    if tesseract "${TIF}" "${TXT_NAME}" && mv "${TXT_FILE}" "${TXT_DIR}"; then
        echo "Converted ${TXT_NAME}"
    else
        echo "Failed to convert ${TIF}" >&2
    fi
done

mv "${TXT_DIR}" ..
# Render every .pdf file in the current directory to a TIFF (Ghostscript
# "tiffg4" device), collect the results in ${TIF_DIR}, then move that
# directory one level up.
TIF_DIR="tif"

# -p: no error if the directory already exists.
mkdir -p "${TIF_DIR}"

for PAGE in *.pdf; do
    # When nothing matches, the glob stays as the literal "*.pdf"; skip it.
    [ -e "${PAGE}" ] || continue

    TIF_NAME="$(basename "${PAGE}" .pdf).tif"

    # Quote every expansion so names containing spaces survive, and only
    # report success when both the conversion and the move succeeded.
    if gs -q -dNOPAUSE -sDEVICE=tiffg4 -sOutputFile="${TIF_NAME}" "${PAGE}" -c quit \
        && mv "${TIF_NAME}" "${TIF_DIR}"; then
        echo "Converted ${TIF_NAME}"
    else
        echo "Failed to convert ${PAGE}" >&2
    fi
done

mv "${TIF_DIR}" ..
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment