-
-
Save fractaledmind/7538f031ebf1eaf1367a to your computer and use it in GitHub Desktop.
Provides access to the common `xpdf` CLI utilities and adds the ability to determine whether a PDF has been OCRd. Built for an Alfred Workflow.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# encoding: utf-8 | |
# | |
# Copyright (c) 2014 Stephen Margheim <[email protected]> | |
# | |
# MIT Licence. See http://opensource.org/licenses/MIT | |
# | |
# Created on 12-08-2014 | |
# | |
import os | |
import re | |
import sys | |
import subprocess | |
# SET THIS TO THE DIRECTORY THAT HOLDS YOUR `xpdf` CLI UTILITIES
UTILS = '/Users/smargheim/Dropbox/Alfred.alfredpreferences/workflows/user.workflow.0B84B188-4850-403C-BC75-FFC12ADBFC45/bin64'


def utilities(directory=None):
    """Return a dictionary of paths to the PDF utilities under a directory.

    The directory tree is walked recursively. Each file found is keyed by
    its name with a leading `pdf` removed, so `pdftotext` is stored under
    `totext` and `pdfinfo` under `info`. Files whose names do not start
    with `pdf` are keyed by their unchanged name.

    :param directory: root directory to search; defaults to `UTILS`.
    :returns: dictionary mapping short utility names to full file paths.
        Empty when the directory does not exist or holds no files.
    """
    if directory is None:
        directory = UTILS
    found = {}
    for dirpath, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            # Strip only the tool-name prefix; a blanket `replace` would
            # also eat a `pdf` occurring later in the file name.
            if filename.startswith('pdf'):
                key = filename[3:]
            else:
                key = filename
            found[key] = os.path.join(dirpath, filename)
    return found
class PDF(object):
    """Represents a Portable Document Format document.

    Wraps the command-line tools located by `utilities()` to expose a PDF's
    metadata, plain text, HTML conversion and font listing, and uses them to
    estimate whether the document has been OCRd (i.e. carries real text).
    """

    def __init__(self, pdf):
        """Store the path `pdf` and verify that it exists.

        :raises RuntimeError: if `pdf` does not exist on disk.
        """
        self.pdf = pdf
        self.check()

    @staticmethod
    def _run(args):
        """Run the command `args` and return its standard output as text.

        On Python 3 `check_output` returns bytes, which the parsers below
        cannot split or classify, so the bytes are decoded (undecodable
        bytes are replaced rather than raising). On Python 2 the output is
        already `str` and is returned untouched.

        :raises subprocess.CalledProcessError: if the command exits non-zero.
        """
        raw = subprocess.check_output(args)
        if isinstance(raw, str):
            return raw
        return raw.decode('utf-8', 'replace')

    @staticmethod
    def _has_letters(text):
        """Return True if `text` contains at least one alphabetic character."""
        return any(char.isalpha() for char in text)

    @property
    def info(self):
        """Extract the metadata for the given `pdf`.

        Returns metainfo in a dictionary.

        This property parses the text output that looks like this:
            Title:          PUBLIC MEETING AGENDA
            Author:         Customer Support
            Creator:        Microsoft Word 2010
            Producer:       Microsoft Word 2010
            CreationDate:   Thu Dec 20 14:44:56 2012
            ModDate:        Thu Dec 20 14:44:56 2012
            Tagged:         yes
            Pages:          2
            Encrypted:      no
            Page size:      612 x 792 pts (letter)
            File size:      104739 bytes
            Optimized:      no
            PDF version:    1.5
        """
        cmd = utilities()['info']
        return self._parse_info(self._run([cmd, self.pdf]))

    @staticmethod
    def _parse_info(text):
        """Parse `Key: value` lines into a dictionary.

        Each line is split on its first colon only, so values that contain
        colons (dates, for example) stay intact. Lines without a colon are
        skipped instead of aborting the whole parse.
        """
        output = {}
        for line in text.splitlines():
            key, separator, val = line.partition(':')
            if not separator:
                continue
            output[key.strip()] = val.strip()
        return output

    @property
    def text(self):
        """Extract the plain text (if available) of the given `pdf`.

        Returns the extracted text with form feeds removed, or `None` when
        the extraction command fails or its output contains no letters.
        """
        cmd = utilities()['totext']
        try:
            extracted = self._run([cmd, '-q', self.pdf, '-'])
        except subprocess.CalledProcessError:
            return None
        # Output consisting only of form feeds / whitespace / symbols means
        # there is no usable text layer.
        if self._has_letters(extracted):
            return extracted.replace('\x0c', '')
        return None

    @property
    def html(self):
        """Run the HTML conversion tool on the given `pdf`.

        Returns `False` when the tool exits with an error. On success the
        PDF's base name (file name without directory or extension) is
        returned.

        NOTE(review): the previous code assigned a result only in the error
        branch, so a successful conversion raised `UnboundLocalError`. The
        base name was computed but never used; it is returned here on the
        assumption that the tool names its output after the PDF. Confirm
        against the installed `pdftohtml` and adjust the return value.
        """
        cmd = utilities()['tohtml']
        pdfname = os.path.basename(os.path.splitext(self.pdf)[0])
        try:
            subprocess.check_output([cmd, '-q', self.pdf])
        except subprocess.CalledProcessError:
            return False
        return pdfname

    @property
    def fonts(self):
        """Extract the font information (if available) of the given `pdf`.

        Returns a list with one dictionary per font, keyed by the column
        names of the header line.

        This property parses text output that looks like this:
            name                 type         emb sub uni object ID
            -------------------- ------------ --- --- --- ---------
            HJNFLI+AdvP5D8B      Type 1C      yes yes no     126  0
            HJNDDC+Advpn800d     Type 1C      yes yes no     130  0
            HJNGMO+AdvLogo       Type 1C      yes yes no     133  0
        """
        cmd = utilities()['fonts']
        return self._parse_fonts(self._run([cmd, self.pdf]))

    @staticmethod
    def _parse_fonts(text):
        """Parse a font table into a list of `{column: value}` dictionaries.

        The first non-blank line supplies the column names; blank lines and
        the dashed separator line are ignored.
        """
        keys = None
        rows = []
        for line in text.splitlines():
            if not line.strip():
                continue
            if keys is None:
                # get names of columns as `keys`
                keys = line.split()
                continue
            if '-----' in line:
                # ignore separator line
                continue
            rows.append(dict(zip(keys, PDF._split_font_row(line))))
        return rows

    @staticmethod
    def _split_font_row(line):
        """Split one font table row into its column values.

        Columns are normally separated by runs of three or more spaces,
        which keeps multi-word values such as `Type 1C` together. When a
        value fills its column the gap shrinks below three spaces and that
        split yields too few pieces (indexing them used to raise
        `IndexError`). In that case the row is read from the right instead:
        the last five tokens are the flag and object columns, the first
        token is the name, and whatever lies between is the type.
        """
        parts = re.split(r'\s{3,}', line.strip())
        if len(parts) >= 4:
            return parts[:2] + parts[2].split() + parts[3].split()
        tokens = line.split()
        if len(tokens) < 6:
            # Too short to identify columns; pair values positionally.
            return tokens
        return [tokens[0], ' '.join(tokens[1:-5])] + tokens[-5:]

    def is_ocrd(self):
        """Check if given `pdf` is OCRd.

        Returns `False` when no text can be extracted at all. Otherwise
        every page is checked: a unanimous result is returned as is, and a
        mixed result counts as OCRd when more than 50 percent of the pages
        contain text.
        """
        if not self.text:
            return False
        perc = self.ocrd_all()
        if isinstance(perc, bool):
            return perc
        return perc > 50

    def _page_numbers(self):
        """Return the list of 1-based page numbers of the `pdf`.

        The text extraction tool numbers pages from 1, so the list runs from
        1 to the page count inclusive.
        """
        return list(range(1, int(self.info['Pages']) + 1))

    def ocrd_all(self):
        """Check every page for text.

        Returns the result of `_pages_ocrd` for all pages.
        """
        return self._pages_ocrd(self._page_numbers())

    def ocrd_half(self):
        """Check every other page (starting with the first) for text.

        Returns the result of `_pages_ocrd` for those pages.
        """
        return self._pages_ocrd(self._page_numbers()[::2])

    def _pages_ocrd(self, pages):
        """Check whether given pages of `pdf` have text.

        :param pages: iterable of 1-based page numbers.
        :returns: `True` or `False` when all pages agree (and `False` for
            no pages); `False` when pages disagree but the PDF lists no
            fonts; otherwise the percentage of pages that contain text.
        """
        cmd = utilities()['totext']
        pages = list(pages)
        results = []
        for page in pages:
            # Extract text of a single PDF page
            try:
                page_text = self._run([cmd, '-q',
                                       '-f', str(page), '-l', str(page),
                                       self.pdf, '-'])
            except subprocess.CalledProcessError:
                # A failed extraction counts as a page without text.
                page_text = ''
            results.append(self._has_letters(page_text))
        if not results:
            return False
        if len(set(results)) == 1:
            return results[0]
        if not self.fonts:
            return False
        return self.percentage(results.count(True), len(pages))

    def check(self):
        """Ensure input pdf path is valid.

        :raises RuntimeError: if `self.pdf` does not exist.
        """
        if not os.path.exists(self.pdf):
            raise RuntimeError('Provided input file not found: %s' % self.pdf)

    @staticmethod
    def percentage(part, whole):
        """Return `part` as a percentage of `whole`, rounded to two decimals.

        Returns 0 when `whole` is 0 to avoid dividing by zero.
        """
        if whole == 0:
            return 0
        perc = 100 * float(part) / float(whole)
        return float("{0:.2f}".format(perc))
def main():
    """Report whether the PDF named on the command line has been OCRd.

    Reads the PDF path from the first command-line argument and prints the
    result of `PDF.is_ocrd()` for it.

    :raises SystemExit: with a usage message when no path is supplied.
    """
    if len(sys.argv) < 2:
        # Exit with a readable message instead of a bare IndexError.
        raise SystemExit('usage: %s <pdf>' % sys.argv[0])
    # Parenthesised so the call is valid on both Python 2 and Python 3.
    print(PDF(sys.argv[1]).is_ocrd())


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment