mwesten · April 20, 2016 13:53
diff --git a/xpdf.py b/xpdf.py
 #!/usr/bin/python
 # encoding: utf-8
 #
 # Copyright (c) 2014 Stephen Margheim <stephen.margheim@gmail.com>
 #
 # MIT Licence. See http://opensource.org/licenses/MIT
 #
 # Created on 12-08-2014
 #
 import os
 import re
 import sys
 import subprocess

 # SET THIS TO THE DIRECTORY THAT HOLDS YOUR `xpdf` CLI UTILITIES
 UTILS = '/usr/local/bin'


 def utilities():
     """Return a dictionary of paths to the PDF command-line utilities.

     Walks `UTILS` (subdirectories included) and maps a short name to
     the full path of every file found. For files whose name starts
     with ``pdf`` the short name is the file name without that prefix,
     e.g. ``pdfinfo`` -> ``info`` and ``pdftotext`` -> ``totext``. Any
     other file keeps the historical key: its name with every ``pdf``
     removed (``xpdf`` -> ``x``).

     A ``pdf``-prefixed utility always wins a key collision with an
     unrelated file (for example an ``info`` binary that lives next to
     ``pdfinfo``), so lookups no longer depend on directory listing
     order.
     """
     prefixed = {}
     others = {}
     for dirpath, _dirnames, filenames in os.walk(UTILS):
         for filename in filenames:
             path = os.path.join(dirpath, filename)
             if filename.startswith('pdf'):
                 # strip only the leading `pdf`, not later occurrences
                 prefixed[filename[len('pdf'):]] = path
             else:
                 others[filename.replace('pdf', '')] = path
     # apply the prefixed entries last so they override colliding names
     others.update(prefixed)
     return others


 class PDF(object):
     """Represents a Portable Document Format document.

     Wraps the command-line utilities returned by `utilities()` (looked
     up under the keys `info`, `totext`, `tohtml` and `fonts`) to
     inspect the file at `self.pdf`.

     Creating an instance raises `RuntimeError` if the path does not
     exist.
     """

     def __init__(self, pdf):
         self.pdf = pdf
         self.check()

     @staticmethod
     def _run(args):
         """Run the command `args` and return its standard output as text.

         `universal_newlines=True` makes `check_output` return text on
         Python 3 too (it is already `str` on Python 2), so the parsing
         code in this class works on both.

         Raises `subprocess.CalledProcessError` on a non-zero exit status.
         """
         return subprocess.check_output(args, universal_newlines=True)

     @staticmethod
     def _has_letters(text):
         """Return True if `text` contains at least one alphabetic character."""
         return any(c.isalpha() for c in text)

     @staticmethod
     def _parse_info(text):
         """Parse `Key: value` lines of `text` into a dictionary.

         Only the first `:` separates key from value, so values such as
         timestamps keep their own colons. Lines without a `:` (blank
         lines, stray notices) are ignored instead of raising.
         """
         output = {}
         for line in text.splitlines():
             if ':' not in line:
                 continue
             key, val = [x.strip() for x in line.split(':', 1)]
             output[key] = val
         return output

     @staticmethod
     def _parse_fonts(text):
         """Parse the column-formatted font table in `text`.

         The first line supplies the column names; the dashed separator
         and blank lines are skipped. Returns a list with one dictionary
         per font row.
         """
         output = []
         keys = []
         for i, line in enumerate(text.splitlines()):
             if i == 0:
                 # get names of columns as `keys`
                 keys = line.split()
                 continue
             if '-----' in line or not line.strip():
                 # ignore separator and blank lines
                 continue
             # Columns are separated by runs of 3+ spaces. The first two
             # (name, type) may themselves contain single spaces, so they
             # are kept whole; every later group holds single-word cells.
             partial_vals = re.split(r'\s{3,}', line)
             vals = partial_vals[:2]
             for part in partial_vals[2:]:
                 vals.extend(part.split())
             output.append(dict(zip(keys, vals)))
         return output

     @property
     def info(self):
         """Extract the metadata for the given `pdf`.
         Returns metainfo in a dictionary.

         This function parses the text output that looks like this:
             Title:          PUBLIC MEETING AGENDA
             Author:         Customer Support
             Creator:        Microsoft Word 2010
             Producer:       Microsoft Word 2010
             CreationDate:   Thu Dec 20 14:44:56 2012
             ModDate:        Thu Dec 20 14:44:56 2012
             Tagged:         yes
             Pages:          2
             Encrypted:      no
             Page size:      612 x 792 pts (letter)
             File size:      104739 bytes
             Optimized:      no
             PDF version:    1.5

         Raises `subprocess.CalledProcessError` if the utility fails.
         """
         cmd = utilities()['info']
         return self._parse_info(self._run([cmd, self.pdf]))

     @property
     def text(self):
         """Extracts the plain text (if available) of the given `pdf`.

         Returns the text with form feeds removed, or `None` when the
         utility fails or its output contains no alphabetic character.
         """
         cmd = utilities()['totext']
         try:
             cmd_output = self._run([cmd, '-q', self.pdf, '-'])
         except subprocess.CalledProcessError:
             cmd_output = '\x0c'
         # output made only of form feeds / whitespace counts as "no text"
         if self._has_letters(cmd_output):
             return cmd_output.replace('\x0c', '')
         return None

     @property
     def html(self):
         """Extracts the HTML (if available) of the given `pdf`.

         Runs the `tohtml` utility on the file. Returns `False` when the
         utility exits with an error, otherwise the expected name of the
         generated HTML file.
         """
         cmd = utilities()['tohtml']
         pdfname = os.path.splitext(self.pdf)[0]
         pdfname = pdfname.split('/')[-1]
         try:
             self._run([cmd, '-q', self.pdf])
         except subprocess.CalledProcessError:
             return False
         # NOTE(review): the success value used to be unassigned (the
         # property raised UnboundLocalError). `<basename>.html` is an
         # assumption about where the utility writes its output -- confirm
         # against the installed pdftohtml build.
         return pdfname + '.html'

     @property
     def fonts(self):
         """Extracts the font information (if available) of the given `pdf`.

         This function parses text output that looks like this:
         name                            type              emb sub uni object ID
         ------------------------------- ----------------- --- --- --- ---------
         HJNFLI+AdvP5D8B                 Type 1C           yes yes no     126  0
         HJNDDC+Advpn800d                Type 1C           yes yes no     130  0
         HJNGMO+AdvLogo                  Type 1C           yes yes no     133  0
         HJNGEB+Advp404fe                Type 1C           yes yes no      68  0
         HJNFPE+Advpn8010                Type 1C           yes yes no      69  0
         HJNHHI+Advmp13                  Type 1C           yes yes no      71  0
         HKLEGN+Advp404fe                Type 1C           yes yes no      72  0
         HKLEDN+Advpn800d                Type 1C           yes yes no      75  0
         HKLBEE+Advpn8010                Type 1C           yes yes no      92  0
         HKMCLM+Advhg                    Type 1C           yes yes no      88  0

         Returns a list of dictionaries keyed by the column names.
         Raises `subprocess.CalledProcessError` if the utility fails.
         """
         cmd = utilities()['fonts']
         return self._parse_fonts(self._run([cmd, self.pdf]))

     def is_ocrd(self):
         """Check if given `pdf` is OCRd.

         Returns `False` when no text can be extracted at all. Otherwise
         every page is checked: a boolean result from `ocrd_all` is
         returned as is, a percentage counts as OCRd when above 50.
         """
         if not self.text:
             return False
         perc = self.ocrd_all()
         if isinstance(perc, bool):
             return perc
         return perc > 50

     def ocrd_all(self):
         """Check every page for OCR.
         """
         # page numbers handed to the text utility are 1-based
         total_pages = int(self.info['Pages'])
         return self._pages_ocrd(range(1, total_pages + 1))

     def ocrd_half(self):
         """Check every other page (1st, 3rd, ...) for text.
         """
         total_pages = int(self.info['Pages'])
         pages = range(1, total_pages + 1)[::2]
         return self._pages_ocrd(pages)

     def _pages_ocrd(self, pages):
         """Check whether given pages of `pdf` have text.

         `pages` is an iterable of 1-based page numbers. Returns `True`
         or `False` when all pages agree (and `False` for no pages).
         For a mixed result returns `False` if the document reports no
         fonts, else the percentage of pages that contain text.
         """
         cmd = utilities()['totext']
         results = []
         for page in pages:
             # Extract text of this single PDF page
             try:
                 cmd_output = self._run([cmd, '-q',
                                         '-f', str(page), '-l', str(page),
                                         self.pdf, '-'])
             except subprocess.CalledProcessError:
                 cmd_output = '\x0c'
             results.append(self._has_letters(cmd_output))
         if not results:
             return False
         if len(set(results)) == 1:
             return results[0]
         if self.fonts == []:
             return False
         return self.percentage(results.count(True), len(results))

     def check(self):
         """Ensure input pdf path is valid.

         Raises `RuntimeError` if `self.pdf` does not exist.
         """
         if not os.path.exists(self.pdf):
             raise RuntimeError('Provided input file not found: %s' % self.pdf)

     @staticmethod
     def percentage(part, whole):
         """Return `part` as a percentage of `whole`, rounded to two decimals.

         Returns 0 when `whole` is 0 to avoid dividing by zero.
         """
         if whole == 0:
             return 0
         perc = 100 * float(part) / float(whole)
         return float("{0:.2f}".format(perc))


    

 def main():
     """Report whether the PDF named on the command line is OCRd.

     Expects the path to a PDF file as the first command-line argument
     and prints the result of `PDF.is_ocrd()` for it. Exits with a usage
     message when the argument is missing.
     """
     if len(sys.argv) < 2:
         sys.exit('usage: %s <pdf>' % sys.argv[0])
     # the parenthesised form prints identically on Python 2 and Python 3
     print(PDF(sys.argv[1]).is_ocrd())


 if __name__ == '__main__':
     main()
	#!/usr/bin/python
	# encoding: utf-8
	#
	# Copyright (c) 2014 Stephen Margheim <stephen.margheim@gmail.com>
	#
	# MIT Licence. See http://opensource.org/licenses/MIT
	#
	# Created on 12-08-2014
	#
	import os
	import re
	import sys
	import subprocess

	# SET THIS TO THE DIRECTORY THAT HOLDS YOUR `xpdf` CLI UTILITIES
	UTILS = '/usr/local/bin'


	def utilities():
	    """Return a dictionary of paths to the PDF command-line utilities.

	    Walks `UTILS` (subdirectories included) and maps a short name to
	    the full path of every file found. For files whose name starts
	    with ``pdf`` the short name is the file name without that prefix,
	    e.g. ``pdfinfo`` -> ``info`` and ``pdftotext`` -> ``totext``. Any
	    other file keeps the historical key: its name with every ``pdf``
	    removed (``xpdf`` -> ``x``).

	    A ``pdf``-prefixed utility always wins a key collision with an
	    unrelated file (for example an ``info`` binary that lives next to
	    ``pdfinfo``), so lookups no longer depend on directory listing
	    order.
	    """
	    prefixed = {}
	    others = {}
	    for dirpath, _dirnames, filenames in os.walk(UTILS):
	        for filename in filenames:
	            path = os.path.join(dirpath, filename)
	            if filename.startswith('pdf'):
	                # strip only the leading `pdf`, not later occurrences
	                prefixed[filename[len('pdf'):]] = path
	            else:
	                others[filename.replace('pdf', '')] = path
	    # apply the prefixed entries last so they override colliding names
	    others.update(prefixed)
	    return others


	class PDF(object):
	    """Represents a Portable Document Format document.

	    Wraps the command-line utilities returned by `utilities()` (looked
	    up under the keys `info`, `totext`, `tohtml` and `fonts`) to
	    inspect the file at `self.pdf`.

	    Creating an instance raises `RuntimeError` if the path does not
	    exist.
	    """

	    def __init__(self, pdf):
	        self.pdf = pdf
	        self.check()

	    @staticmethod
	    def _run(args):
	        """Run the command `args` and return its standard output as text.

	        `universal_newlines=True` makes `check_output` return text on
	        Python 3 too (it is already `str` on Python 2), so the parsing
	        code in this class works on both.

	        Raises `subprocess.CalledProcessError` on a non-zero exit status.
	        """
	        return subprocess.check_output(args, universal_newlines=True)

	    @staticmethod
	    def _has_letters(text):
	        """Return True if `text` contains at least one alphabetic character."""
	        return any(c.isalpha() for c in text)

	    @staticmethod
	    def _parse_info(text):
	        """Parse `Key: value` lines of `text` into a dictionary.

	        Only the first `:` separates key from value, so values such as
	        timestamps keep their own colons. Lines without a `:` (blank
	        lines, stray notices) are ignored instead of raising.
	        """
	        output = {}
	        for line in text.splitlines():
	            if ':' not in line:
	                continue
	            key, val = [x.strip() for x in line.split(':', 1)]
	            output[key] = val
	        return output

	    @staticmethod
	    def _parse_fonts(text):
	        """Parse the column-formatted font table in `text`.

	        The first line supplies the column names; the dashed separator
	        and blank lines are skipped. Returns a list with one dictionary
	        per font row.
	        """
	        output = []
	        keys = []
	        for i, line in enumerate(text.splitlines()):
	            if i == 0:
	                # get names of columns as `keys`
	                keys = line.split()
	                continue
	            if '-----' in line or not line.strip():
	                # ignore separator and blank lines
	                continue
	            # Columns are separated by runs of 3+ spaces. The first two
	            # (name, type) may themselves contain single spaces, so they
	            # are kept whole; every later group holds single-word cells.
	            partial_vals = re.split(r'\s{3,}', line)
	            vals = partial_vals[:2]
	            for part in partial_vals[2:]:
	                vals.extend(part.split())
	            output.append(dict(zip(keys, vals)))
	        return output

	    @property
	    def info(self):
	        """Extract the metadata for the given `pdf`.
	        Returns metainfo in a dictionary.

	        This function parses the text output that looks like this:
	            Title:          PUBLIC MEETING AGENDA
	            Author:         Customer Support
	            Creator:        Microsoft Word 2010
	            Producer:       Microsoft Word 2010
	            CreationDate:   Thu Dec 20 14:44:56 2012
	            ModDate:        Thu Dec 20 14:44:56 2012
	            Tagged:         yes
	            Pages:          2
	            Encrypted:      no
	            Page size:      612 x 792 pts (letter)
	            File size:      104739 bytes
	            Optimized:      no
	            PDF version:    1.5

	        Raises `subprocess.CalledProcessError` if the utility fails.
	        """
	        cmd = utilities()['info']
	        return self._parse_info(self._run([cmd, self.pdf]))

	    @property
	    def text(self):
	        """Extracts the plain text (if available) of the given `pdf`.

	        Returns the text with form feeds removed, or `None` when the
	        utility fails or its output contains no alphabetic character.
	        """
	        cmd = utilities()['totext']
	        try:
	            cmd_output = self._run([cmd, '-q', self.pdf, '-'])
	        except subprocess.CalledProcessError:
	            cmd_output = '\x0c'
	        # output made only of form feeds / whitespace counts as "no text"
	        if self._has_letters(cmd_output):
	            return cmd_output.replace('\x0c', '')
	        return None

	    @property
	    def html(self):
	        """Extracts the HTML (if available) of the given `pdf`.

	        Runs the `tohtml` utility on the file. Returns `False` when the
	        utility exits with an error, otherwise the expected name of the
	        generated HTML file.
	        """
	        cmd = utilities()['tohtml']
	        pdfname = os.path.splitext(self.pdf)[0]
	        pdfname = pdfname.split('/')[-1]
	        try:
	            self._run([cmd, '-q', self.pdf])
	        except subprocess.CalledProcessError:
	            return False
	        # NOTE(review): the success value used to be unassigned (the
	        # property raised UnboundLocalError). `<basename>.html` is an
	        # assumption about where the utility writes its output -- confirm
	        # against the installed pdftohtml build.
	        return pdfname + '.html'

	    @property
	    def fonts(self):
	        """Extracts the font information (if available) of the given `pdf`.

	        This function parses text output that looks like this:
	        name                            type              emb sub uni object ID
	        ------------------------------- ----------------- --- --- --- ---------
	        HJNFLI+AdvP5D8B                 Type 1C           yes yes no     126  0
	        HJNDDC+Advpn800d                Type 1C           yes yes no     130  0
	        HJNGMO+AdvLogo                  Type 1C           yes yes no     133  0
	        HJNGEB+Advp404fe                Type 1C           yes yes no      68  0
	        HJNFPE+Advpn8010                Type 1C           yes yes no      69  0
	        HJNHHI+Advmp13                  Type 1C           yes yes no      71  0
	        HKLEGN+Advp404fe                Type 1C           yes yes no      72  0
	        HKLEDN+Advpn800d                Type 1C           yes yes no      75  0
	        HKLBEE+Advpn8010                Type 1C           yes yes no      92  0
	        HKMCLM+Advhg                    Type 1C           yes yes no      88  0

	        Returns a list of dictionaries keyed by the column names.
	        Raises `subprocess.CalledProcessError` if the utility fails.
	        """
	        cmd = utilities()['fonts']
	        return self._parse_fonts(self._run([cmd, self.pdf]))

	    def is_ocrd(self):
	        """Check if given `pdf` is OCRd.

	        Returns `False` when no text can be extracted at all. Otherwise
	        every page is checked: a boolean result from `ocrd_all` is
	        returned as is, a percentage counts as OCRd when above 50.
	        """
	        if not self.text:
	            return False
	        perc = self.ocrd_all()
	        if isinstance(perc, bool):
	            return perc
	        return perc > 50

	    def ocrd_all(self):
	        """Check every page for OCR.
	        """
	        # page numbers handed to the text utility are 1-based
	        total_pages = int(self.info['Pages'])
	        return self._pages_ocrd(range(1, total_pages + 1))

	    def ocrd_half(self):
	        """Check every other page (1st, 3rd, ...) for text.
	        """
	        total_pages = int(self.info['Pages'])
	        pages = range(1, total_pages + 1)[::2]
	        return self._pages_ocrd(pages)

	    def _pages_ocrd(self, pages):
	        """Check whether given pages of `pdf` have text.

	        `pages` is an iterable of 1-based page numbers. Returns `True`
	        or `False` when all pages agree (and `False` for no pages).
	        For a mixed result returns `False` if the document reports no
	        fonts, else the percentage of pages that contain text.
	        """
	        cmd = utilities()['totext']
	        results = []
	        for page in pages:
	            # Extract text of this single PDF page
	            try:
	                cmd_output = self._run([cmd, '-q',
	                                        '-f', str(page), '-l', str(page),
	                                        self.pdf, '-'])
	            except subprocess.CalledProcessError:
	                cmd_output = '\x0c'
	            results.append(self._has_letters(cmd_output))
	        if not results:
	            return False
	        if len(set(results)) == 1:
	            return results[0]
	        if self.fonts == []:
	            return False
	        return self.percentage(results.count(True), len(results))

	    def check(self):
	        """Ensure input pdf path is valid.

	        Raises `RuntimeError` if `self.pdf` does not exist.
	        """
	        if not os.path.exists(self.pdf):
	            raise RuntimeError('Provided input file not found: %s' % self.pdf)

	    @staticmethod
	    def percentage(part, whole):
	        """Return `part` as a percentage of `whole`, rounded to two decimals.

	        Returns 0 when `whole` is 0 to avoid dividing by zero.
	        """
	        if whole == 0:
	            return 0
	        perc = 100 * float(part) / float(whole)
	        return float("{0:.2f}".format(perc))




	def main():
	    """Report whether the PDF named on the command line is OCRd.

	    Expects the path to a PDF file as the first command-line argument
	    and prints the result of `PDF.is_ocrd()` for it. Exits with a usage
	    message when the argument is missing.
	    """
	    if len(sys.argv) < 2:
	        sys.exit('usage: %s <pdf>' % sys.argv[0])
	    # the parenthesised form prints identically on Python 2 and Python 3
	    print(PDF(sys.argv[1]).is_ocrd())


	if __name__ == '__main__':
	    main()
No results found