Last active
March 8, 2017 21:25
-
-
Save J08nY/e965805543a9b2a72672 to your computer and use it in GitHub Desktop.
PDF page counter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from PyPDF2 import PdfFileReader | |
from PyPDF2.utils import PdfReadError | |
import re | |
import argparse | |
from os import walk, path | |
import magic | |
import hashlib | |
class Pdf(object):
    """A PDF file on disk, with a page count and an MD5 digest of its bytes."""

    # Matches a "/Type /Page" dictionary entry while rejecting "/Type /Pages"
    # (the character after "Page" must not be "s", or the line must end).
    # This is a *bytes* pattern because count() reads the file in binary
    # mode; a text pattern would raise TypeError against bytes on Python 3.
    pdf_pages = re.compile(br"/Type\s*/Page([^s]|$)", re.MULTILINE | re.DOTALL)

    def __init__(self, path):
        """Remember ``path`` and hash the file's contents immediately.

        :param path: filesystem path of the PDF; it is opened for reading
            right away by :meth:`rehash`, so it must exist.
        """
        self.path = path
        # Page count cached by __len__; None until first requested.
        self.length = None
        # Hex MD5 digest of the file contents, used by callers to spot
        # duplicate files.
        self.hash = self.rehash()

    def count(self):
        """Return the number of pages in the file.

        The raw bytes are scanned for page-object markers first.  If none
        are found, the file is handed to ``PdfFileReader`` instead
        (presumably for PDFs whose page objects are not stored as plain
        text -- TODO confirm).  A ``PdfReadError`` from that fallback is
        reported as 0 pages.

        The result is not cached here; use ``len(pdf)`` for a cached count.
        """
        with open(self.path, "rb") as f:
            data = f.read()
            pages = len(Pdf.pdf_pages.findall(data))
            if pages == 0:
                # Rewind so the fallback parser sees the file from the start.
                f.seek(0)
                try:
                    pages = PdfFileReader(f).getNumPages()
                except PdfReadError:
                    pages = 0
        return pages

    def __len__(self):
        """Return the page count, computing it once and caching it."""
        if self.length is None:
            self.length = self.count()
        return self.length

    def rehash(self):
        """Return the hex MD5 digest of the file, read in 64 KiB chunks."""
        blocksize = 65536
        md5 = hashlib.md5()
        with open(self.path, "rb") as f:
            # iter() with a sentinel stops at the first empty read (EOF).
            for buf in iter(lambda: f.read(blocksize), b""):
                md5.update(buf)
        return md5.hexdigest()
if __name__ == "__main__":
    # Command-line entry point: print the page count of a single PDF, or of
    # every distinct PDF found in a directory, followed by a grand total.
    parser = argparse.ArgumentParser(description="Counts the number of pages in pdf files in a directory.")
    parser.add_argument("-e", "--exclude", dest="excludes", action="append", help="Directories/files to exclude")
    parser.add_argument("directory", nargs="?", default=".", help="Directory to count")
    parser.add_argument("-r", "--recursive", dest="recursive", action="store_true", help="Recurse into subdirs.")
    args = parser.parse_args()

    if not path.exists(args.directory):
        print("Directory/File doesnt exist.")
        # SystemExit instead of exit(): the latter is only injected by the
        # `site` module and is not guaranteed to exist.
        raise SystemExit(1)

    pages = 0
    # MD5 digests of PDFs already counted; a set gives O(1) duplicate checks.
    seen = set()
    m = magic.open(magic.MAGIC_MIME)
    m.load()

    # Each print below takes one pre-formatted string so the output is the
    # same whether this runs under Python 2 or Python 3.
    if path.isfile(args.directory):
        # The positional argument may also name a single file.
        pdf = Pdf(args.directory)
        pages = pdf.count()
        print("%s %s" % (args.directory, pages))
    else:
        for root, dirs, files in walk(args.directory):
            if args.excludes is not None:
                # Slice-assign so walk() itself skips the excluded directories.
                dirs[:] = [d for d in dirs if d not in args.excludes]
                files[:] = [f for f in files if f not in args.excludes]
            for f in files:
                file_path = path.join(root, f)
                file_type = m.file(file_path)
                # NOTE(review): the libmagic binding can return None when it
                # fails to inspect a file; treat that as "not a PDF" rather
                # than crashing on the membership test.
                if file_type is None or "application/pdf" not in file_type:
                    continue
                pdf = Pdf(file_path)
                if pdf.hash in seen:
                    # Identical content was already counted elsewhere.
                    continue
                seen.add(pdf.hash)
                p = pdf.count()
                print("%s %s" % (f, p))
                pages += p
            if not args.recursive:
                # walk() yields the top directory first; stop after it.
                break
    print("## Total: %s" % pages)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment