Created
January 13, 2010 10:36
-
-
Save mhl/276099 to your computer and use it in GitHub Desktop.
Split a PDF into pages with graphics and those without
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python2.5

# This script takes a PDF file and splits it into two PDF files: one
# containing every page that has graphics, and one containing all the
# remaining pages.  Pages can be selected by whether they contain
# "/Subtype /Image" (bitmaps in my case) or "/XObject", which seems to
# catch both bitmaps and included PDF files (which is what most diagrams
# exported to PDF are in my case).
import glob
import os
import re
import shutil
import subprocess
import sys
import tempfile
from optparse import OptionParser
# Save the original working directory; the script later changes into a
# temporary directory and needs to be able to change back.
owd = os.getcwd()

parser = OptionParser("Usage: %prog [options] <INPUT-PDF> <OUTPUT-WITH-IMAGES-PDF> <OUTPUT-WITH-TEXT-PDF>")
parser.add_option('-i', '--images', dest='images', action="store_true",
                  default=False, help='include pages with images')
parser.add_option('-x', '--xobjects', dest='xobjects', action="store_true",
                  default=False, help='include pages with XObjects')
parser.add_option('-v', '--verbose', dest='verbose', action="store_true",
                  default=False, help='print progress messages')

(options, args) = parser.parse_args()

# At least one selection criterion is required, otherwise every page would
# be classified as "text".
if not (options.images or options.xobjects):
    print("You must specify one or more of --images or --xobjects")
    sys.exit(1)

# Exactly three positional arguments: the input and the two outputs.
if len(args) != 3:
    parser.print_help()
    sys.exit(1)

# Resolve all paths to absolute ones now, while still in the original
# working directory.
input_filename = os.path.realpath(args[0])
output_filename_images = os.path.realpath(args[1])
output_filename_text = os.path.realpath(args[2])

if options.verbose:
    print("Creating temporary directory...")

# tempfile.mkdtemp() signals failure by raising, not by returning a false
# value, so failure has to be caught as an exception.
try:
    temporary_directory = tempfile.mkdtemp()
except (IOError, OSError):
    print("Creating a temporary directory failed")
    sys.exit(1)
# A method to get the page number from a filename, or return -1 if malformed:
def extract_page_number_from_filename(s):
    """Return the first run of decimal digits found in *s* as an int.

    Returns -1 when *s* contains no digits at all.
    """
    # Raw string so that "\d" reaches the regex engine untouched.
    m = re.search(r"(\d+)", s)
    if m is None:
        return -1
    # Explicit base 10 so leading zeros are never given another meaning.
    return int(m.group(1), 10)
def _write_selected_pages(page_numbers, description, output_filename, failure_exit_code):
    """Concatenate the given pages of the input PDF into output_filename.

    page_numbers      -- list of page numbers to extract, in output order
    description       -- "images" or "text"; only used in messages
    output_filename   -- path of the PDF to create
    failure_exit_code -- exit status to use if pdftk reports failure

    Nothing is written when page_numbers is empty: pdftk's "cat" with no
    page ranges selects *every* page, which would put the whole document
    into an output that should have been empty.
    """
    if not page_numbers:
        print("No pages with %s were found; not creating %s" % (description, output_filename))
        return
    command = ["pdftk", input_filename, "cat"]
    command += [str(n) for n in page_numbers]
    command += ["output", output_filename]
    if options.verbose:
        print("Concatenating pages with %s to: %s" % (description, output_filename))
    if subprocess.call(command) != 0:
        print("Extracting the pages with %s failed; the command was:" % (description,))
        print(" " + " ".join(command))
        sys.exit(failure_exit_code)


try:

    # (It doesn't seem possible to control where pdftk dumps the
    # doc_data.txt file, so change to the temporary directory.)
    os.chdir(temporary_directory)

    if options.verbose:
        print("Splitting the PDF (%s) into pages..." % (input_filename,))
    result = subprocess.call(
        ["pdftk", input_filename, "burst", "output", "page_%09d.pdf"])
    if result != 0:
        print("Splitting the PDF (%s) into pages with pdftk failed" % (input_filename,))
        sys.exit(3)

    # The output of pdftk is read from a pipe as a byte string, so the
    # markers are encoded to match it before the substring tests below.
    image_marker = '/Subtype /Image'.encode('ascii')
    xobject_marker = '/XObject'.encode('ascii')

    text_pages = []
    image_pages = []

    # Process the single-page files in page order.
    page_filenames = glob.glob("page_*.pdf")
    page_filenames.sort(key=extract_page_number_from_filename)

    for page_filename in page_filenames:
        page_number = extract_page_number_from_filename(page_filename)
        if options.verbose:
            print("Examining page %d (%s)" % (page_number, page_filename))
        # Now extract the uncompressed version of that single page, so
        # that the markers can be found by a plain substring search:
        uncompressed = subprocess.Popen(
            ["pdftk", page_filename, "output", "-", "uncompress"],
            stdout=subprocess.PIPE).communicate()[0]
        has_image = options.images and image_marker in uncompressed
        has_xobject = options.xobjects and xobject_marker in uncompressed
        if has_image or has_xobject:
            image_pages.append(page_number)
        else:
            text_pages.append(page_number)

    # So now call pdftk twice, once to extract the image pages, and
    # once to extract the text pages:
    _write_selected_pages(image_pages, "images", output_filename_images, 4)
    _write_selected_pages(text_pages, "text", output_filename_text, 5)

finally:
    # Runs on success, on sys.exit() and on any other exception: go back
    # to where we started and remove the scratch files.
    os.chdir(owd)
    if options.verbose:
        print("Removing the temporary directory: %s" % (temporary_directory,))
    shutil.rmtree(temporary_directory, ignore_errors=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment