Created
October 7, 2020 16:32
-
-
Save jangxx/bd9256009b6698f1550fb7034003f877 to your computer and use it in GitHub Desktop.
Extract all the layers of a PDF file into their own files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Split a layered PDF into one single-page PDF per optional content group.

Usage: extract_layers.py <input.pdf> <output_template>

Each page's content stream is scanned repeatedly.  Every pass keeps the
commands of one not-yet-extracted "/OC" marked-content section, strips the
painting operators from all other "/OC" sections (their styling operators
are preserved), and saves the result under the output template with
``{num}`` replaced by a running layer number.
"""

import sys

import pikepdf

if len(sys.argv) < 3:
    print("Usage: extract_layers.py <input.pdf> <output_template>")
    print()
    print("Output template has to contain {num} somewhere, which will be replaced by the layer number.")
    sys.exit(0)

# check if we even have some OCGs
with pikepdf.open(sys.argv[1]) as pdf:
    try:
        # Only probing for existence; the value itself is not used.
        _ = pdf.root.OCProperties.OCGs
    except (AttributeError, KeyError):
        print("Unable to locate layers in PDF.")
        sys.exit(1)

    page_count = len(pdf.pages)

# (hopefully) all pdf operators which "display" anything. everything else is
# styling, which we need to preserve.
# NOTE: "f" and "F" must be separate entries; without the comma Python
# concatenates the adjacent literals into the single string "fF".
hidden_operators = frozenset([
    "S", "s", "f", "F", "f*", "B", "B*", "b", "b*", "n", "Do", "sh",
    "Tj", "TJ", "m", "l", "c", "v", "y", "h", "re",
])

# Tag operand that introduces an optional-content marked section.  Compared
# as a pikepdf.Name because operands are pikepdf objects, which newer pikepdf
# releases no longer match against a plain "/OC" string.
oc_tag = pikepdf.Name("/OC")

extracted_groups = []
cur_layer = 0

for i in range(page_count):
    # Keep re-scanning this page until a pass finds no new layer to extract.
    while True:
        commands = []
        extract_commands = True
        extracted_one = False

        # Re-open the input for every pass: the document is reduced to a
        # single page and its content stream is rewritten before saving.
        with pikepdf.open(sys.argv[1]) as pdf:
            page = pdf.pages[i]

            # Drop every page except page i: pages before it are removed from
            # the front, pages after it from the slot right behind it.
            for j in range(len(pdf.pages)):
                if i < j:
                    del pdf.pages[1]
                elif i > j:
                    del pdf.pages[0]

            for operands, operator in pikepdf.parse_content_stream(page):
                if oc_tag in operands:  # new OCG starts
                    ocg_name = operands[1]

                    # Only the first not-yet-extracted group of this pass is
                    # kept; every other group has its painting ops stripped.
                    if ocg_name not in extracted_groups and not extracted_one:
                        extracted_groups.append(ocg_name)
                        extract_commands = True
                        extracted_one = True
                    else:
                        extract_commands = False

                if str(operator) == "EMC":  # OCG has ended
                    # Any EMC re-enables extraction; nesting is not tracked.
                    extract_commands = True
                    continue

                # Inside a skipped group only non-painting (styling)
                # operators are carried over.
                if extract_commands or str(operator) not in hidden_operators:
                    commands.append((operands, operator))

            if not extracted_one:
                # No new layer on this page; move on to the next page.
                break

            page.Contents = pdf.make_stream(pikepdf.unparse_content_stream(commands))
            pdf.save(sys.argv[2].format(num=cur_layer))

        cur_layer += 1
In case anyone else comes across this in future, the fix for the line:
if "/OC" in operands: # new OCG starts
is to change it to:
if pikepdf.Name("/OC") in operands: # new OCG starts
I did try to install an older version of pikepdf with `pip install pikepdf==2.16.1`,
but that failed when it tried to build qpdf, which is the C++ library that pikepdf wraps, as far as I can tell.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for the reply, will give that a shot. I wasn't sure how easy it was to install old versions of libraries with Python; in past experience with other languages, I've sometimes found it simpler just to fix the code.