Skip to content

Instantly share code, notes, and snippets.

@XBigTK13X
Last active October 29, 2024 18:07
Show Gist options
  • Save XBigTK13X/4796a0ca7f16e83438914384a57dc46b to your computer and use it in GitHub Desktop.
Save XBigTK13X/4796a0ca7f16e83438914384a57dc46b to your computer and use it in GitHub Desktop.
Extracts images from a PDF and attempts to compose any matching image masks.
#! /usr/bin/python3
# This script requires pdfimages (poppler-utils) and convert (imagemagick)
# Raw images will be written to <OUTPUT_DIR>/15-organized
# Attempts at merging masks and images will be output to <OUTPUT_DIR>/30-masked
# A sample of one image using all compose methods will be written to <OUTPUT_DIR>/25-samples
# Rewritten from https://gist.github.com/bendavis78/ed22a974c2b4534305eabb2522956359
import os
import sys
import subprocess
import shutil
# When True, log() prints nothing.
QUIET = False
# Compose modes handed to `convert -compose` for every image/mask pair.
COMPOSITIONS = ["CopyOpacity"]
def log(message):
    """Print *message* unless the module-level QUIET flag is set."""
    if QUIET:
        return
    print(message)
# Command line:
#   <script> INPUT_PDF OUTPUT_DIR [COMPOSITIONS] [SAMPLE_IMAGE_NUM] [QUIET]
#
# A fifth argument (any value) switches on quiet mode. It is checked first so
# that the informational messages below are already suppressed.
if len(sys.argv) >= 6:
    QUIET = True

if len(sys.argv) < 2:
    print("An input PDF file is required")
    sys.exit(1)
if len(sys.argv) < 3:
    print("An output directory is required")
    sys.exit(1)

# COMPOSITIONS is a comma separated list of compose modes. Leaving it out, or
# passing "all", keeps the default list defined at the top of the script.
if len(sys.argv) < 4 or sys.argv[3] == "all":
    log("Will only attempt CopyOpacity composition")
else:
    log(f'Will attempt [{sys.argv[3]}] compositions')
    COMPOSITIONS = sys.argv[3].split(',')

# Number of the image whose composed results are also copied as samples.
SAMPLE_IMAGE_NUM = 1
if len(sys.argv) >= 5:
    log(f'Will copy samples using image [{sys.argv[4]}]')
    try:
        SAMPLE_IMAGE_NUM = int(sys.argv[4])
    except ValueError:
        # Report bad input as a usage error instead of a traceback.
        print(f"The sample image number must be an integer, got [{sys.argv[4]}]")
        sys.exit(1)

INPUT_PDF_FILE = sys.argv[1]
OUTPUT_DIR = sys.argv[2]

# Validate the input BEFORE wiping the output directory: a typo in the input
# path must not cost the user the results of a previous run.
if not os.path.isfile(INPUT_PDF_FILE):
    print(f"Input PDF file [{INPUT_PDF_FILE}] does not exist")
    sys.exit(1)

# Refuse to delete the directory that contains the PDF we are about to read.
# os.path.join(..., '') appends the trailing separator so that "/out" does not
# match "/output/file.pdf".
_input_real = os.path.realpath(INPUT_PDF_FILE)
_output_real = os.path.realpath(OUTPUT_DIR)
if _input_real.startswith(os.path.join(_output_real, '')):
    print(f"Input PDF file [{INPUT_PDF_FILE}] is inside the output directory "
          f"[{OUTPUT_DIR}], which would be deleted")
    sys.exit(1)

# Start from a clean output directory on every run.
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)
def execute(command):
    """Run an external command and return its exit status and captured output.

    Args:
        command: Either a shell command line (str), which is run through the
            shell, or a sequence of arguments, which is run directly without
            a shell so that no quoting is required.

    Returns:
        A dict with "result" (the exit status, which is 0 whenever this
        function returns) and "stdout" / "stderr", each a list of the lines
        of the respective stream.

    If the command exits with a non-zero status, the command and its output
    are printed and the script terminates with exit status 1.
    """
    completed = subprocess.run(
        command,
        # A plain string keeps going through the shell; an argument list is
        # executed as-is.
        shell=isinstance(command, str),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # errors='replace' keeps undecodable bytes in tool output from raising
    # UnicodeDecodeError and hiding the real result.
    stdout = completed.stdout.decode('utf-8', errors='replace')
    stderr = completed.stderr.decode('utf-8', errors='replace')
    result = completed.returncode
    if result != 0:
        print("An error occurred while running {}".format(command))
        print("stdout: {}".format(stdout))
        print("stderr: {}".format(stderr))
        sys.exit(1)
    return {
        "result": result,
        "stdout": stdout.split('\n'),
        "stderr": stderr.split('\n'),
    }
# Directory that receives the files written by `pdfimages -png`.
EXTRACT_DIR = os.path.join(OUTPUT_DIR, "10-extract")
if not os.path.exists(EXTRACT_DIR):
    os.makedirs(EXTRACT_DIR, exist_ok=True)

log(f"Extract image data from PDF to [{EXTRACT_DIR}]")
command = f'pdfimages -png "{INPUT_PDF_FILE}" "{EXTRACT_DIR}/image"'
execute(command)

log("Gather extracted image paths")
# Maps image number -> path of the extracted file. The number is the text
# between the first '-' and the following '.' of each file name.
extracted_images = {
    int(file_name.split('-')[1].split('.')[0]): os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk(EXTRACT_DIR)
    for file_name in file_names
}
# Attribute names given, in order, to the whitespace separated tokens of one
# `pdfimages -list` row by PdfImageMetadata.
metadata_parts = [
    'page',
    'num',
    'type',
    'width',
    'height',
    'color',
    'comp',
    'bpc',
    'enc',
    'interop',
    'object',
    'id',
    'x_ppi',
    'y_ppi',
    'size',
    'ratio',
]


class PdfImageMetadata:
    """One row of `pdfimages -list` output, split into named attributes.

    Every whitespace separated token of the row is stored as a string under
    the matching name from metadata_parts; a row with fewer tokens than names
    leaves the trailing attributes unset.

    Two attributes are converted after splitting:
      * num    -- always an int.
      * object -- an int when the token is numeric. Otherwise a string of the
                  form "<token>#<num>", which is unique per image so that such
                  rows never share a key when grouped by `object`.

    Args:
        text: A single data row of the listing.

    Raises:
        AttributeError: if the row is too short to contain `num` or `object`.
        ValueError: if the `num` token is not an integer.
    """

    def __init__(self, text):
        # zip() stops at the shorter sequence, so extra tokens are ignored and
        # missing ones simply leave their attribute undefined.
        for name, token in zip(metadata_parts, text.split()):
            setattr(self, name, token)
        self.num = int(self.num)
        try:
            self.object = int(self.object)
        except ValueError:
            # NOTE(review): pdfimages appears to print a placeholder such as
            # "[inline]" instead of an object number for images that are not
            # stand-alone PDF objects -- confirm against the poppler version in
            # use. Such a row has no object number to group by, so give it a
            # key of its own instead of crashing.
            self.object = f"{self.object}#{self.num}"
# Maps the `object` value of a listing row to {row type: PdfImageMetadata},
# so an image and the smask listed under the same object end up together.
pdf_objects = {}
log("Parse PDF image metadata")
command = f'pdfimages -list "{INPUT_PDF_FILE}"'
list_results = execute(command)
for line_number, line in enumerate(list_results['stdout'], start=1):
    # The first two lines are the listing's header; lines of two characters
    # or fewer (such as the empty string after the final newline) carry no row.
    if line_number < 3 or len(line) <= 2:
        continue
    image = PdfImageMetadata(line)
    # Only plain images and soft masks are of interest.
    if 'image' not in image.type and 'smask' not in image.type:
        continue
    pdf_objects.setdefault(image.object, {})[image.type] = image

# Output layout below OUTPUT_DIR.
MASKED_DIR = os.path.join(OUTPUT_DIR, '30-masked')
SAMPLE_DIR = os.path.join(OUTPUT_DIR, '25-samples')
ORGANIZE_DIR = os.path.join(OUTPUT_DIR, '15-organized')
RAW_MASK_DIR = os.path.join(ORGANIZE_DIR, 'mask')
RAW_IMAGE_DIR = os.path.join(ORGANIZE_DIR, 'image')
# Create all of them the same way; exist_ok keeps this step repeatable.
for directory in (MASKED_DIR, SAMPLE_DIR, ORGANIZE_DIR, RAW_MASK_DIR, RAW_IMAGE_DIR):
    os.makedirs(directory, exist_ok=True)
def compose(image, mask, destination, mode, prefix):
    """Combine *image* with *mask* through `convert` using compose *mode*.

    The result is written to MASKED_DIR/<prefix>/<mode>/<destination>.png,
    with the number zero padded to five digits. When *destination* equals
    SAMPLE_IMAGE_NUM the result is additionally copied into SAMPLE_DIR as
    <prefix>-<mode>-<destination>.png.

    Args:
        image: Path of the extracted image file.
        mask: Path of the extracted mask file.
        destination: Integer image number used for the output file name.
        mode: Compose mode passed to `convert -compose`.
        prefix: Name of the sub directory (and sample file prefix) to use.
    """
    target_dir = os.path.join(MASKED_DIR, prefix, mode)
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, f'{destination:05d}.png')
    execute(f'convert "{image}" "{mask}" -compose {mode} -composite "{target_path}"')
    if destination != SAMPLE_IMAGE_NUM:
        return
    sample_name = f'{prefix}-{mode}-{destination:05d}.png'
    shutil.copy(target_path, os.path.join(SAMPLE_DIR, sample_name))
log("Merging masked images, copying standalone images")
merged_count = 0
standalone_count = 0

# Pass 1: copy every raw image (and its mask, if any) exactly once. This work
# does not depend on the composition mode, so it is kept out of the mode loop.
# Image/mask pairs are remembered for pass 2.
masked_pairs = []
for entry in pdf_objects.values():
    if 'image' not in entry:
        # Nothing to copy or compose without the image itself.
        continue
    image_num = entry['image'].num
    image = extracted_images[image_num]
    shutil.copy(image, os.path.join(RAW_IMAGE_DIR, f"{image_num}.png"))
    if 'smask' not in entry:
        standalone_count += 1
        continue
    mask_num = entry['smask'].num
    mask = extracted_images[mask_num]
    shutil.copy(mask, os.path.join(RAW_MASK_DIR, f"{mask_num}.png"))
    masked_pairs.append((image, mask, image_num))
    merged_count += 1

# Pass 2: compose every image/mask pair once per requested mode.
for mode_count, mode in enumerate(COMPOSITIONS, start=1):
    log(f"Compose images using mode ({mode_count}/{len(COMPOSITIONS)}) [{mode}]")
    for image, mask, image_num in masked_pairs:
        compose(image, mask, image_num, mode, "image+mask")

log(f"Raw images sorted in [{ORGANIZE_DIR}]")
log(f"{standalone_count} standalone images copied to [{RAW_IMAGE_DIR}]")
log(f"{merged_count} masked images merged in [{len(COMPOSITIONS)}] ways to [{MASKED_DIR}]")
@revilowaldow
Copy link

revilowaldow commented Mar 24, 2024

FYI I've further adapted this to a specific use case:
https://gist.github.com/revilowaldow/2d7a551685c5198fea42b285a2e30223

Thanks for this! Really helpful :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment