Skip to content

Instantly share code, notes, and snippets.

@marcgeld
Created September 10, 2018 12:49
Show Gist options
  • Select an option

  • Save marcgeld/715be4bead9c6a00dfad7424b070bfbc to your computer and use it in GitHub Desktop.

Select an option

Save marcgeld/715be4bead9c6a00dfad7424b070bfbc to your computer and use it in GitHub Desktop.
Groovy script that extracts images from a PDF file.
#! /usr/bin/env groovy
//@GrabConfig(systemClassLoader=true)
@Grab(group='ch.qos.logback', module='logback-classic', version='1.2.3')
@Grab(group='org.apache.pdfbox', module='pdfbox', version='2.0.11')
@Grab(group='commons-io', module='commons-io', version='2.6')
import org.apache.pdfbox.pdfwriter.*
import org.apache.pdfbox.pdmodel.*
import org.apache.pdfbox.pdmodel.font.*
import org.apache.pdfbox.pdmodel.edit.*
import org.apache.pdfbox.pdmodel.graphics.*
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject
import java.awt.image.BufferedImage
import javax.imageio.ImageIO
import org.apache.pdfbox.cos.COSName
import java.nio.*
// ---------------------------------------------------------------------------
// Script body.
// Parses the command line, loads the given PDF and writes every image XObject
// found in the pages' resources to "<pdf path without extension>-img_NNNN.png".
// ---------------------------------------------------------------------------
def appName = this.getClass().getName()
// Used both as the ImageIO format name and as the output file extension.
def fileType = "png"

// System properties are set before any PDF is loaded.
// NOTE(review): the KCMS provider class presumably only exists on older JDKs — confirm on the target runtime.
System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider")
System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true")

def cli = new CliBuilder(usage:"${appName} --file <file>")
cli.with {
    f(longOpt: 'file', 'filepath', args: 1, required: true)
    h(longOpt: 'help', 'Print help', required: false)
}
def opt = cli.parse(args)
if (!opt) {
    // Nothing usable came back from the parser; stop here.
    return
}
if (opt.h || !opt.f) {
    // Help was requested (or no file was given): show the usage text instead of exiting silently.
    cli.usage()
    return
}

File f = new File(opt.f).getAbsoluteFile()
if (!f.exists() || !f.canRead()) {
    println "Error: file ${f} not found or not readable"
    return
}
println "Processing: file ${f}"

// Output files are written next to the input file: "<input without extension>-img_0000.png", ...
// The static helper is called directly rather than mixed into String's metaclass.
def outFileTemplate = "${org.apache.commons.io.FilenameUtils.removeExtension(f.getPath())}-img_"
// Running index of successfully written images; also drives the zero-padded file suffix.
def totalImages = 0
PDDocument doc = null
try {
    doc = PDDocument.load(f)
    for (PDPage page : doc.getDocumentCatalog().getPages()) {
        PDResources pdResources = page.getResources()
        if (pdResources == null) {
            // A page without a resource dictionary has no XObjects to inspect.
            continue
        }
        for (COSName xObjCosName : pdResources.getXObjectNames()) {
            PDXObject pdxObj = pdResources.getXObject(xObjCosName)
            if (!(pdxObj instanceof PDImageXObject)) {
                // Only image XObjects are exported; every other XObject type is skipped.
                continue
            }
            BufferedImage bufferedImage = ((PDImageXObject) pdxObj).getImage()
            File outFile = new File("${outFileTemplate}${totalImages.toString().padLeft(4, '0')}.${fileType}")
            // ImageIO.write reports "no suitable writer" through its boolean result, not an exception.
            if (ImageIO.write(bufferedImage, fileType, outFile)) {
                println("Created outfile: ${outFile}")
                totalImages++
            }
            else {
                println("Error: no ${fileType} writer could encode image ${xObjCosName.getName()}")
            }
        }
    }
}
finally {
    // Always release the document, even when loading or extraction failed part-way.
    if (doc != null) {
        doc.close()
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment