Created
September 10, 2018 12:49
-
-
Save marcgeld/715be4bead9c6a00dfad7424b070bfbc to your computer and use it in GitHub Desktop.
Groovy script that extracts images from a PDF file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env groovy | |
| //@GrabConfig(systemClassLoader=true) | |
| @Grab(group='ch.qos.logback', module='logback-classic', version='1.2.3') | |
| @Grab(group='org.apache.pdfbox', module='pdfbox', version='2.0.11') | |
| @Grab(group='commons-io', module='commons-io', version='2.6') | |
| import org.apache.pdfbox.pdfwriter.* | |
| import org.apache.pdfbox.pdmodel.* | |
| import org.apache.pdfbox.pdmodel.font.* | |
| import org.apache.pdfbox.pdmodel.edit.* | |
| import org.apache.pdfbox.pdmodel.graphics.* | |
| import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject | |
| import java.awt.image.BufferedImage | |
| import javax.imageio.ImageIO | |
| import org.apache.pdfbox.cos.COSName | |
| import java.nio.* | |
// Script-wide settings.
// appName is the generated script class name and is shown in the usage line;
// fileType is used both as the ImageIO format name and as the output file extension.
def appName = this.getClass().name
def fileType = "png"

// Color-management switches, set before any PDF image is decoded.
// NOTE(review): these look like the PDFBox 2.0 rendering recommendations for Java 8;
// confirm they are still wanted on the JDK this script runs on.
System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true")
System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider")

// Mix commons-io FilenameUtils into String so a path string can call .removeExtension()
String.metaClass.mixin(org.apache.commons.io.FilenameUtils)
// Command-line handling.
//   -f / --file <path>   PDF file to process (mandatory)
//   -h / --help          print the usage text and exit
def cli = new CliBuilder(usage: "${appName} --file <file>")
cli.with {
    f(longOpt: 'file', 'filepath', args: 1, required: true)
    h(longOpt: 'help', 'Print help', required: false)
}
def opt = cli.parse(args)
if (!opt) {
    // The arguments were rejected by the parser; nothing more to do here.
    return
}
if (opt.h || !opt.f) {
    // Help was requested (or no file value is available): show the usage text
    // instead of exiting silently.
    cli.usage()
    return
}
// Resolve and validate the input file, then derive the prefix for output names.
// doc is declared here so the cleanup code at the end of the script can close it.
PDDocument doc = null
File f = new File(opt.f).getAbsoluteFile()
// isFile() also rejects directories, which would otherwise pass an exists()/canRead()
// check and only fail later while being loaded as a PDF.
if (!f.isFile() || !f.canRead()) {
    println "Error: file ${f} not found or not readable"
    return
}
println "Processing: file ${f}"
// Output files are written next to the input: <input path without extension>-img_NNNN.<type>
def outFileTemplate = "${f.getPath().removeExtension()}-img_"
// Walk every page of the PDF and write each image XObject referenced directly by the
// page's resources to its own file. totalImages counts the files written and supplies
// the zero-padded, four-digit sequence number in each output name.
def totalImages = 0
try {
    doc = PDDocument.load(f)
    PDPageTree pageTree = doc.getDocumentCatalog().getPages()
    for (PDPage page : pageTree) {
        PDResources pdResources = page.getResources()
        if (pdResources == null) {
            // A page without a resource dictionary has no XObjects to inspect.
            continue
        }
        for (COSName xObjCosName : pdResources.getXObjectNames()) {
            PDXObject pdxObj = pdResources.getXObject(xObjCosName)
            if (!(pdxObj instanceof PDImageXObject)) {
                // Only image XObjects are exported; other XObject kinds are skipped.
                continue
            }
            BufferedImage bufferedImage = ((PDImageXObject) pdxObj).getImage()
            File outFile = new File("${outFileTemplate}${totalImages.toString().padLeft(4, '0')}.${fileType}")
            // ImageIO.write returns false when no writer for fileType can handle the image,
            // so success is only reported (and the counter advanced) when it returns true.
            if (ImageIO.write(bufferedImage, fileType, outFile)) {
                println("Created outfile: ${outFile}")
                totalImages++
            } else {
                println("Error: no ${fileType} writer available for image ${xObjCosName.getName()}, skipped")
            }
        }
    }
}
finally {
    // Always release the document, even when loading or extraction fails.
    if (doc != null) {
        doc.close()
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment