Created
February 24, 2015 04:02
-
-
Save kBashar/f5148b4d791e90db1ee4 to your computer and use it in GitHub Desktop.
This class extracts all images from a PDF file and saves them in JPEG format using PDFBox 1.8.8.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.kbashar.pdfboxtryout; | |
import org.apache.pdfbox.cos.COSBase; | |
import org.apache.pdfbox.cos.COSName; | |
import org.apache.pdfbox.pdmodel.PDDocument; | |
import org.apache.pdfbox.pdmodel.PDPage; | |
import org.apache.pdfbox.pdmodel.graphics.xobject.PDJpeg; | |
import org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap; | |
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject; | |
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage; | |
import javax.imageio.ImageIO; | |
import java.io.FileOutputStream; | |
import java.io.IOException; | |
import java.util.Iterator; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.Set; | |
/** | |
* Created by kbashar on 2/24/15. | |
*/ | |
/** | |
* This class extracts all images from a PDF file and save them in JPEG format using PDFBOX 1.8.8 | |
*/ | |
public class ImageExtractor { | |
public static void main(String[] args) throws IOException { | |
PDDocument document = null; | |
if (args.length!=1) { | |
System.out.println("Please provide a PDF file name"); | |
return; | |
} | |
try { | |
document = PDDocument.load(args[0]); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
return; | |
} | |
//get all pages from a PDF document | |
List<PDPage> pages = document.getDocumentCatalog().getAllPages(); | |
int pageCount = 0; | |
for (PDPage page:pages) { | |
pageCount++; | |
Map<String, PDXObject> xobj = null; | |
//get the XObject from a page's resource dictionary | |
xobj = page.getResources().getXObjects(); | |
int imageCount =0; | |
for (Map.Entry<String,PDXObject> entry:xobj.entrySet()) { | |
PDXObject obj = entry.getValue(); | |
//XObject can be of various type like image, form. But we here only need Images. | |
if (obj instanceof PDXObjectImage) { | |
imageCount++; | |
String imageName = "page_"+pageCount+"image_"+imageCount+".jpeg"; | |
PDXObjectImage image = (PDXObjectImage) obj; | |
// this method might throw an exception javax.imageio.IIOException in OPEN JDK | |
ImageIO.write( | |
image.getRGBImage(), | |
"JPEG", | |
new FileOutputStream(imageName) | |
); | |
} | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How to extract images in the proper order (left-to-right, top-to-bottom)?