Skip to content

Instantly share code, notes, and snippets.

@kBashar
Created March 4, 2015 08:59
Show Gist options
  • Save kBashar/b435a7cf4442406c776b to your computer and use it in GitHub Desktop.
Save kBashar/b435a7cf4442406c776b to your computer and use it in GitHub Desktop.
This class extracts the stream objects from a PDF document and saves each of them to a file in its unfiltered state.
package com.kbashar.pdfboxtry;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import sun.misc.IOUtils;
import java.io.*;
import java.util.List;
/**
 * Created by kbashar on 3/4/15.
 *
 * <p>Extracts every stream object found in a PDF document and writes the bytes
 * returned by {@code COSStream.getUnfilteredStream()} to individual files named
 * {@code raw_0}, {@code raw_1}, ... (relative paths, so they are resolved
 * against the current working directory).
 */
public class StreamSaver {

    /** Prefix of each output file; the running index of the stream is appended. */
    private static final String RAW_FILE_PREFIX = "raw_";

    /**
     * Parses the given PDF file and saves the content of each stream object
     * to its own file.
     *
     * @param fileName path of the PDF file to read
     * @throws IOException if the PDF cannot be read or parsed, or if an output
     *                     file cannot be written
     */
    public void saveStreams(String fileName) throws IOException {
        // try-with-resources guarantees the input file is released even if parsing fails.
        try (InputStream pdfInput = new FileInputStream(fileName)) {
            PDFParser parser = new PDFParser(pdfInput);
            parser.parse();
            try {
                List<COSObject> objects = parser.getDocument().getObjects();
                int count = 0;
                for (COSObject object : objects) {
                    COSBase baseObject = object.getObject();
                    if (!(baseObject instanceof COSStream)) {
                        continue; // only stream objects carry data worth dumping
                    }
                    COSStream stream = (COSStream) baseObject;

                    // Read the stream content first so that no empty output file
                    // is left behind when reading fails.
                    byte[] bytes;
                    try (InputStream data = stream.getUnfilteredStream()) {
                        // Fully qualified: the file also imports sun.misc.IOUtils.
                        bytes = org.apache.pdfbox.io.IOUtils.toByteArray(data);
                    }

                    try (OutputStream out = new FileOutputStream(RAW_FILE_PREFIX + count++)) {
                        out.write(bytes);
                    }
                }
            } finally {
                // Release the resources held by the parsed document.
                parser.getDocument().close();
            }
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args exactly one element: the path of the PDF file to process
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            // Previously the program exited silently; tell the user what is expected.
            System.err.println("Usage: StreamSaver <pdf-file>");
            return;
        }
        try {
            new StreamSaver().saveStreams(args[0]);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment