Skip to content

Instantly share code, notes, and snippets.

@bdkosher
Last active August 29, 2015 14:10
Show Gist options
  • Save bdkosher/baad345f0756a30f2156 to your computer and use it in GitHub Desktop.
Save bdkosher/baad345f0756a30f2156 to your computer and use it in GitHub Desktop.
Crude Java class (depends on pdfbox-1.8.5 and commons-logging) to recursively scan directories of PDF files to determine if they contain text or not (or are non-PDF files)
package prpsutil;
import java.io.*;
import java.util.*;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.pdmodel.font.*;
/**
 * Command-line utility that walks a file or directory tree and prints one
 * comma-separated record per regular file to standard output.
 *
 * <p>Record formats:
 * <ul>
 *   <li>{@code <absolute path>,TEXTPDF,<pages>,<file size>} - PDF with at least one page
 *       that declares fonts</li>
 *   <li>{@code <absolute path>,IMAGEPDF,<pages>,<file size>} - PDF with no page that
 *       declares fonts</li>
 *   <li>{@code <absolute path>,NONPDF,,} - file does not start with the PDF signature</li>
 *   <li>{@code <absolute path>,ERROR,,} - an exception occurred while inspecting the file</li>
 * </ul>
 *
 * <p>Paths are written verbatim; a path containing a comma is not escaped.
 */
public class PDFScanner {

    /**
     * Signature expected at the very start of a PDF file: the ASCII characters
     * {@code %PDF}. Spelled out byte by byte so the value does not depend on the
     * platform default charset.
     */
    private static final byte[] expectedMagicNumber = {'%', 'P', 'D', 'F'};

    /** Accepts regular files only. */
    private static final FileFilter FILES = new FileFilter() {
        @Override
        public boolean accept(File file) {
            return file.isFile();
        }
    };

    /** Accepts directories only. */
    private static final FileFilter DIRECTORIES = new FileFilter() {
        @Override
        public boolean accept(File file) {
            return file.isDirectory();
        }
    };

    /** Facts collected about a single PDF document by {@link #analyzePdf(File)}. */
    private static class PdfMeta {
        /** Number of pages in the document. */
        public int pages;
        /** {@code true} when at least one page declares fonts. */
        public boolean textual;
        /** Size of the file on disk, in bytes. */
        public long fileSize;
    }

    /**
     * Entry point. Analyzes the single path given as the first argument: a regular
     * file is analyzed directly, anything else is treated as a directory and scanned
     * recursively.
     *
     * @param args command-line arguments; {@code args[0]} is the input file or directory
     * @throws IllegalArgumentException if no argument is supplied
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            throw new IllegalArgumentException("Usage: [input dir]");
        }
        String inDir = args[0];
        File dir = new File(inDir);
        if (dir.isFile()) {
            analyzeFile(dir);
        } else {
            analyzeDirectory(dir);
        }
    }

    /**
     * Analyzes every regular file directly inside {@code dir}, then recurses into
     * each of its subdirectories. A path that cannot be listed (for example one that
     * does not exist or is not a directory) produces no output.
     *
     * @param dir directory to scan
     */
    static void analyzeDirectory(File dir) {
        // listFiles returns null when the path is not a listable directory.
        File[] files = dir.listFiles(FILES);
        if (files != null) {
            for (File file : files) {
                analyzeFile(file);
            }
        }
        File[] dirs = dir.listFiles(DIRECTORIES);
        if (dirs != null) {
            for (File dirFile : dirs) {
                analyzeDirectory(dirFile);
            }
        }
    }

    /**
     * Inspects one file and prints exactly one complete record line for it to
     * standard output (see the class description for the formats). Any exception is
     * reported as an {@code ERROR} record rather than propagated, so one bad file
     * does not abort a directory scan; the cause is written to standard error.
     *
     * @param file file to inspect
     */
    static void analyzeFile(File file) {
        String path = file.getAbsolutePath();
        try {
            if (isPdf(file)) {
                PdfMeta meta = analyzePdf(file);
                String type = meta.textual ? "TEXTPDF" : "IMAGEPDF";
                System.out.println(path + "," + type + "," + meta.pages + "," + meta.fileSize);
            } else {
                System.out.println(path + ",NONPDF,,");
            }
        } catch (Exception e) {
            // println, not print: every record must end its own line, otherwise the
            // next file's record is appended to this one.
            System.out.println(path + ",ERROR,,");
            // Keep the diagnostic off stdout so the record stream stays parseable.
            System.err.println(path + ": " + e);
        }
    }

    /**
     * Tells whether a file begins with the PDF signature {@code %PDF}.
     *
     * @param file file to test
     * @return {@code true} if the first bytes of the file equal the signature;
     *         {@code false} otherwise, including when the file is shorter than the signature
     * @throws IOException if the file cannot be opened or read
     */
    static boolean isPdf(File file) throws IOException {
        byte[] actualMagicNumber = new byte[expectedMagicNumber.length];
        // Open outside the try block: if opening fails there is nothing to close and
        // the original exception reaches the caller unmasked.
        InputStream in = new FileInputStream(file);
        try {
            // A single read() may deliver fewer bytes than requested, so keep reading
            // until the buffer is full or the stream ends.
            int filled = 0;
            while (filled < actualMagicNumber.length) {
                int count = in.read(actualMagicNumber, filled, actualMagicNumber.length - filled);
                if (count < 0) {
                    return false; // file is shorter than the signature
                }
                filled += count;
            }
            return Arrays.equals(actualMagicNumber, expectedMagicNumber);
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // A failure to close a stream we only read from does not affect the result.
            }
        }
    }

    /**
     * Loads a PDF document and collects its page count, file size and whether it
     * appears to contain text.
     *
     * @param pdfFile PDF file to load
     * @return the collected facts
     * @throws Exception if the document cannot be loaded, inspected or closed
     */
    static PdfMeta analyzePdf(File pdfFile) throws Exception {
        PdfMeta meta = new PdfMeta();
        PDDocument doc = PDDocument.load(pdfFile);
        try {
            // NOTE(review): in PDFBox 1.8 getAllPages() is declared with a raw List
            // return type - confirm against the library version in use.
            @SuppressWarnings("unchecked")
            List<PDPage> pages = doc.getDocumentCatalog().getAllPages();
            meta.textual = isTextual(pages);
            meta.pages = pages.size();
            meta.fileSize = pdfFile.length();
            return meta;
        } finally {
            doc.close();
        }
    }

    /**
     * Heuristic text detection: a document counts as textual as soon as one page's
     * resources declare at least one font. Pages without resources or without a font
     * map are treated as declaring no fonts.
     *
     * @param pages pages to inspect; may be {@code null}
     * @return {@code true} if any page declares a font, {@code false} otherwise
     */
    static boolean isTextual(List<PDPage> pages) {
        if (pages == null) {
            return false;
        }
        for (PDPage page : pages) {
            // A page is not required to carry its own resources dictionary.
            if (page == null || page.getResources() == null) {
                continue;
            }
            Map<String, PDFont> pageFonts = page.getResources().getFonts();
            if (pageFonts != null && !pageFonts.isEmpty()) {
                return true;
            }
        }
        return false;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment