Last active
August 29, 2015 14:10
-
-
Save bdkosher/baad345f0756a30f2156 to your computer and use it in GitHub Desktop.
Crude Java class (depends on pdfbox-1.8.5 and commons-logging) that recursively scans directories and reports, for each file, whether it is a PDF that contains text, a PDF that does not contain text, or a non-PDF file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package prpsutil; | |
import java.io.*; | |
import java.util.*; | |
import org.apache.pdfbox.pdmodel.*; | |
import org.apache.pdfbox.pdmodel.font.*; | |
public class PDFScanner { | |
private static final byte[] expectedMagicNumber = "%PDF".getBytes(); | |
private static final FileFilter FILES = new FileFilter() { | |
@Override | |
public boolean accept(File file) { | |
return file.isFile(); | |
} | |
}; | |
private static final FileFilter DIRECTORIES = new FileFilter() { | |
@Override | |
public boolean accept(File file) { | |
return file.isDirectory(); | |
} | |
}; | |
private static class PdfMeta { | |
public int pages; | |
public boolean textual; | |
public long fileSize; | |
} | |
public static void main(String[] args) { | |
if (args.length < 1) { | |
throw new IllegalArgumentException("Usage: [input dir]"); | |
} | |
String inDir = args[0]; | |
File dir = new File(inDir); | |
if (dir.isFile()) { | |
analyzeFile(dir); | |
} else { | |
analyzeDirectory(dir); | |
} | |
} | |
static void analyzeDirectory(File dir) { | |
File[] files = dir.listFiles(FILES); | |
if (files != null) { | |
for (File file : files) { | |
analyzeFile(file); | |
} | |
} | |
File[] dirs = dir.listFiles(DIRECTORIES); | |
if (dirs != null) { | |
for (File dirFile : dirs) { | |
analyzeDirectory(dirFile); | |
} | |
} | |
} | |
static void analyzeFile(File file) { | |
try { | |
if (isPdf(file)) { | |
PdfMeta meta = analyzePdf(file); | |
if (!meta.textual) { | |
System.out.print(file.getAbsolutePath() + ",IMAGEPDF,"); | |
} else { | |
System.out.print(file.getAbsolutePath() + ",TEXTPDF,"); | |
} | |
System.out.println(meta.pages + "," + meta.fileSize); | |
} else { | |
System.out.println(file.getAbsolutePath() + ",NONPDF,,"); | |
} | |
} catch (Exception e) { | |
System.out.print(file.getAbsolutePath() + ",ERROR,,"); | |
} | |
} | |
static boolean isPdf(File file) throws IOException { | |
byte[] actualMagicNumber = new byte[expectedMagicNumber.length]; | |
InputStream in = null; | |
try { | |
in = new FileInputStream(file); | |
in.read(actualMagicNumber); | |
for (int i = 0; i < expectedMagicNumber.length; ++i) { | |
if (actualMagicNumber[i] != expectedMagicNumber[i]) { | |
return false; | |
} | |
} | |
return true; | |
} finally { | |
try { | |
in.close(); | |
} catch (IOException e) { | |
// ignore | |
} | |
} | |
} | |
static PdfMeta analyzePdf(File pdfFile) throws Exception { | |
PdfMeta meta = new PdfMeta(); | |
PDDocument doc = PDDocument.load(pdfFile); | |
try { | |
List<PDPage> pages = doc.getDocumentCatalog().getAllPages(); | |
meta.textual = isTextual(pages); | |
meta.pages = pages.size(); | |
meta.fileSize = pdfFile.length(); | |
return meta; | |
} finally { | |
doc.close(); | |
} | |
} | |
static boolean isTextual(List<PDPage> pages) { | |
if (pages != null) { | |
for (PDPage page : pages) { | |
Map<String, PDFont> pageFonts = page.getResources().getFonts(); | |
if (!pageFonts.isEmpty()) { | |
return true; | |
} | |
} | |
} | |
return false; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment