Skip to content

Instantly share code, notes, and snippets.

@joshdurbin
Created March 26, 2019 15:53
Show Gist options
  • Save joshdurbin/f3d865246d8afac1fcfbce76bd479e4a to your computer and use it in GitHub Desktop.
Save joshdurbin/f3d865246d8afac1fcfbce76bd479e4a to your computer and use it in GitHub Desktop.
PDF searcher
@Grapes([
@Grab(group='org.apache.pdfbox', module='pdfbox', version='2.0.13'),
@Grab(group='org.apache.tika', module='tika-core', version='1.18'),
@Grab(group='me.xdrop', module='fuzzywuzzy', version='1.2.0')
])
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper
import org.apache.tika.Tika
import java.util.concurrent.ConcurrentLinkedQueue
import java.util.concurrent.CountDownLatch
import java.util.concurrent.ThreadPoolExecutor
import java.util.concurrent.TimeUnit
import java.util.concurrent.LinkedBlockingQueue
import java.util.concurrent.ThreadFactory
import java.util.Random
import java.lang.Thread.State
import java.lang.ThreadLocal
import groovy.transform.Canonical
import groovy.transform.CompileStatic
import static groovy.io.FileType.FILES
// Command-line handling: the script expects exactly two positional arguments,
// <directoryToScan> and <searchTerm>. Anything else prints the usage text and exits.
def cli = new CliBuilder(header: 'PDF Text Searcher', usage:'./pdfSearcher <directoryToScan> <searchTerm>', width: 100)
// Declare the help flag so that it is actually recognised and listed in the usage output;
// previously the script tested for a help option that was never defined.
cli.h(longOpt: 'help', 'Show this usage message')
def cliOptions = cli.parse(args)
// CliBuilder.parse returns null for a command line it cannot parse; guard against it
// so we show usage instead of failing with a NullPointerException.
if (cliOptions == null || cliOptions.h || cliOptions.arguments().size() != 2) {
  cli.usage()
  System.exit(0)
}
/**
 * Value object pairing a scanned file with the text extracted from it.
 *
 * {@code @Canonical} supplies the positional constructor (name, lineData) together with
 * equals/hashCode/toString, so the declaration order of the properties matters.
 */
@CompileStatic
@Canonical
class ProcessedFile {

  /** Identifier of the source file (callers in this script pass the file path). */
  String name

  /** Text extracted from the file. */
  String lineData
}
/**
 * Extracts the text of PDF files and publishes it on a queue.
 *
 * Each instance owns its own {@link Tika} detector and {@link PDFTextStripper};
 * the queue receiving the results is supplied by the caller.
 */
@CompileStatic
class TextExtractor {
  /** MIME type a file must be detected as in order to be processed. */
  static String requiredMimeType = 'application/pdf'

  Tika tika = new Tika()
  PDFTextStripper pdfStripper = new PDFTextStripper()

  /** Destination for successfully extracted documents. */
  Queue<ProcessedFile> queue

  /**
   * @param queue queue that receives one {@link ProcessedFile} per extracted PDF
   */
  TextExtractor(Queue<ProcessedFile> queue) {
    this.queue = queue
  }

  /**
   * Detects the MIME type of {@code file} and, if it is a PDF, extracts its text and
   * offers a {@link ProcessedFile} (file path + text) to the queue.
   *
   * Files of any other type are ignored. Any exception is reported on stdout and
   * swallowed so that one bad file does not abort the rest of the scan.
   *
   * @param file the file to inspect
   */
  void processFile(File file) {
    try {
      if (tika.detect(file) == requiredMimeType) {
        PDDocument document = PDDocument.load(file)
        try {
          String text = pdfStripper.getText(document)
          queue.offer(new ProcessedFile(file.path, text))
        } finally {
          // Always release the document, even when text extraction fails;
          // otherwise a malformed PDF leaks the loaded document's resources.
          document.close()
        }
      }
    } catch (Exception e) {
      println "Error processing ${file.path}, ${e}"
    }
  }
}
/**
 * Worker thread that exposes a per-thread {@link TextExtractor} through the static
 * {@link #extractor} thread-local, so tasks can reuse one extractor per thread.
 */
@CompileStatic
class ReusableThread extends Thread {
  /**
   * Per-thread extractor. A thread that was never given one explicitly falls back to an
   * extractor writing to a fresh, private queue (its results are not visible elsewhere).
   */
  static ThreadLocal<TextExtractor> extractor = ThreadLocal.withInitial({
    new TextExtractor(new ConcurrentLinkedQueue())
  })

  /** Shared result queue that this thread's extractor must publish to. */
  private final Queue<ProcessedFile> queue

  /**
   * @param runnable work to execute on this thread
   * @param queue    shared queue that receives the extracted documents
   */
  ReusableThread(Runnable runnable, Queue<ProcessedFile> queue) {
    super(runnable)
    this.queue = queue
    // The constructor executes on the *creating* thread, so this binding applies to that
    // thread, not to the new worker. It is kept deliberately: a thread that creates workers
    // and later runs tasks itself still gets an extractor that targets the shared queue.
    ReusableThread.extractor.set(new TextExtractor(queue))
  }

  /**
   * Binds an extractor for the shared queue to this thread, then runs the task.
   *
   * The binding has to happen here rather than only in the constructor: a ThreadLocal
   * value is per calling thread, and only code inside run() executes on the new thread.
   * Without it the worker would use the fallback extractor and its results would be lost.
   */
  @Override
  void run() {
    extractor.set(new TextExtractor(queue))
    super.run()
  }
}
/**
 * {@link ThreadFactory} producing {@link ReusableThread} instances that all share
 * a single result queue.
 */
@CompileStatic
class ReusableThreadFactory implements ThreadFactory {

  /** Queue handed to every thread this factory creates. */
  Queue<ProcessedFile> queue

  /**
   * @param queue result queue passed on to each new thread
   */
  ReusableThreadFactory(Queue<ProcessedFile> queue) {
    this.queue = queue
  }

  /**
   * Wraps the given task in a new {@link ReusableThread} tied to this factory's queue.
   *
   * @param runnable task the new thread will execute
   * @return the newly constructed (not yet started) thread
   */
  @Override
  Thread newThread(Runnable runnable) {
    Thread worker = new ReusableThread(runnable, queue)
    return worker
  }
}
// Results produced by the extraction tasks; consumed by the main thread below.
Queue<ProcessedFile> processedFiles = new ConcurrentLinkedQueue()

// The loader thread walks the directory tree and submits one extraction task per file.
// It stays alive until every submitted task has finished, so that "loader terminated"
// really means "no more results will arrive".
Thread loaderThread = Thread.start {
  def executor = new ThreadPoolExecutor(10, 25, 5, TimeUnit.SECONDS, new LinkedBlockingQueue(100), new ReusableThreadFactory(processedFiles), new ThreadPoolExecutor.CallerRunsPolicy())
  try {
    new File(cliOptions.arguments().first()).eachFileRecurse(FILES) { file ->
      executor.submit {
        ReusableThread.extractor.get().processFile(file)
      }
    }
  } finally {
    // Shut down even if the directory walk fails, otherwise the pool's threads are
    // never released. Then wait for queued and in-flight tasks to complete:
    // shutdown() alone returns immediately while work is still pending.
    executor.shutdown()
    executor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS)
  }
}

String searchTerm = (cliOptions.arguments().last() as String).toLowerCase()
println "Searching pdfs for term '${searchTerm}'..."

// NOTE(review): searchTerm is only echoed above; every extracted line is printed
// unfiltered. Presumably matching against searchTerm was intended -- confirm before changing.
def printLines = { ProcessedFile result ->
  result.lineData.eachLine { String line ->
    println line
  }
}

// Print results as they arrive while the loader (and therefore the pool) is still working.
while (loaderThread.getState() != State.TERMINATED) {
  ProcessedFile processedFile = processedFiles.poll()
  if (processedFile) {
    printLines(processedFile)
  } else {
    // Nothing available yet: back off briefly instead of spinning on the queue.
    Thread.sleep(50)
  }
}
loaderThread.join()

// Drain whatever was enqueued between the last poll and the loader terminating.
ProcessedFile remaining
while ((remaining = processedFiles.poll()) != null) {
  printLines(remaining)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment