Created
March 26, 2019 15:53
-
-
Save joshdurbin/f3d865246d8afac1fcfbce76bd479e4a to your computer and use it in GitHub Desktop.
PDF searcher
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@Grapes([ | |
@Grab(group='org.apache.pdfbox', module='pdfbox', version='2.0.13'), | |
@Grab(group='org.apache.tika', module='tika-core', version='1.18'), | |
@Grab(group='me.xdrop', module='fuzzywuzzy', version='1.2.0') | |
]) | |
import org.apache.pdfbox.pdmodel.PDDocument | |
import org.apache.pdfbox.text.PDFTextStripper | |
import org.apache.tika.Tika | |
import java.util.concurrent.ConcurrentLinkedQueue | |
import java.util.concurrent.CountDownLatch | |
import java.util.concurrent.ThreadPoolExecutor | |
import java.util.concurrent.TimeUnit | |
import java.util.concurrent.LinkedBlockingQueue | |
import java.util.concurrent.ThreadFactory | |
import java.util.Random | |
import java.lang.Thread.State | |
import java.lang.ThreadLocal | |
import groovy.transform.Canonical | |
import groovy.transform.CompileStatic | |
import static groovy.io.FileType.FILES | |
// Command-line handling: the script expects exactly two positional arguments,
// the directory to scan and the term to search for.
def cli = new CliBuilder(header: 'PDF Text Searcher', usage:'./pdfSearcher <directoryToScan> <searchTerm>', width: 100)
def cliOptions = cli.parse(args)
if (cliOptions == null) {
    // parse() returns null for a malformed command line (for example an
    // unrecognised option) after reporting the problem itself; dereferencing
    // the result would only add a NullPointerException on top of that.
    System.exit(1)
}
if (cliOptions.help || cliOptions.arguments().size() != 2) {
    cli.usage()
    System.exit(0)
}
/**
 * Value holder pairing a scanned file with the text pulled out of it.
 *
 * The property order (name, then lineData) defines the positional
 * constructor generated by {@code @Canonical}, so it must not be reordered.
 */
@CompileStatic
@Canonical
class ProcessedFile {

    /** Path of the file the text came from. */
    String name

    /** Full extracted text of the file; may span many lines. */
    String lineData
}
/**
 * Extracts the text of PDF files and publishes it to a queue.
 *
 * Each instance owns its own Tika detector and PDFTextStripper; the script
 * hands one instance to each worker thread through a ThreadLocal.
 */
@CompileStatic
class TextExtractor {

    /** Only files Tika detects as this MIME type are processed. */
    static String requiredMimeType = 'application/pdf'

    Tika tika = new Tika()
    PDFTextStripper pdfStripper = new PDFTextStripper()

    /** Destination for successfully extracted files. */
    Queue<ProcessedFile> queue

    /**
     * @param queue queue that receives one ProcessedFile per extracted PDF
     */
    TextExtractor(Queue<ProcessedFile> queue) {
        this.queue = queue
    }

    /**
     * Detects the type of the given file and, if it is a PDF, extracts its
     * text and offers the result to the queue. Files of any other type are
     * ignored. Failures are reported on standard output and do not propagate.
     *
     * @param file the file to inspect
     */
    void processFile(File file) {
        try {
            if (tika.detect(file) != requiredMimeType) {
                return
            }
            PDDocument document = PDDocument.load(file)
            try {
                String text = pdfStripper.getText(document)
                queue.offer(new ProcessedFile(file.path, text))
            } finally {
                // Close even when text extraction throws; otherwise the
                // document's underlying resources stay open.
                document.close()
            }
        } catch (Exception e) {
            println "Error processing ${file.path}, ${e}"
        }
    }
}
/**
 * Worker thread that binds a TextExtractor, wired to a given result queue,
 * to itself so tasks can obtain it via {@code ReusableThread.extractor.get()}.
 */
@CompileStatic
class ReusableThread extends Thread {

    /**
     * Per-thread extractor. Threads that were never explicitly bound fall back
     * to an extractor writing into a private queue nobody else can see.
     */
    static ThreadLocal<TextExtractor> extractor = ThreadLocal.withInitial({
        new TextExtractor(new ConcurrentLinkedQueue())
    })

    /** Queue the extractor of this thread must publish to. */
    private final Queue<ProcessedFile> targetQueue

    /**
     * @param runnable the work this thread executes
     * @param queue    queue that receives the files processed on this thread
     */
    ReusableThread(Runnable runnable, Queue<ProcessedFile> queue) {
        super(runnable)
        this.targetQueue = queue
        // A constructor runs on the thread that creates this object, not on
        // the new thread, so this call only binds the creating thread. It is
        // kept so that tasks executed directly by that thread (the script's
        // executor uses CallerRunsPolicy) still publish to the shared queue.
        ReusableThread.extractor.set(new TextExtractor(queue))
    }

    /**
     * Binds the extractor on this thread before running the work. ThreadLocal
     * values are per-thread, so the binding must happen here; without it the
     * worker would use the fallback extractor and its results would be lost.
     */
    @Override
    void run() {
        ReusableThread.extractor.set(new TextExtractor(targetQueue))
        super.run()
    }
}
/**
 * ThreadFactory producing ReusableThread instances that all share one
 * result queue.
 */
@CompileStatic
class ReusableThreadFactory implements ThreadFactory {

    /** Queue handed to every thread this factory creates. */
    Queue<ProcessedFile> queue

    /**
     * @param queue the shared queue passed on to each created thread
     */
    ReusableThreadFactory(Queue<ProcessedFile> queue) {
        this.queue = queue
    }

    /**
     * Creates a new ReusableThread for the given work.
     *
     * @param runnable the work the new thread will run
     * @return the newly created, not yet started thread
     */
    @Override
    Thread newThread(Runnable runnable) {
        Thread worker = new ReusableThread(runnable, this.queue)
        return worker
    }
}
// Shared hand-off queue: workers offer extracted files, the main thread polls.
Queue<ProcessedFile> processedFiles = new ConcurrentLinkedQueue()

// Background thread that walks the directory tree and submits one extraction
// task per file. It stays alive until every submitted task has finished, so
// its termination means no further results will be produced.
Thread loaderThread = Thread.start {
    def executor = new ThreadPoolExecutor(10, 25, 5, TimeUnit.SECONDS, new LinkedBlockingQueue(100), new ReusableThreadFactory(processedFiles), new ThreadPoolExecutor.CallerRunsPolicy())
    try {
        new File(cliOptions.arguments().first()).eachFileRecurse(FILES) { file ->
            executor.submit {
                ReusableThread.extractor.get().processFile(file)
            }
        }
    } finally {
        // Shut down even if the directory walk fails; the pool's core threads
        // are not daemons and would otherwise keep the JVM running.
        executor.shutdown()
        // shutdown() only stops new submissions. Wait for queued and running
        // tasks so this thread does not terminate while results are pending.
        executor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS)
    }
}
// The term is lower-cased once so each line can be compared case-insensitively.
String searchTerm = (cliOptions.arguments().last() as String).toLowerCase()
println "Searching pdfs for term '${searchTerm}'..."

// Consume extracted files until the loader has terminated AND the queue is
// empty, printing every line that contains the search term.
while (true) {
    // Sample the loader state BEFORE polling: if it had already terminated and
    // the poll still comes back empty, nothing more can arrive.
    boolean loaderDone = loaderThread.getState() == State.TERMINATED
    ProcessedFile processedFile = processedFiles.poll()
    if (processedFile != null) {
        processedFile.lineData.eachLine { String line ->
            if (line.toLowerCase().contains(searchTerm)) {
                println line
            }
        }
    } else if (loaderDone) {
        break
    } else {
        // Nothing to do yet; back off briefly instead of spinning a core.
        Thread.sleep(20)
    }
}
loaderThread.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment