Skip to content

Instantly share code, notes, and snippets.

@benjaminkreen
Created September 18, 2019 20:29
Show Gist options
  • Save benjaminkreen/695a6af03d62a20081dd5d234944994e to your computer and use it in GitHub Desktop.
Save benjaminkreen/695a6af03d62a20081dd5d234944994e to your computer and use it in GitHub Desktop.
For doing multiple searches on a pdf
<!DOCTYPE html>
<html lang="en" >
<head>
<meta charset="UTF-8">
<title>PDF searcher</title>
</head>
<body>
<label for="your-pdf">Select your pdf</label>
<input type="file" id="your-pdf" accept=".pdf" onchange="importFile()">
<label for="search">Your comma-separated search</label>
<textarea id="search" placeholder="monkeys,human embryos,nuclear,pathogens"></textarea>
<input type="button" value="Search!" onclick="doSearch()">
<div id="viewerContainer"> </div>
<div id="viewer"></div>
<script src="https://cdn.jsdelivr.net/npm/[email protected]/build/pdf.min.js" integrity="sha256-KaZ7ItAt0yEJuMYiUCOs8vQBDPjrZu91EIfgBMTEzKA=" crossorigin="anonymous"></script>
<script>
/**
 * Searches the loaded PDF text for every comma-separated term in the
 * "search" textarea and shows one summary line per term in an alert.
 *
 * Terms are normalised the same way the page text is when it is imported:
 * lower-cased with every non-word character removed. A term such as
 * "human embryos" is therefore searched for as "humanembryos".
 *
 * Reads window.searchText, which is expected to hold one normalised string
 * per PDF page (index 0 is page 1). Returns nothing.
 */
function doSearch() {
  const pageTexts = window.searchText;
  // Without page text there is nothing to search; bail out with a message
  // instead of failing on an undefined value.
  if (!Array.isArray(pageTexts) || pageTexts.length === 0) {
    alert('No PDF text is loaded yet. Select a PDF and wait for it to finish loading.');
    return;
  }

  const rawValue = document.getElementById('search').value;

  // A Map keeps the terms in typing order and is safe for keys such as
  // "__proto__", which would corrupt a plain-object dictionary.
  const queries = new Map();
  rawValue.split(',').forEach(function (rawQuery) {
    const formattedQuery = rawQuery.toLowerCase().replace(/\W/g, '');
    // Skip blank terms (trailing comma, "a,,b"): an empty pattern matches at
    // every position and would report a meaningless hit count on every page.
    if (formattedQuery === '') {
      return;
    }
    queries.set(formattedQuery, { query: rawQuery.trim(), pages: [], count: 0 });
  });

  if (queries.size === 0) {
    alert('Enter at least one search term.');
    return;
  }

  queries.forEach(function (entry, formattedQuery) {
    // formattedQuery contains only word characters, so it is safe to use as a
    // pattern without escaping. Built once per term, not once per page.
    const pattern = new RegExp(formattedQuery, 'g');
    pageTexts.forEach(function (pageText, pageIndex) {
      const matches = pageText.match(pattern);
      if (matches) {
        entry.pages.push(pageIndex + 1); // report 1-based page numbers
        entry.count += matches.length;
      }
    });
  });

  console.log(queries);

  const alertMsgs = Array.from(queries.values()).map(function (entry) {
    return `${entry.query} has ${entry.count} hits. On page(s):${entry.pages.join(', ')}`;
  });
  alert(alertMsgs.join('\n'));
}
/**
 * Reads the PDF chosen in the "your-pdf" file input and stores its text in
 * window.searchText as one string per page (index 0 is page 1).
 *
 * Each page's text is normalised by lower-casing it and removing every
 * non-word character, matching how doSearch normalises search terms.
 * The work is asynchronous: window.searchText is reset to [] when the file
 * has been read and replaced with the page strings once every page has been
 * extracted. Returns nothing.
 *
 * File-reading approach adapted from https://stackoverflow.com/a/28567893
 */
function importFile() {
  const input = document.getElementById('your-pdf');
  const file = input.files[0];

  // With no file selected there is nothing to read, and passing undefined to
  // readAsArrayBuffer would throw. Drop any text from a previous file.
  if (!file) {
    window.searchText = [];
    return;
  }

  const fileReader = new FileReader();

  fileReader.onload = function () {
    window.searchText = [];

    // pdf.js accepts the raw bytes as a typed array.
    const typedarray = new Uint8Array(fileReader.result);

    // getDocument returns a loading task. Newer pdf.js builds expose the
    // document promise only as `.promise`; older ones were thenable
    // themselves, so fall back to the task when `.promise` is absent.
    const loadingTask = pdfjsLib.getDocument(typedarray);

    Promise.resolve(loadingTask.promise || loadingTask)
      .then(function (pdf) {
        const promises = [];
        // Page numbers are 1-based and numPages is inclusive, so the loop
        // must run up to and including numPages to reach the last page.
        for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
          const textPromise = pdf.getPage(pageNumber)
            .then(function (page) {
              return page.getTextContent();
            })
            .then(function (textObject) {
              return textObject.items.reduce(function (memo, item) {
                return memo + item.str.toLowerCase().replace(/\W/g, '');
              }, '');
            });
          promises.push(textPromise);
        }
        // Promise.all keeps results in page order regardless of which page
        // finishes first.
        return Promise.all(promises);
      })
      .then(function (result) {
        window.searchText = result;
      })
      .catch(function (error) {
        console.error('Failed to extract text from the PDF', error);
        alert('Could not read that PDF.');
      });
  };

  fileReader.onerror = function () {
    console.error('Failed to read the selected file', fileReader.error);
    alert('Could not read that file.');
  };

  // Read the file as an ArrayBuffer; onload fires when the bytes are ready.
  fileReader.readAsArrayBuffer(file);
}
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment