Created
September 18, 2019 20:29
-
-
Save benjaminkreen/695a6af03d62a20081dd5d234944994e to your computer and use it in GitHub Desktop.
Run multiple comma-separated searches against a PDF in one pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<html lang="en" > | |
<head> | |
<meta charset="UTF-8"> | |
<title>PDF searcher</title> | |
</head> | |
<body> | |
<label for="your-pdf">Select your pdf</label> | |
<input type="file" id="your-pdf" accept=".pdf" onchange="importFile()"> | |
<label for="search">Your comma-separated search</label> | |
<textarea id="search" placeholder="monkeys,human embryos,nuclear,pathogens"></textarea> | |
<input type="button" value="Search!" onclick="doSearch()"> | |
<div id="viewerContainer"> </div> | |
<div id="viewer"></div> | |
<script src="https://cdn.jsdelivr.net/npm/[email protected]/build/pdf.min.js" integrity="sha256-KaZ7ItAt0yEJuMYiUCOs8vQBDPjrZu91EIfgBMTEzKA=" crossorigin="anonymous"></script> | |
<script> | |
/**
 * Counts how often each comma-separated term from the #search textarea occurs
 * in the loaded PDF text and reports the totals via alert().
 *
 * Reads `window.searchText`, the per-page array of normalised text that
 * importFile() stores (index 0 is page 1). Each term is normalised the same
 * way as the page text (lower-cased, every non-word character removed), so a
 * phrase such as "human embryos" is matched as "humanembryos".
 *
 * Side effects: logs the result Map to the console and shows one alert.
 * Returns nothing.
 */
function doSearch() {
  const pageTexts = window.searchText;
  // Guard: nothing to search until a PDF has been selected and parsed.
  if (!Array.isArray(pageTexts) || pageTexts.length === 0) {
    alert('No PDF text is loaded yet. Select a PDF and wait for it to finish loading.');
    return;
  }

  const rawValue = document.getElementById('search').value;

  // A Map (not a plain object) keeps the terms in the order typed and is safe
  // for user-supplied keys such as "__proto__" or "constructor".
  const queries = new Map();
  rawValue.split(',').forEach(function (rawQuery) {
    const formattedQuery = rawQuery.toLowerCase().replace(/\W/g, '');
    // Skip blanks (e.g. a trailing comma): an empty needle would "match"
    // at every position and produce meaningless counts.
    if (formattedQuery === '') {
      return;
    }
    queries.set(formattedQuery, { query: rawQuery.trim(), pages: [], count: 0 });
  });

  if (queries.size === 0) {
    alert('Enter at least one search term.');
    return;
  }

  queries.forEach(function (entry, needle) {
    pageTexts.forEach(function (pageText, pageIndex) {
      // Splitting on the needle yields (occurrences + 1) pieces; this counts
      // non-overlapping literal matches without building a RegExp.
      const hits = pageText.split(needle).length - 1;
      if (hits > 0) {
        entry.pages.push(pageIndex + 1); // report 1-based page numbers
        entry.count += hits;
      }
    });
  });

  console.log(queries);

  const alertMsgs = Array.from(queries.values()).map(function (entry) {
    return entry.query + ' has ' + entry.count + ' hits. On page(s):' + entry.pages.join(', ');
  });
  alert(alertMsgs.join('\n'));
}
/**
 * Reads the PDF chosen in the #your-pdf file input and stores its text, one
 * normalised string per page, in `window.searchText` for doSearch() to use.
 *
 * Normalisation lower-cases each text item and removes every non-word
 * character, then concatenates the items of a page into a single string.
 * `window.searchText` is reset to [] as soon as the file has been read and is
 * replaced with the full page array once every page has been extracted.
 *
 * Side effects: writes `window.searchText`; on failure logs the error and
 * shows an alert. Returns nothing.
 */
function importFile() {
  const input = document.getElementById('your-pdf');
  // from https://stackoverflow.com/a/28567893
  const file = input.files[0];
  // The change event also fires when the picker is cancelled; there is then
  // no file and readAsArrayBuffer(undefined) would throw.
  if (!file) {
    return;
  }

  const reportFailure = function (err) {
    console.error(err);
    const detail = err && err.message ? err.message : String(err);
    alert('Could not read the PDF: ' + detail);
  };

  const fileReader = new FileReader();
  fileReader.onerror = function () {
    reportFailure(fileReader.error);
  };
  fileReader.onload = function () {
    window.searchText = [];
    // Step 4: turn the ArrayBuffer into a typed array
    const typedarray = new Uint8Array(fileReader.result);
    // Step 5: PDF.js can read a typed array directly
    const loadingTask = pdfjsLib.getDocument(typedarray);
    // Newer PDF.js builds expose the document promise only as `.promise`;
    // fall back to the task itself for builds where the task is thenable.
    const pdfPromise = loadingTask.promise || loadingTask;

    pdfPromise
      .then(function (pdf) {
        const pagePromises = [];
        // PDF.js pages are 1-based, so the last page is numPages itself.
        for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
          const textPromise = pdf
            .getPage(pageNumber)
            .then(function (page) {
              return page.getTextContent();
            })
            .then(function (textObject) {
              return textObject.items.reduce(function (memo, item) {
                return memo + item.str.toLowerCase().replace(/\W/g, '');
              }, '');
            });
          pagePromises.push(textPromise);
        }
        // Promise.all keeps page order, so index 0 is page 1.
        return Promise.all(pagePromises);
      })
      .then(function (result) {
        window.searchText = result;
      })
      .catch(reportFailure);
  };
  // Step 3: read the file as an ArrayBuffer
  fileReader.readAsArrayBuffer(file);
}
</script> | |
</body> | |
</html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment