Last active
November 28, 2024 05:28
-
-
Save M1ndBlast/8dc4ecf3fa684d1111350ca0139812fc to your computer and use it in GitHub Desktop.
PDF Decoder to Text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>PDF Decoder to Text</title>
    <!--
      PDF.js main library. The package/version part of this URL was unreadable
      in the source; 2.16.105 is pinned here because the companion script reads
      the window['pdfjs-dist/build/pdf'] global and loads build/pdf.min.js.
      NOTE(review): confirm this is the intended release.
    -->
    <script src="https://cdn.jsdelivr.net/npm/pdfjs-dist@2.16.105/build/pdf.min.js"></script>
    <!-- Page script: adds the file picker and converts each chosen PDF to a .txt download. -->
    <script src="pdf2txt.multiple-1.0.js"></script>
  </head>
  <body></body>
</html>
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// PDF.js is loaded via a <script> tag; this is a shortcut to its exports.
const PDFJS = window['pdfjs-dist/build/pdf'];

// The workerSrc property shall be specified. The URL is built from the version
// reported by the loaded library so the worker script is requested for the
// same release as the main script.
PDFJS.GlobalWorkerOptions.workerSrc =
  `https://cdn.jsdelivr.net/npm/pdfjs-dist@${PDFJS.version}/build/pdf.worker.min.js`;
/**
 * Extracts the text of a PDF document through the `PDFJS` global.
 *
 * Instantiate with `new Pdf2TextClass()` and call `pdfToText(...)`.
 * `complete` holds the number of pages finished by the most recently
 * progressing `pdfToText` call.
 */
function Pdf2TextClass() {
  const self = this;
  this.complete = 0;

  // A previous block ending in a lone letter (e.g. "a" or "foo b") gets no
  // separating space appended after it.
  const LONE_LETTER_TAIL = /^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/;

  /**
   * Reads one coordinate of a text item. Uses the direct `x`/`y` property when
   * it is a number, otherwise the translation entries of `transform`
   * (index 4 for x, index 5 for y).
   */
  function coordinate(item, axis) {
    if (typeof item[axis] === 'number') {
      return item[axis];
    }
    const transform = item.transform;
    if (!transform) {
      return undefined;
    }
    return axis === 'x' ? transform[4] : transform[5];
  }

  /** Joins the text items of one page, inserting line breaks and spaces. */
  function joinItems(items) {
    let pageText = '';
    let previous = null;
    for (const block of items) {
      if (previous !== null && previous.str[previous.str.length - 1] !== ' ') {
        if (coordinate(block, 'x') < coordinate(previous, 'x')) {
          // The block starts to the left of the previous one: new line.
          pageText += '\r\n';
        } else if (
          coordinate(previous, 'y') !== coordinate(block, 'y') &&
          !LONE_LETTER_TAIL.test(previous.str)
        ) {
          pageText += ' ';
        }
      }
      pageText += block.str;
      previous = block;
    }
    return pageText;
  }

  /**
   * @param data ArrayBuffer (or typed array) of the pdf file content, or a string
   *             that is handed to PDFJS.getDocument unchanged.
   * @param callbackPageDone To inform the progress each time
   *        when a page is finished. The callback function's input parameters are:
   *        1) number of pages done;
   *        2) total number of pages in file.
   *        It is also called once with 0 pages done before any page is read.
   * @param callbackAllDone The input parameter of callback function is
   *        the result of extracted text from pdf file (pages in page order,
   *        each page followed by a blank line).
   * @param callbackError Optional. Receives the error when loading or reading
   *        the document fails; when omitted the error is written to console.error.
   */
  this.pdfToText = function (data, callbackPageDone, callbackAllDone, callbackError) {
    console.assert(
      data instanceof ArrayBuffer || ArrayBuffer.isView(data) || typeof data == 'string',
      'No data type allowed'
    );

    // Counted per call so that reusing the instance (or overlapping calls)
    // cannot carry a stale count over from an earlier document.
    let finished = 0;
    self.complete = 0;

    function reportFailure(error) {
      if (typeof callbackError === 'function') {
        callbackError(error);
      } else {
        console.error('PDF text extraction failed:', error);
      }
    }

    PDFJS.getDocument(data).promise
      .then((pdf) => {
        const total = pdf.numPages;
        callbackPageDone(0, total);

        const pageJobs = [];
        for (let pageNumber = 1; pageNumber <= total; pageNumber++) {
          pageJobs.push(
            pdf.getPage(pageNumber).then((page) =>
              page.getTextContent().then((textContent) => {
                let text = '';
                if (textContent.items != null) {
                  text = joinItems(textContent.items) + '\n\n';
                  console.log('page ' + page.pageNumber + ' finished.');
                }
                finished += 1;
                self.complete = finished;
                callbackPageDone(finished, total);
                return text;
              })
            )
          );
        }
        // Promise.all keeps request order, so the texts come back in page
        // order no matter which page finishes first.
        return Promise.all(pageJobs);
      })
      .then((pageTexts) => pageTexts.join(''))
      .then(callbackAllDone, reportFailure);
  }; // end of pdfToText()
} // end of class
// File picker that accepts one or more PDF files.
const input = document.createElement('input');
input.type = 'file';
input.multiple = true;
input.accept = 'application/pdf';

/**
 * Builds the download name for a converted file: the text after the last dot
 * is replaced by "txt"; a name without an extension just gets ".txt" appended.
 */
function toTxtFileName(sourceName) {
  const dotIndex = sourceName.lastIndexOf('.');
  const baseName = dotIndex > 0 ? sourceName.slice(0, dotIndex) : sourceName;
  return baseName + '.txt';
}

/**
 * Offers `text` to the user as a file download named `fileName`.
 * A Blob URL is used instead of a data: URI so characters such as "#", "%"
 * and line breaks reach the file unchanged.
 */
function downloadText(text, fileName) {
  const blobUrl = URL.createObjectURL(new Blob([text], { type: 'text/plain;charset=utf-8' }));
  const downloadLink = document.createElement('a');
  downloadLink.href = blobUrl;
  downloadLink.download = fileName;
  document.body.appendChild(downloadLink);
  downloadLink.click();
  downloadLink.remove();
  // Release the Blob once the click has been dispatched.
  setTimeout(() => URL.revokeObjectURL(blobUrl), 0);
}

// Each selected file is read into an ArrayBuffer, converted with its own
// Pdf2TextClass instance, and the resulting text is downloaded.
input.addEventListener('input', () => {
  for (const file of input.files) {
    const reader = new FileReader();
    reader.onload = () => {
      new Pdf2TextClass().pdfToText(
        reader.result,
        (pagNum, tPagNum) => { console.log(`${pagNum}/${tPagNum}`); },
        (text) => downloadText(text, toTxtFileName(file.name))
      );
    };
    reader.onerror = () => {
      console.error(`Could not read ${file.name}:`, reader.error);
    };
    reader.readAsArrayBuffer(file);
  }
});

// Attach the picker once <body> exists, including when this script runs after
// DOMContentLoaded has already fired.
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', () => document.body.append(input));
} else {
  document.body.append(input);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment