M1ndBlast · November 28, 2024 05:28
diff --git a/pdf2txt.html b/pdf2txt.html
 <!DOCTYPE html>
 <html lang="en">
 	<head>
 		<meta charset="UTF-8" />
 		<meta http-equiv="X-UA-Compatible" content="IE=edge" />
 		<meta name="viewport" content="width=device-width, initial-scale=1.0" />
 		<title>PDF Decoder to Text</title>
 		<!-- PDF.js library build, loaded from the jsDelivr CDN. -->
 		<!-- NOTE(review): "[email protected]" in the URL below looks like an e-mail-obfuscation
 		     artifact that replaced "package@version" (presumably a pinned pdfjs-dist release).
 		     Confirm the intended version and restore it; the URL is not loadable as written. -->
 		<script src="https://cdn.jsdelivr.net/npm/[email protected]/build/pdf.min.js"></script>
 		<!-- Page script: converts the selected PDF files to text and offers each result as a download. -->
 		<script src="pdf2txt.multiple-1.0.js"></script>
 	</head>
 	<!-- Left empty in markup; pdf2txt.multiple-1.0.js appends the file input once the DOM is ready. -->
 	<body></body>
 </html>
diff --git a/pdf2txt.multiple-1.0.js b/pdf2txt.multiple-1.0.js
 // Shortcut to the PDF.js exports, read from the global registered under 'pdfjs-dist/build/pdf'.
 let PDFJS = window['pdfjs-dist/build/pdf']; // Loaded via <script> tag, create shortcut to access PDF.js exports.
 // Tell PDF.js where to fetch its worker script from.
 // NOTE(review): "[email protected]" in this URL looks like an e-mail-obfuscation artifact that
 // replaced "package@version" (presumably a pinned pdfjs-dist release) - confirm the intended
 // version and restore it. Presumably it should be the same release as the pdf.min.js the page loads.
 PDFJS.GlobalWorkerOptions.workerSrc = 'https://cdn.jsdelivr.net/npm/[email protected]/build/pdf.worker.min.js'; // The workerSrc property shall be specified.
 /**
  * Extracts the plain text of a PDF document through PDF.js.
  *
  * Instances expose `complete` (number of pages finished by the most recent
  * pdfToText() call) and the `pdfToText()` method.
  */
 function Pdf2TextClass() {
 	var self = this;
 	this.complete = 0;

 	// A block that is a lone letter, or that ends in whitespace followed by a lone
 	// letter, never gets a joining space appended after it.
 	var LONE_LETTER_TAIL = /^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/;

 	/**
 	 * Reads one coordinate of a text block. Uses the block's own `x`/`y`
 	 * property when present, otherwise the translation component of its
 	 * `transform` matrix (index 4 = x, index 5 = y).
 	 * @returns the coordinate, or undefined when the block carries neither.
 	 */
 	function blockCoordinate(block, name, transformIndex) {
 		if (block[name] != null)
 			return block[name];
 		return block.transform ? block.transform[transformIndex] : undefined;
 	}

 	/**
 	 * Joins the text blocks of one page into a single string.
 	 * A block starting further left than its predecessor starts a new line
 	 * ("\r\n"); a block on a different baseline gets a joining space unless
 	 * the predecessor matches LONE_LETTER_TAIL. No separator is added after a
 	 * block that already ends in a space.
 	 */
 	function itemsToText(items) {
 		var page_text = "";
 		var last_block = null;
 		for (var k = 0; k < items.length; k++) {
 			var block = items[k];
 			// Skip entries that carry no text instead of crashing on block.str.
 			if (typeof block.str != 'string')
 				continue;
 			if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
 				if (blockCoordinate(block, 'x', 4) < blockCoordinate(last_block, 'x', 4))
 					page_text += "\r\n";
 				else if (blockCoordinate(last_block, 'y', 5) != blockCoordinate(block, 'y', 5) && last_block.str.match(LONE_LETTER_TAIL) == null)
 					page_text += ' ';
 			}
 			page_text += block.str;
 			last_block = block;
 		}
 		return page_text;
 	}

 	/**
 	 * @param data ArrayBuffer of the pdf file content
 	 * @param callbackPageDone To inform the progress each time
 	 *        when a page is finished. The callback function's input parameters are:
 	 *        1) number of pages done;
 	 *        2) total number of pages in file.
 	 * @param callbackAllDone The input parameter of callback function is
 	 *        the result of extracted text from pdf file.
 	 * @param callbackError Optional. Receives the error when loading the
 	 *        document or reading a page fails; without it the error is
 	 *        written to console.error.
 	 */
 	this.pdfToText = function (data, callbackPageDone, callbackAllDone, callbackError) {
 		console.assert(data instanceof ArrayBuffer || typeof data == 'string', 'No data type allowed');
 		// Progress is counted per call, so an instance can be reused; `complete`
 		// only mirrors the counter for code that reads it.
 		var done = 0;
 		self.complete = 0;
 		PDFJS.getDocument(data).promise.then(function (pdf) {
 			var total = pdf.numPages;
 			callbackPageDone(0, total);
 			var layers = {};
 			var pending = [];
 			for (var i = 1; i <= total; i++) {
 				pending.push(pdf.getPage(i).then(function (page) {
 					var n = page.pageNumber;
 					return page.getTextContent().then(function (textContent) {
 						// A page without items still contributes an (empty) entry,
 						// so later pages are not dropped during assembly.
 						layers[n] = itemsToText(textContent.items || []) + "\n\n";
 						console.log("page " + n + " finished.");
 						self.complete = ++done;
 						callbackPageDone(done, total);
 					});
 				}));
 			}
 			return Promise.all(pending).then(function () {
 				window.setTimeout(function () {
 					var full_text = "";
 					// Pages may finish in any order; assemble them by page number.
 					for (var j = 1; j <= total; j++)
 						full_text += layers[j];
 					callbackAllDone(full_text);
 				}, 1000);
 			});
 		}).catch(function (error) {
 			if (typeof callbackError == 'function')
 				callbackError(error);
 			else
 				console.error(error);
 		});
 	}; // end of pdfToText()
 } // end of class


 /*
  * File picker wiring: every selected PDF is read into an ArrayBuffer,
  * converted with Pdf2TextClass, and the extracted text is offered as a
  * "<original name>.txt" download.
  */
 let input = document.createElement('input');
 input.type = 'file';
 input.multiple = true;
 input.accept = 'application/pdf';
 input.addEventListener('input', function () {
 	for (const file of this.files) {
 		const reader = new FileReader();
 		reader.onload = function () {
 			const arrayBuffer = this.result;

 			new Pdf2TextClass().pdfToText(arrayBuffer, (pagNum, tPagNum) => { console.log(`${pagNum}/${tPagNum}`); }, text => {
 				// Strip the extension only when there is one; a name without a dot
 				// (or starting with its only dot) is kept whole.
 				const dot = file.name.lastIndexOf('.');
 				const baseName = dot > 0 ? file.name.slice(0, dot) : file.name;

 				const downloadLink = document.createElement("a");
 				// The text must be percent-encoded: raw '#', '%' or line breaks would
 				// truncate or corrupt the data URL.
 				downloadLink.href = 'data:text/plain;charset=utf-8,' + encodeURIComponent(text);
 				downloadLink.download = baseName + ".txt";

 				// The link is attached only for the duration of the click.
 				document.body.appendChild(downloadLink);
 				downloadLink.click();
 				downloadLink.remove();
 			});
 		};
 		reader.onerror = function () {
 			console.error(`Could not read ${file.name}`, this.error);
 		};
 		reader.readAsArrayBuffer(file);
 	}
 });

 // The script is loaded from <head>, so wait for the DOM before attaching the input to <body>.
 document.addEventListener("DOMContentLoaded", _ =>
 	document.querySelector('body').append(input));
	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8" />
	<meta http-equiv="X-UA-Compatible" content="IE=edge" />
	<meta name="viewport" content="width=device-width, initial-scale=1.0" />
	<title>PDF Decoder to Text</title>
	<!-- PDF.js library build, loaded from the jsDelivr CDN. -->
	<!-- NOTE(review): "[email protected]" in the URL below looks like an e-mail-obfuscation
	     artifact that replaced "package@version" (presumably a pinned pdfjs-dist release).
	     Confirm the intended version and restore it; the URL is not loadable as written. -->
	<script src="https://cdn.jsdelivr.net/npm/[email protected]/build/pdf.min.js"></script>
	<!-- Page script: converts the selected PDF files to text and offers each result as a download. -->
	<script src="pdf2txt.multiple-1.0.js"></script>
	</head>
	<!-- Left empty in markup; pdf2txt.multiple-1.0.js appends the file input once the DOM is ready. -->
	<body></body>
	</html>
	// Shortcut to the PDF.js exports, read from the global registered under 'pdfjs-dist/build/pdf'.
	let PDFJS = window['pdfjs-dist/build/pdf']; // Loaded via <script> tag, create shortcut to access PDF.js exports.
	// Tell PDF.js where to fetch its worker script from.
	// NOTE(review): "[email protected]" in this URL looks like an e-mail-obfuscation artifact that
	// replaced "package@version" (presumably a pinned pdfjs-dist release) - confirm the intended
	// version and restore it. Presumably it should be the same release as the pdf.min.js the page loads.
	PDFJS.GlobalWorkerOptions.workerSrc = 'https://cdn.jsdelivr.net/npm/[email protected]/build/pdf.worker.min.js'; // The workerSrc property shall be specified.
	/**
	 * Extracts the plain text of a PDF document through PDF.js.
	 *
	 * Instances expose `complete` (number of pages finished by the most recent
	 * pdfToText() call) and the `pdfToText()` method.
	 */
	function Pdf2TextClass() {
		var self = this;
		this.complete = 0;

		// A block that is a lone letter, or that ends in whitespace followed by a lone
		// letter, never gets a joining space appended after it.
		var LONE_LETTER_TAIL = /^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/;

		/**
		 * Reads one coordinate of a text block. Uses the block's own `x`/`y`
		 * property when present, otherwise the translation component of its
		 * `transform` matrix (index 4 = x, index 5 = y).
		 * @returns the coordinate, or undefined when the block carries neither.
		 */
		function blockCoordinate(block, name, transformIndex) {
			if (block[name] != null)
				return block[name];
			return block.transform ? block.transform[transformIndex] : undefined;
		}

		/**
		 * Joins the text blocks of one page into a single string.
		 * A block starting further left than its predecessor starts a new line
		 * ("\r\n"); a block on a different baseline gets a joining space unless
		 * the predecessor matches LONE_LETTER_TAIL. No separator is added after a
		 * block that already ends in a space.
		 */
		function itemsToText(items) {
			var page_text = "";
			var last_block = null;
			for (var k = 0; k < items.length; k++) {
				var block = items[k];
				// Skip entries that carry no text instead of crashing on block.str.
				if (typeof block.str != 'string')
					continue;
				if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
					if (blockCoordinate(block, 'x', 4) < blockCoordinate(last_block, 'x', 4))
						page_text += "\r\n";
					else if (blockCoordinate(last_block, 'y', 5) != blockCoordinate(block, 'y', 5) && last_block.str.match(LONE_LETTER_TAIL) == null)
						page_text += ' ';
				}
				page_text += block.str;
				last_block = block;
			}
			return page_text;
		}

		/**
		 * @param data ArrayBuffer of the pdf file content
		 * @param callbackPageDone To inform the progress each time
		 *        when a page is finished. The callback function's input parameters are:
		 *        1) number of pages done;
		 *        2) total number of pages in file.
		 * @param callbackAllDone The input parameter of callback function is
		 *        the result of extracted text from pdf file.
		 * @param callbackError Optional. Receives the error when loading the
		 *        document or reading a page fails; without it the error is
		 *        written to console.error.
		 */
		this.pdfToText = function (data, callbackPageDone, callbackAllDone, callbackError) {
			console.assert(data instanceof ArrayBuffer || typeof data == 'string', 'No data type allowed');
			// Progress is counted per call, so an instance can be reused; `complete`
			// only mirrors the counter for code that reads it.
			var done = 0;
			self.complete = 0;
			PDFJS.getDocument(data).promise.then(function (pdf) {
				var total = pdf.numPages;
				callbackPageDone(0, total);
				var layers = {};
				var pending = [];
				for (var i = 1; i <= total; i++) {
					pending.push(pdf.getPage(i).then(function (page) {
						var n = page.pageNumber;
						return page.getTextContent().then(function (textContent) {
							// A page without items still contributes an (empty) entry,
							// so later pages are not dropped during assembly.
							layers[n] = itemsToText(textContent.items || []) + "\n\n";
							console.log("page " + n + " finished.");
							self.complete = ++done;
							callbackPageDone(done, total);
						});
					}));
				}
				return Promise.all(pending).then(function () {
					window.setTimeout(function () {
						var full_text = "";
						// Pages may finish in any order; assemble them by page number.
						for (var j = 1; j <= total; j++)
							full_text += layers[j];
						callbackAllDone(full_text);
					}, 1000);
				});
			}).catch(function (error) {
				if (typeof callbackError == 'function')
					callbackError(error);
				else
					console.error(error);
			});
		}; // end of pdfToText()
	} // end of class


	/*
	 * File picker wiring: every selected PDF is read into an ArrayBuffer,
	 * converted with Pdf2TextClass, and the extracted text is offered as a
	 * "<original name>.txt" download.
	 */
	let input = document.createElement('input');
	input.type = 'file';
	input.multiple = true;
	input.accept = 'application/pdf';
	input.addEventListener('input', function () {
		for (const file of this.files) {
			const reader = new FileReader();
			reader.onload = function () {
				const arrayBuffer = this.result;

				new Pdf2TextClass().pdfToText(arrayBuffer, (pagNum, tPagNum) => { console.log(`${pagNum}/${tPagNum}`); }, text => {
					// Strip the extension only when there is one; a name without a dot
					// (or starting with its only dot) is kept whole.
					const dot = file.name.lastIndexOf('.');
					const baseName = dot > 0 ? file.name.slice(0, dot) : file.name;

					const downloadLink = document.createElement("a");
					// The text must be percent-encoded: raw '#', '%' or line breaks would
					// truncate or corrupt the data URL.
					downloadLink.href = 'data:text/plain;charset=utf-8,' + encodeURIComponent(text);
					downloadLink.download = baseName + ".txt";

					// The link is attached only for the duration of the click.
					document.body.appendChild(downloadLink);
					downloadLink.click();
					downloadLink.remove();
				});
			};
			reader.onerror = function () {
				console.error(`Could not read ${file.name}`, this.error);
			};
			reader.readAsArrayBuffer(file);
		}
	});

	// The script is loaded from <head>, so wait for the DOM before attaching the input to <body>.
	document.addEventListener("DOMContentLoaded", _ =>
		document.querySelector('body').append(input));