Experiments with PDF parsers.
https://www.npmjs.com/package/pdf2json https://www.npmjs.com/package/pdf-parse https://www.pdfparser.org/demo
pdf2json provides good text extraction and also reports the positioning of the extracted text.
| .idea | 
Experiments with PDF parsers.
https://www.npmjs.com/package/pdf2json https://www.npmjs.com/package/pdf-parse https://www.pdfparser.org/demo
pdf2json provides good text extraction and also reports the positioning of the extracted text.
| const fs = require('fs'); | |
| const pdf = require('pdf-parse'); | |
| const filename = '/media/hassen/linux-tmp/web_dld/spiro.pdf' | |
| const PDFParser = require("pdf2json"); | |
/**
 * Experiment 4: dump the raw text content of a PDF to a file using pdf2json.
 *
 * The text is written from the "pdfParser_dataReady" handler (not inline after
 * loadPDF()), because getRawTextContent() is only called once the parser
 * signals that its data is ready. Parse errors are printed to stderr.
 *
 * @param {string} [inputPath=filename] - Path of the PDF to parse.
 * @param {string} [outputPath='/tmp/gg.txt'] - File that receives the raw text.
 * @returns {void}
 */
function t4(inputPath = filename, outputPath = '/tmp/gg.txt') {
  // The second constructor argument (1) is passed because raw text is read
  // back below; presumably it is pdf2json's "need raw text" flag - confirm
  // against the pdf2json README.
  const pdfParser = new PDFParser(this, 1);

  pdfParser.on("pdfParser_dataError", (errData) => console.error(errData.parserError));
  pdfParser.on("pdfParser_dataReady", () => {
    fs.writeFileSync(outputPath, pdfParser.getRawTextContent());
  });

  pdfParser.loadPDF(inputPath);
}
/**
 * Experiment 3: parse a PDF with pdf2json and dump the parsed data as JSON.
 *
 * When the parser emits "pdfParser_dataReady", the parsed data is serialized
 * once, printed to stdout and written to `outputPath`. Parse errors are
 * printed to stderr.
 *
 * @param {string} [inputPath=filename] - Path of the PDF to parse.
 * @param {string} [outputPath='/tmp/gg.json'] - File that receives the JSON dump.
 * @returns {void}
 */
function t3(inputPath = filename, outputPath = '/tmp/gg.json') {
  const pdfParser = new PDFParser();

  pdfParser.on("pdfParser_dataError", (errData) => console.error(errData.parserError));
  pdfParser.on("pdfParser_dataReady", (pdfData) => {
    // Serialize once and reuse: the same string goes to stdout and to the file.
    const json = JSON.stringify(pdfData);
    console.log(json);
    fs.writeFileSync(outputPath, json);
  });

  pdfParser.loadPDF(inputPath);
}
/**
 * Concatenate text items into a string, starting a new line whenever the
 * vertical position (transform[5]) differs from the previous item's.
 *
 * @param {Iterable<{str: string, transform: number[]}>} items - Text items;
 *   each must expose `str` and a `transform` array whose index 5 is compared
 *   between consecutive items.
 * @returns {string} The joined text ('' for no items).
 */
function joinTextItemsByLine(items) {
  let lastY;
  let text = '';
  for (const item of items) {
    const y = item.transform[5];
    // Compare against `undefined` explicitly: a plain falsy check would treat
    // a previous item at y === 0 as "no previous item" and glue the next
    // line onto it.
    if (lastY === undefined || lastY === y) {
      text += item.str;
    } else {
      text += '\n' + item.str;
    }
    lastY = y;
  }
  return text;
}

/**
 * Experiment 2: extract text with pdf-parse using a custom page renderer that
 * inserts a newline whenever the vertical position of the text changes.
 *
 * Logs the parse result object and its `text` to stdout. Failures of the
 * parse promise are logged to stderr and mark the process as failed.
 *
 * @param {string} [inputPath=filename] - Path of the PDF to parse.
 * @returns {void}
 */
function t2(inputPath = filename) {
  // Replacement for the default render callback.
  function render_page(pageData) {
    // check documents https://mozilla.github.io/pdf.js/
    const render_options = {
      // replaces all occurrences of whitespace with standard spaces (0x20). The default value is `false`.
      normalizeWhitespace: false,
      // do not attempt to combine same line TextItem's. The default value is `false`.
      disableCombineTextItems: false,
    };
    return pageData
      .getTextContent(render_options)
      .then((textContent) => joinTextItemsByLine(textContent.items));
  }

  const options = {
    pagerender: render_page,
  };

  const dataBuffer = fs.readFileSync(inputPath);
  pdf(dataBuffer, options)
    .then((data) => {
      // use new format
      console.log(data, data.text);
    })
    .catch((err) => {
      // Do not leave the promise floating: report the failure and keep a
      // non-zero exit status for the script.
      console.error(err);
      process.exitCode = 1;
    });
}
/**
 * Experiment 1: parse a PDF with pdf-parse using default options and print
 * each field of the result to stdout.
 *
 * Failures of the parse promise are logged to stderr and mark the process as
 * failed instead of being left as an unhandled rejection.
 *
 * @param {string} [inputPath=filename] - Path of the PDF to parse.
 * @returns {void}
 */
function t1(inputPath = filename) {
  const dataBuffer = fs.readFileSync(inputPath);

  pdf(dataBuffer)
    .then((data) => {
      // number of pages
      console.log(data.numpages);
      // number of rendered pages
      console.log(data.numrender);
      // PDF info
      console.log(data.info);
      // PDF metadata
      console.log(data.metadata);
      // PDF.js version
      // check https://mozilla.github.io/pdf.js/getting_started/
      console.log(data.version);
      // PDF text
      console.log(data.text);
    })
    .catch((err) => {
      console.error(err);
      process.exitCode = 1;
    });
}
// Experiment selector: uncomment the experiment to run. Only t4 is active.
// t1()
// t2()
// t3()
t4();