Skip to content

Instantly share code, notes, and snippets.

@hbt
Created December 17, 2019 22:07
Show Gist options
  • Save hbt/8a2670a98e60a02bf824914cd42da366 to your computer and use it in GitHub Desktop.
Save hbt/8a2670a98e60a02bf824914cd42da366 to your computer and use it in GitHub Desktop.
const fs = require('fs');
const pdf = require('pdf-parse');
// Path of the PDF to parse: the first command-line argument when one is given,
// otherwise the original hard-coded location (so running with no arguments is unchanged).
const filename = process.argv[2] || '/media/hassen/linux-tmp/web_dld/spiro.pdf';
const PDFParser = require("pdf2json");
/**
 * Experiment 4: parse the PDF at `filename` with pdf2json and dump its raw
 * text content to /tmp/gg.txt once the parser signals that data is ready.
 *
 * Parse errors are reported on stderr via the parser's error event.
 * Returns nothing; the work completes asynchronously in the event handlers.
 */
function t4() {
  // The second constructor argument (1) is kept as in the original call;
  // NOTE(review): per the pdf2json README this enables raw-text collection
  // needed by getRawTextContent() — confirm against the installed version.
  const parser = new PDFParser(this, 1);

  const reportParseError = (errData) => console.error(errData.parserError);

  const dumpRawText = () => {
    fs.writeFileSync('/tmp/gg.txt', parser.getRawTextContent());
  };

  parser.on("pdfParser_dataError", reportParseError);
  parser.on("pdfParser_dataReady", dumpRawText);

  parser.loadPDF(filename);
}
/**
 * Experiment 3: parse the PDF at `filename` with pdf2json and emit the parsed
 * document as JSON, both to stdout and to /tmp/gg.json.
 *
 * Parse errors are reported on stderr via the parser's error event.
 * Returns nothing; the work completes asynchronously in the event handlers.
 */
function t3() {
  const parser = new PDFParser();

  parser.on("pdfParser_dataError", (errData) => {
    console.error(errData.parserError);
  });

  parser.on("pdfParser_dataReady", (pdfData) => {
    // Serialize once and reuse the same string for the log and the file.
    const serialized = JSON.stringify(pdfData);
    console.log(serialized);
    fs.writeFileSync('/tmp/gg.json', serialized);
  });

  parser.loadPDF(filename);
}
/**
 * Experiment 2: extract text from the PDF at `filename` with pdf-parse, using
 * a custom per-page render callback, and log the result object and its text.
 *
 * Parse failures are logged to stderr instead of surfacing as an unhandled
 * promise rejection. Returns nothing; the result is only logged.
 */
function t2() {
  /**
   * Page render callback handed to pdf-parse through the `pagerender` option.
   * Joins the page's text items, starting a new line whenever an item's
   * vertical position differs from the previous item's.
   *
   * @param {object} pageData - page object exposing getTextContent(options).
   * @returns {Promise<string>} the page text.
   */
  function render_page(pageData) {
    // See https://mozilla.github.io/pdf.js/ for the underlying API.
    const render_options = {
      // Replaces all occurrences of whitespace with standard spaces (0x20). The default value is `false`.
      normalizeWhitespace: false,
      // Do not attempt to combine same line TextItem's. The default value is `false`.
      disableCombineTextItems: false,
    };

    return pageData.getTextContent(render_options).then(function (textContent) {
      // `undefined` marks "no previous item". A truthiness test would also
      // treat a previous Y of 0 as "no previous item" and drop a line break.
      let lastY;
      let text = '';

      for (const item of textContent.items) {
        // transform[5] is used as the item's vertical position (hence `lastY`).
        const y = item.transform[5];

        if (lastY === undefined || lastY === y) {
          text += item.str;
        } else {
          text += '\n' + item.str;
        }
        lastY = y;
      }

      return text;
    });
  }

  const options = {
    pagerender: render_page,
  };

  const dataBuffer = fs.readFileSync(filename);

  pdf(dataBuffer, options)
    .then(function (data) {
      console.log(data, data.text);
    })
    .catch(function (err) {
      // Report parse failures the same way the pdf2json experiments do.
      console.error(err);
    });
}
/**
 * Experiment 1: parse the PDF at `filename` with pdf-parse using its default
 * rendering and log each field of the result object.
 *
 * Parse failures are logged to stderr instead of surfacing as an unhandled
 * promise rejection. Returns nothing; the result is only logged.
 */
function t1() {
  const dataBuffer = fs.readFileSync(filename);

  pdf(dataBuffer)
    .then(function (data) {
      // number of pages
      console.log(data.numpages);
      // number of rendered pages
      console.log(data.numrender);
      // PDF info
      console.log(data.info);
      // PDF metadata
      console.log(data.metadata);
      // PDF.js version
      // check https://mozilla.github.io/pdf.js/getting_started/
      console.log(data.version);
      // PDF text
      console.log(data.text);
    })
    .catch(function (err) {
      // Report parse failures the same way the pdf2json experiments do.
      console.error(err);
    });
}
// Driver: exactly one experiment runs per invocation. Currently that is t4
// (pdf2json raw-text dump); uncomment another call below to switch.
// t1()
// t2()
// t3()
t4();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment