Skip to content

Instantly share code, notes, and snippets.

@stekhn
Created January 3, 2017 10:49
Show Gist options
  • Save stekhn/04711e12425b0aa10c0502ae8ca8e661 to your computer and use it in GitHub Desktop.
Save stekhn/04711e12425b0aa10c0502ae8ca8e661 to your computer and use it in GitHub Desktop.
Extract text from PDF files (with images) using Node.js
// Extract text from PDF files (with images)
// Installation guide: https://github.com/nisaacson/pdf-extract
var extract = (function() {
'use strict';
var fs = require('fs');
var path = require('path');
var pdfExtract = require('pdf-extract');
// Fallback options that init() uses when the caller does not pass any:
// extraction type 'ocr' with the single OCR flag '-l eng'.
var defaultOptions = {};
defaultOptions.type = 'ocr';
defaultOptions.ocr_flags = ['-l eng'];
// Execute script if not used as a module.
// `require.main === module` is the documented way to detect direct execution;
// the previously used `module.parent` is deprecated in Node.js.
if (require.main === module) {
  init(process.argv[2]);
}
/**
 * Entry point: extract the text of a PDF file and save it as
 * `<filePath>.txt`.
 *
 * @param {string} filePath - Path of the PDF file to process.
 * @param {Object} [options] - Options handed to pdf-extract. Falls back to
 *   `defaultOptions` when omitted.
 * @param {Function} [callback] - Node-style `(error, response)` callback.
 *   May also be passed as the second argument. When omitted, errors are
 *   printed with `console.error` and responses with `console.log`.
 * @returns {*} The callback's return value when no file path is given,
 *   otherwise undefined.
 */
function init(filePath, options, callback) {
  // Support the short form init(filePath, callback)
  if (typeof options === 'function' && callback === undefined) {
    callback = options;
    options = undefined;
  }

  callback = callback || function (error, response) {
    if (error) { return console.error(error); }
    return console.log(response);
  };

  if (!filePath) {
    return callback(new Error('No input file (PDF) specified.'));
  }

  options = options || defaultOptions;

  // Forward the resolved options (the former code passed an undeclared
  // variable here, which throws a ReferenceError in strict mode)
  processFile(filePath, options, callback);
}

/**
 * Run pdf-extract on a file and hand the extracted text to saveFile().
 *
 * @param {string} filePath - Path of the PDF file to process.
 * @param {Object} options - Options passed straight to pdf-extract.
 * @param {Function} callback - Node-style callback; invoked at most once
 *   from this function, either with an error or (via saveFile) with the
 *   result of saving.
 */
function processFile(filePath, options, callback) {
  var finished = false;

  // Errors can arrive both through the pdf-extract callback and through
  // the 'error' event; make sure the caller only hears about the first one
  function fail(error) {
    if (finished) { return; }
    finished = true;
    callback(error);
  }

  var processor = pdfExtract(filePath, options, function (error) {
    if (error) { fail(error); }
  });

  // NOTE(review): pdf-extract appears to return nothing when it rejects the
  // arguments up front - guard so we do not crash on `processor.on`
  if (!processor) {
    return fail(new Error('pdf-extract did not return a processor.'));
  }

  processor.on('complete', function (data) {
    if (finished) { return; }
    finished = true;

    // text_pages is expected to hold one string per page; join them so a
    // single string is written to disk
    var pages = data.text_pages;
    var text = Array.isArray(pages) ? pages.join('\n') : pages;

    saveFile(filePath + '.txt', text, callback);
  });

  processor.on('error', fail);
}
/**
 * Write text to a file synchronously and report the outcome.
 *
 * @param {string} filePath - Destination path; normalized before writing.
 * @param {string|string[]} string - Text to write. An array (for example
 *   one entry per page) is joined with newlines first.
 * @param {Function} callback - Node-style callback. Called exactly once:
 *   with `(error)` when writing fails, or with
 *   `(null, 'Saved file <path>')` after the file has been written.
 * @returns {*} The callback's return value on failure, otherwise undefined.
 */
function saveFile(filePath, string, callback) {
  // Normalize file path
  filePath = path.normalize(filePath);

  // fs.writeFileSync does not accept plain arrays, so flatten them here
  var contents = Array.isArray(string) ? string.join('\n') : string;

  // Only the write itself is guarded: an exception thrown by the callback
  // must not be mistaken for a write error and trigger a second callback
  try {
    fs.writeFileSync(filePath, contents, 'utf8');
  } catch (error) {
    return callback(error);
  }

  // Report success only after the file exists, keeping the first argument
  // reserved for errors
  callback(null, 'Saved file ' + filePath);
}
// Public API of this module: only init() is exposed to code that requires it
module.exports = {
init: init
};
}());
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment