Skip to content

Instantly share code, notes, and snippets.

@emptymonkey
Forked from arimai/dictionary.js
Created January 25, 2020 20:57
Show Gist options
  • Save emptymonkey/51dffd423ffe7299d65ed385dfbf2778 to your computer and use it in GitHub Desktop.
Save emptymonkey/51dffd423ffe7299d65ed385dfbf2778 to your computer and use it in GitHub Desktop.
Read and parse files using streams in Node.js to build a dictionary object that maps each word to its frequency (word => frequency).
const fs = require('fs');
const Parse = require('./parser');
/**
 * Constructor for the word-frequency collector.
 *
 * Each instance starts out with its own empty `dictionary` object, which the
 * prototype methods fill in and hand back to callers.
 */
function CreateDictionary() {
  const counts = {};
  this.dictionary = counts;
}
/**
 * Parses every `.txt` file directly inside `dirpath` and resolves with the
 * shared dictionary once all of them have been processed.
 *
 * All matching files are handed to `parseFile` up front, so they are read
 * concurrently and accumulate into the same `this.dictionary`.
 *
 * @param {string} dirpath - directory to scan (not recursive).
 * @returns {Promise<Object>} resolves with `this.dictionary`; rejects when the
 *   directory cannot be listed or when any single file fails to parse.
 */
CreateDictionary.prototype.parseDirectory = function(dirpath){
  return new Promise((resolve, reject) => {
    fs.readdir(dirpath, (err, files) => {
      if (err) {
        reject(err);
        return;
      }

      const pending = files
        .filter((file) => file.endsWith('.txt'))
        .map((file) => this.parseFile(`${dirpath}/${file}`));

      // A `then` callback only ever receives the fulfilment value, so failures
      // from individual files have to be forwarded through `catch`; otherwise
      // this promise would stay pending forever.
      Promise.all(pending)
        .then(() => {
          console.log(`parsing complete.`);
          resolve(this.dictionary);
        })
        .catch(reject);
    });
  });
};
/**
 * Streams a single file through a `Parse` transform and resolves with the
 * shared dictionary once the parser has consumed the whole file.
 *
 * @param {string} filepath - path of the file to read (decoded as UTF-8).
 * @returns {Promise<Object>} resolves with `this.dictionary` when the parser
 *   emits 'finish'; rejects when either the read stream or the parser emits
 *   'error'.
 */
CreateDictionary.prototype.parseFile = function(filepath){
  return new Promise((resolve, reject) => {
    const read = fs.createReadStream(filepath, {encoding: 'utf-8'});
    const parser = new Parse(this.dictionary);
    console.log(`reading file ${filepath}`);

    // pipe() does not forward errors from the source to the destination, so
    // the read stream needs its own handler; without it a missing or
    // unreadable file raises an unhandled 'error' event instead of rejecting.
    read.on('error', reject);

    read.pipe(parser)
      .on('error', (err) => {
        // The source is unpiped when the destination fails; close it so the
        // file descriptor is not left open.
        read.destroy();
        reject(err);
      })
      .on('finish', () => {
        console.log(`finished parsing file ${filepath}`);
        resolve(this.dictionary);
      });
  });
};
module.exports = CreateDictionary;
const Transform = require('stream').Transform;
/**
 * Builds a Transform stream that tallies words into `dictionary`.
 *
 * Every chunk is converted to a string, each character outside a-z / A-Z is
 * turned into a separator, and every remaining run of letters is lower-cased
 * and counted: the count is incremented when the word is already an own key
 * of `dictionary`, otherwise it starts at 1. The stream never pushes output;
 * the result is the mutated `dictionary`.
 *
 * @param {Object} dictionary - word => frequency map, mutated in place.
 * @returns {Transform} the configured stream (this is also what callers get
 *   when invoking the function with `new`).
 */
function Parse (dictionary) {
  // Letters at the very end of a chunk may be the first half of a word that
  // continues in the next chunk, so they are held back instead of counted.
  let carry = '';

  const count = (word) => {
    const key = word.toLowerCase();
    // Own-property check: an inherited member such as `constructor` must not
    // be mistaken for an existing count.
    const previous = Object.prototype.hasOwnProperty.call(dictionary, key)
      ? dictionary[key]
      : 0;
    dictionary[key] = previous ? previous + 1 : 1;
  };

  const parser = new Transform();
  parser._transform = (data, encoding, done) => {
    const pieces = cleanString(carry + data.toString()).split('\n');
    // The last piece is either '' (chunk ended on a separator) or a word that
    // may still be incomplete.
    carry = pieces.pop();
    pieces
      .filter((piece) => piece !== '')
      .forEach((piece) => count(piece));
    done();
  };
  // End of input: whatever is still held back is a complete word.
  parser._flush = (done) => {
    if (carry !== '') {
      count(carry);
      carry = '';
    }
    done();
  };
  return parser;
}
/**
 * Replaces every character that is not an ASCII letter (a-z, A-Z) with a
 * newline, leaving runs of letters untouched.
 *
 * @param {string} str - text to clean.
 * @returns {string} text of the same length with non-letters turned into '\n'.
 */
function cleanString(str){
  const nonLetter = /[^a-zA-Z]/g;
  return str.replace(nonLetter, '\n');
}
module.exports = Parse;
const CreateDictionary = require('./dictionary.js');
// Single collector instance; both demo runs below accumulate into it.
const dict = new CreateDictionary();

// Prints the number of distinct words followed by the pretty-printed dictionary.
const report = (dictionary) => {
  console.log(`
number of dictionary elements : ${Object.keys(dictionary).length}
dictionary: ${JSON.stringify(dictionary,null,2)}
`);
};

// Failures are reported by message only.
const reportFailure = (err) => {
  console.log(err.message);
};

//test parseDirectory
dict.parseDirectory('some-folder-path').then(report).catch(reportFailure);

//test parseFile
dict.parseFile('some-file-path').then(report).catch(reportFailure);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment