Skip to content

Instantly share code, notes, and snippets.

@MarZab
Created April 13, 2015 09:08
Show Gist options
  • Save MarZab/176fad509fbd4a8babc8 to your computer and use it in GitHub Desktop.
Save MarZab/176fad509fbd4a8babc8 to your computer and use it in GitHub Desktop.
JavaScript Implementation of N-Gram Text Categorisation based on paper by William B. Cavnar and John M. Trenkle
'use strict';
/*
N-Gram-Based Text Categorisation
Implementation by [email protected]
28.3.2015 All rights reserved.
Based on the paper by:
William B. Cavnar and John M. Trenkle
Environmental Research Institute of Michigan
P.O. Box 134001
Ann Arbor MI 48113-4001
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.42.4093&rep=rep1&type=pdf
Needs (npm install):
IO.JS, for ECMAScript 6 - https://iojs.org/en/index.html
Collections, for SortedSet - http://www.collectionsjs.com/
Console.table, for lovely table output - https://github.com/bahmutov/console.table
*/
var SortedSet = require("collections/sorted-set");
require('console.table');
var fs = require('fs');
var async = require('async');
/**
* Create Map of word occurrences from file
* @param file
* @param callback
*/
exports.readWords = function (file, debug, callback) {
console.log(debug + ' - Reading');
// open file stream
var stream = fs.createReadStream(file, {
flags: 'r', encoding: 'utf-8'
});
stream.on('open', function () {
console.time(debug + ' - Reading time'); // time word reading
});
// Regex to extract text
var wordsOnly = /[\w']+/g;
//var wordsOnly = /[a-zA-Z']+/g; // wayyyy faster
var words = new Map();
stream.on('data', function (chunk) {
// We might have cut off a word here,
// but chunk size makes the mistake negligible.
chunk.replace(wordsOnly, function (word) {
// We get a word here, store and count it
var e = words.get(word);
if (e) {
words.set(word, 1 + e);
} else {
words.set(word, 1);
}
});
});
stream.on('error', function (err) {
console.log(debug + ' - ', err);
callback();
});
stream.on('end', function () {
console.log(debug + ' - Read ' + words.size + ' words');
console.timeEnd(debug + ' - Reading time');
callback(words);
});
};
/**
* Create list of n-grams from Map of word occurrences
* @param words = MAP(Word=>Count)
*/
exports.nGram = function (words, debug) {
/*
2.0 N-Grams
An N-gram is an N-character slice of a longer
string. Although in the literature the term can
include the notion of any co-occurring set of
characters in a string (e.g., an N-gram made up
of the first and third character of a word), in this
paper we use the term for contiguous slices only.
Typically, one slices the string into a set of overlapping
N-grams. In our system, we use N-grams
of several different lengths simultaneously. We
also append blanks to the beginning and ending
of the string in order to help with matching
beginning-of-word and ending-of-word situations.
(We will use the underscore character (“_”)
to represent blanks.)
*/
var N = [1, 2, 3, 4, 5];
var nGrams = new Map();
console.time(debug + ' - Creating and counting n-grams');
words.forEach(function (times, word) {
var prefixedWord = '_' + word;
N.forEach(function (n) {
var paddedWord = prefixedWord + Array(n).join('_');
for (var i = paddedWord.length - n; i >= 0; i--) {
// we got an n-gram
var nGram = paddedWord.substr(i, n);
var exists = nGrams.get(nGram);
if (exists) {
nGrams.set(nGram, exists + times);
} else {
nGrams.set(nGram, times);
}
}
});
});
console.timeEnd(debug + ' - Creating and counting n-grams');
console.log(debug + ' - Got ' + nGrams.size + ' unique n-grams');
console.time(debug + ' - Weighting n-grams');
/*
3.1.5 Sort those counts into reverse order by the
number of occurrences. Keep just the Ngrams
themselves, which are now in
reverse order of frequency.
We divert from the paper at this point, we care that ngrams
with the same weight get the same "distance".
We got a map of ('ngram' => count)
put the values (count) into a sorted set
then set the values of count to the index in the set
*/
var occurrences = new SortedSet();
for (var v of nGrams.values()) {
occurrences.add(v);
}
var occurrencesLength = occurrences.length;
for (var n of nGrams) {
nGrams.set(n[0], occurrencesLength - occurrences.indexOf(n[1]));
}
console.timeEnd(debug + ' - Weighting n-grams'); // this was slow... gotta be a better way .. anywho
/*
We return a Map of n-grams with their respective weight
Save the max weight for faster comparison later on
*/
nGrams.max = occurrencesLength;
nGrams.name = debug;
return nGrams;
};
/**
* Compare 2 n-grams
* @param trainedNGram - n-gram profile of a training/category file
* @param comperativeNgram - n-gram profile of the document being classified
*/
exports.compareNGrams = function (trainedNGram, comperativeNgram) {
/*
3.2 Comparing and Ranking N-Gram Frequency Profiles
For each N-gram in the document profile, we
find its counterpart in the category profile, and
then calculate how far out of place it is. For
example, in Figure 3, the N-gram “ING” is at
rank 2 in the document, but at rank 5 in the category.
Thus it is 3 ranks out of place. If an N-gram
(such as “ED” in the figure) is not in the category
profile, it takes some maximum out-of-place
value. The sum of all of the out-of-place values
for all N-grams is the distance measure for the
document from the category.
Maximum out ouf place number was not clearly stated in the paper.
For best performace we picked one more the max oop from the test set.
*/
var maxoutOfPlace = comperativeNgram.max +1;
var sum = 0;
var undef = 0;
console.log('Comparing', trainedNGram.name, 'with', comperativeNgram.name, maxoutOfPlace);
for (var cnGram of comperativeNgram) {
var tnGram = trainedNGram.get(cnGram[0]);
// is this nGram in the trained set?
if (tnGram !== undefined && tnGram < 400) {
// cut the
sum += Math.abs(cnGram[1]-tnGram);
} else {
undef ++;
sum += maxoutOfPlace;
}
}
return {
'test set': trainedNGram.name,
//compared: comperativeNgram.name,
'hard misses': undef / comperativeNgram.size,
distance: sum,
averageOOP: ( sum + undef * maxoutOfPlace ) / comperativeNgram.size
};
};
exports.getNGramFromFile = function (filename, debug, callback) {
exports.readWords(filename, debug, function (words) {
if (words) {
return callback(exports.nGram(words, debug));
}
callback();
});
};
// Test driver: profile the test document, then rank every training
// profile by its out-of-place distance from it (closest match first).
var test = 'test/sl.test';
var train = ['test/bg_train.txt', 'test/en_train.txt', 'test/cz_train.txt', 'test/es_train.txt', 'test/train.sl'];
exports.getNGramFromFile(test, test, function (nGram) {
  // compare one training file's profile against the test profile
  var compareWithTraining = function (file, callback) {
    exports.getNGramFromFile(file, file, function (trainedNGram) {
      /*
      4.3 ... we also varied
      the number of the N-gram frequencies available
      in the profile for the match, by limiting it to
      statistics for 100, 200, 300 or 400 N-grams. This
      variable did have an impact on match performance,
      although by the 400 N-gram level language
      classification was almost perfect.
      */
      callback(null, exports.compareNGrams(trainedNGram, nGram));
    });
  };
  // print all comparison rows, best (smallest distance) first
  var reportResults = function (err, results) {
    console.log('\nResults for', nGram.name);
    results.sort(function (r1, r2) {
      return r1.distance - r2.distance;
    });
    console.table(results);
    console.log('Done.');
  };
  async.mapSeries(train, compareWithTraining, reportResults);
});
{
"name": "n-grams",
"version": "0.0.1",
"private": true,
"dependencies": {
"async": "^0.9.0",
"collections": "^1.2.2",
"console.table": "^0.4.0"
},
"engines": {
"iojs": ">= 1.6.2"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment