Created
April 13, 2015 09:08
-
-
Save MarZab/176fad509fbd4a8babc8 to your computer and use it in GitHub Desktop.
JavaScript Implementation of N-Gram Text Categorisation based on paper by William B. Cavnar and John M. Trenkle
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'use strict'; | |
/* | |
N-Gram-Based Text Categorisation | |
Implementation by [email protected] | |
28.3.2015 All rights reserved. | |
Based on the paper by: | |
William B. Cavnar and John M. Trenkle | |
Environmental Research Institute of Michigan | |
P.O. Box 134001 | |
Ann Arbor MI 48113-4001 | |
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.42.4093&rep=rep1&type=pdf | |
Needs (npm install): | |
IO.JS, for ECMAScript 6 - https://iojs.org/en/index.html | |
Collections, for SortedSet - http://www.collectionsjs.com/ | |
Console.table, for lovely table output - https://github.com/bahmutov/console.table | |
*/ | |
var SortedSet = require("collections/sorted-set"); | |
require('console.table'); | |
var fs = require('fs'); | |
var async = require('async'); | |
/** | |
* Create Map of word occurances from file | |
* @param file | |
* @param callback | |
*/ | |
exports.readWords = function (file, debug, callback) { | |
console.log(debug + ' - Reading'); | |
// open file stream | |
var stream = fs.createReadStream(file, { | |
flags: 'r', encoding: 'utf-8' | |
}); | |
stream.on('open', function () { | |
console.time(debug + ' - Reading time'); // time word reading | |
}); | |
// Regex to extract text | |
var wordsOnly = /[\w']+/g; | |
//var wordsOnly = /[a-zA-Z']+/g; // wayyyy faster | |
var words = new Map(); | |
stream.on('data', function (chunk) { | |
// We might have cut off a word here, | |
// but chunk size makes the mistake negligible. | |
chunk.replace(wordsOnly, function (word) { | |
// We get a word here, store and count it | |
var e = words.get(word); | |
if (e) { | |
words.set(word, 1 + e); | |
} else { | |
words.set(word, 1); | |
} | |
}); | |
}); | |
stream.on('error', function (err) { | |
console.log(debug + ' - ', err); | |
callback(); | |
}); | |
stream.on('end', function () { | |
console.log(debug + ' - Read ' + words.size + ' words'); | |
console.timeEnd(debug + ' - Reading time'); | |
callback(words); | |
}); | |
}; | |
/** | |
* Create list of n-grams from Map of word occurances | |
* @param words = MAP(Word=>Count) | |
*/ | |
exports.nGram = function (words, debug) { | |
/* | |
2.0 N-Grams | |
An N-gram is an N-character slice of a longer | |
string. Although in the literature the term can | |
include the notion of any co-occurring set of | |
characters in a string (e.g., an N-gram made up | |
of the first and third character of a word), in this | |
paper we use the term for contiguous slices only. | |
Typically, one slices the string into a set of overlapping | |
N-grams. In our system, we use N-grams | |
of several different lengths simultaneously. We | |
also append blanks to the beginning and ending | |
of the string in order to help with matching | |
beginning-of-word and ending-of-word situations. | |
(We will use the underscore character (“_”) | |
to represent blanks.) | |
*/ | |
var N = [1, 2, 3, 4, 5]; | |
var nGrams = new Map(); | |
console.time(debug + ' - Creating and counting n-grams'); | |
words.forEach(function (times, word) { | |
var prefixedWord = '_' + word; | |
N.forEach(function (n) { | |
var paddedWord = prefixedWord + Array(n).join('_'); | |
for (var i = paddedWord.length - n; i >= 0; i--) { | |
// we got an n-gram | |
var nGram = paddedWord.substr(i, n); | |
var exists = nGrams.get(nGram); | |
if (exists) { | |
nGrams.set(nGram, exists + times); | |
} else { | |
nGrams.set(nGram, times); | |
} | |
} | |
}); | |
}); | |
console.timeEnd(debug + ' - Creating and counting n-grams'); | |
console.log(debug + ' - Got ' + nGrams.size + ' unique n-grams'); | |
console.time(debug + ' - Weighting n-grams'); | |
/* | |
3.1.5 Sort those counts into reverse order by the | |
number of occurrences. Keep just the Ngrams | |
themselves, which are now in | |
reverse order of frequency. | |
We divert from the paper at this point, we care that ngrams | |
with the same weight get the same "distance". | |
We got a map of ('ngram' => count) | |
put the values (count) into a sorted set | |
then set the values of count to the index in the set | |
*/ | |
var occurrences = new SortedSet(); | |
for (var v of nGrams.values()) { | |
occurrences.add(v); | |
} | |
var occurrencesLength = occurrences.length; | |
for (var n of nGrams) { | |
nGrams.set(n[0], occurrencesLength - occurrences.indexOf(n[1])); | |
} | |
console.timeEnd(debug + ' - Weighting n-grams'); // this was slow... gotta be a better way .. anywho | |
/* | |
We return a Map of n-grams with their respective weight | |
Save the max weight for faster comparison later on | |
*/ | |
nGrams.max = occurrencesLength; | |
nGrams.name = debug; | |
return nGrams; | |
}; | |
/** | |
* Compare 2 n-grams | |
* @param nGram1 | |
* @param nGram2 | |
*/ | |
exports.compareNGrams = function (trainedNGram, comperativeNgram) { | |
/* | |
3.2 Comparing and Ranking N-Gram Frequency Profiles | |
For each N-gram in the document profile, we | |
find its counterpart in the category profile, and | |
then calculate how far out of place it is. For | |
example, in Figure 3, the N-gram “ING” is at | |
rank 2 in the document, but at rank 5 in the category. | |
Thus it is 3 ranks out of place. If an N-gram | |
(such as “ED” in the figure) is not in the category | |
profile, it takes some maximum out-of-place | |
value. The sum of all of the out-of-place values | |
for all N-grams is the distance measure for the | |
document from the category. | |
Maximum out ouf place number was not clearly stated in the paper. | |
For best performace we picked one more the max oop from the test set. | |
*/ | |
var maxoutOfPlace = comperativeNgram.max +1; | |
var sum = 0; | |
var undef = 0; | |
console.log('Comparing', trainedNGram.name, 'with', comperativeNgram.name, maxoutOfPlace); | |
for (var cnGram of comperativeNgram) { | |
var tnGram = trainedNGram.get(cnGram[0]); | |
// is this nGram in the trained set? | |
if (tnGram !== undefined && tnGram < 400) { | |
// cut the | |
sum += Math.abs(cnGram[1]-tnGram); | |
} else { | |
undef ++; | |
sum += maxoutOfPlace; | |
} | |
} | |
return { | |
'test set': trainedNGram.name, | |
//compared: comperativeNgram.name, | |
'hard misses': undef / comperativeNgram.size, | |
distance: sum, | |
averageOOP: ( sum + undef * maxoutOfPlace ) / comperativeNgram.size | |
}; | |
}; | |
exports.getNGramFromFile = function (filename, debug, callback) { | |
exports.readWords(filename, debug, function (words) { | |
if (words) { | |
return callback(exports.nGram(words, debug)); | |
} | |
callback(); | |
}); | |
}; | |
// test | |
var test = 'test/sl.test'; | |
var train = ['test/bg_train.txt', 'test/en_train.txt', 'test/cz_train.txt', 'test/es_train.txt', 'test/train.sl']; | |
exports.getNGramFromFile(test, test, function (nGram) { | |
async.mapSeries( | |
train, | |
function (file, callback) { | |
exports.getNGramFromFile(file, file, function (trainedNGram) { | |
/* | |
4.3 ... we also varied | |
the number of the N-gram frequencies available | |
in the profile for the match, by limiting it to | |
statistics for 100, 200, 300 or 400 N-grams. This | |
variable did have an impact on match performance, | |
although by the 400 N-gram level language | |
classification was almost perfect. | |
*/ | |
callback(null, exports.compareNGrams(trainedNGram, nGram)); | |
}); | |
}, | |
function (err, results) { | |
console.log('\nResults for', nGram.name); | |
results.sort(function (r1, r2) { | |
return r1.distance - r2.distance; | |
}); | |
console.table(results); | |
console.log('Done.'); | |
} | |
); | |
}); | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "n-grams", | |
"version": "0.0.1", | |
"private": true, | |
"dependencies": { | |
"async": "^0.9.0", | |
"collections": "^1.2.2", | |
"console.table": "^0.4.0" | |
}, | |
"engines": { | |
"iojs": ">= 1.6.2" | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment