nlp_compromise metrics proposal as standalone example
var blacklist = {
  weaks: [
    "be", "am", "is", "are", "wa", "were", "been", "have", "do", "say", "go",
    "see", "give", "know", "want", "put", "seem", "stay", "speak", "find",
    "come", "think", "leav", "take", "feel", "watch", "begin", "hope", "exist",
    "work", "produc", "occur", "understand", "receiv", "appear", "serv",
    "need", "maintain", "chang", "introduc", "creat", "open", "consider",
    "hear", "finish", "convert", "form", "bring", "achiev", "suppos", "get",
    "got", "reach", "run", "ran", "use", "help", "show", "move", "happen",
    "fix", "set"
  ],
  fillers: [
    "absolutely", "actual", "actually", "anyway", "apparently",
    "approximately", "badly", "basically", "begin", "certainly", "clearly",
    "completely", "definitely", "easily", "effectively", "entirely",
    "especially", "essentially", "exactly", "extremely", "fairly", "frankly",
    "frequently", "fully", "generally", "hardly", "heavily", "highly",
    "hopefully", "just", "largely", "like", "literally", "maybe", "might",
    "most", "mostly", "much", "necessarily", "nicely", "obviously", "ok",
    "okay", "particularly", "perhaps", "possibly", "practically", "primarily",
    "probably", "precisely", "quite", "rather", "real", "really",
    "relatively", "right", "seriously", "significantly", "simply", "slightly",
    "so", "specifically", "start", "strongly", "surely", "too", "totally",
    "truly", "try", "typically", "ultimately", "usually", "very", "virtually",
    "whatever", "well", "whenever", "wherever", "whoever", "widely"
  ],
  vulgars: [
    "anal", "anus", "arabush", "arse", "arsehole", "ass", "asshole",
    "ballsack", "balls", "bastard", "bitch", "biatch", "bloody", "blowjob",
    "blow job", "bluegum", "bollock", "bollok", "boner", "boob", "bugger",
    "bum", "butt", "buttcrack", "buttplug", "chinaman", "clit", "clitoris",
    "cock", "cocksucker", "coon", "crap", "cunt", "damn", "dick", "dickhead",
    "dildo", "dyke", "fag", "feck", "fellate", "fellatio", "felching",
    "fuck", "fuckhead", "f u c k", "fudgepacker", "fudge packer", "flange",
    "goddamn", "gable", "god damn", "handjob", "hell", "homo", "jerk",
    "jizz", "knobend", "knob end", "labia", "lmao", "lmfao", "muff",
    "nigger", "nigga", "niggar", "omg", "penis", "piss", "poop", "prick",
    "pube", "pussy", "queer", "scrotum", "shit", "s hit", "sh1t", "slut",
    "smegma", "spunk", "sucker", "tit", "tosser", "turd", "twat", "vagina",
    "wank", "whore", "wtf"
  ]
};
var main = {};
// matchers built from the lists above:
// - weaks and vulgars are word stems, matched as prefixes ('^stem')
// - fillers are whole words, anchored on both sides ('^word$')
// note: the flag is 'i', not 'gi' - a global RegExp is stateful across
// .test() calls (lastIndex), which would make repeated checks unreliable
main.weak = new RegExp('^'.concat(blacklist.weaks.join('|^')), 'i');
main.filler = new RegExp('^'.concat(blacklist.fillers.join('$|^'), '$'), 'i');
main.vulgar = new RegExp('^'.concat(blacklist.vulgars.join('|^')), 'i');
if (typeof module !== "undefined" && module.exports) {
  module.exports = main;
}
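// A minimal usage sketch (comments only; assumes the file is saved as
// ./src/data/blacklist.js, as required by the loader file below):
// var bl = require('./src/data/blacklist');
// bl.weak.test('changed');     // true  - "chang" matches as a prefix
// bl.filler.test('actually');  // true  - fillers must match the whole word
// bl.filler.test('actuality'); // false - every alternative is anchored ^...$
// bl.vulgar.test('damnation'); // true  - prefix match, so expect false positives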
// TODO - move logic_negate and abbreviations into the lexicon as a resource file (i18n, language aware, separate data and logic)
// the best way might be a dictionary with flags, from which we can easily derive the lexicon via Object.keys and map, like
/* dictionary: {
  "CP": [
    {v: 'is', weak: 1},
    ...
  ],
  ...
};
*/
var nlp = require('nlp_compromise');
var util = require('util');
//var TEST = 'The cats we saw, e.g. tigers or leopards, are nice. I am a perfect second sentence for them. This is actually not. We\'re exclamative! Let us look back. They were beaten.';
var TEST = 'He was told that they have been hardly wounded.';
/* TODO - options, like "optimize metrics for"
// example:
// TICKS, e.g. LONGSENTENCE <:
news / mobile    18
story / desktop  25
longread         30
scientific text  45
*/
/*
TODO - use important rules from the stylebooks of AP, APA (en) and dpa (de)
e.g.:
+ (related to dates): ages:
For ages, always use figures. If the age is used as an adjective or as a substitute for a noun, it should be hyphenated. Don't use apostrophes when describing an age range.
Examples: A 21-year-old student. The student is 21 years old. The girl, 8, has a brother, 11. The contest is for 18-year-olds. He is in his 20s.
Please note that medical and political titles only need to be used on first reference when they appear outside of a direct quote.
For courtesy titles, use these on second reference or when specifically requested.
Other acronyms and abbreviations are acceptable but not required (e.g. FBI, CIA, GOP). The context should govern such decisions. Avoid "alphabet soup" ...
Use quotation marks around the titles of books, songs, television shows, computer games, poems, lectures, speeches and works of art.
Examples: Author Porter Shreve read from his new book, "When the White House Was Ours." They sang "The Star-Spangled Banner" before the game.
Do not use quotation marks around the names of magazines, newspapers, the Bible or books that are catalogues of reference materials.
Examples: The Washington Post first reported the story. He reads the Bible every morning.
When used with a date, abbreviate only the following months: Jan., Feb., Aug., Sept., Oct., Nov. and Dec.
*/
var c = {
  LONGSENTENCE: 40,
  SHORTSENTENCE: 5
};
// TODO FIXME - should go to lexicon
/*
NOTE: better performance when we apply the following additional tagging already when stemming:
'AUX': 'auxiliary verb'
'WDT': 'wh-determiner', // WHICH, WHAT, WHOSE
'WP': 'wh-pronoun', // WHICH, WHAT, WHO, WHOM
'WRB': 'wh-adverb', // HOW, WHEN, WHENCE, WHERE, WHY
'TO': 'to', // ?
'RP': 'Particle', // it would be useful if there were RPP for positive particles and RPN for negative ones
// and if there were an "opposite" mapping ...
// note: currently only "not" is handled, and it stems as a "CC"
'LS': 'List item marker',
'PDT': 'Predeterminer',
'POS': 'Possessive ending',
'SYM': 'Symbol (mathematical or scientific)',
':': 'colon',
'(': 'open parenthesis',
'``': 'open quote',
"''": 'close quote',
'#': 'pound sign (currency marker)',
'$': 'dollar sign (currency marker)',
')': 'close parenthesis',
',': 'comma',
'.': 'period'
// ?
'WP$': 'Possessive wh-pronoun', // how about demonstrativePronouns?
*/
// auxiliary verbs
var auxVerbs = ['do', 'does', 'did', 'have', 'has', 'had', 'having', 'be', 'is', 'am', 'are', 'was', 'were', 'been', 'being', 'shall', 'will', 'should', 'would', 'can', 'could', 'may', 'might', 'must'];
// auxiliary verbs and other verbs in verb groups
var verbGroups = [
  // the first item is already known to be a verb or an auxVerb
  // TODO better: pos_reason VB verb ed
  {
    aux: ['have', 'has', 'had', 'having'],
    verbs: /(en$)|(ed$)/
  },
  {
    aux: ['is', 'am', 'are', 'was', 'were', 'been', 'be', 'being', 'to be'],
    verbs: /ing$/
  },
  {
    aux: ['is', 'am', 'are', 'was', 'were', 'been', 'be', 'to be'],
    verbs: /(en$)|(ed$)/
  }
  // the last item SHOULD be a verb other than an auxVerb, or 'copula-adjective' - TODO - how to express this in the lexicon?
];
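// Worked example (comment only, tracing the loop in calculateMetrics below):
// in "has been wounded", "has" opens a group because its next token "been"
// matches group 0's /(en$)/; "wounded" is then appended by the group-end rule
// (a non-auxiliary verb while a group is open), giving one verb group
// [has, been, wounded]. Irregular participles such as "told" slip through
// these suffix regexes (cf. the TEST sentence above).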
// passive voice
var passiveVoiceAux = ["am", "is", "are", "was", "were", "be", "been", "being"];
// subset of determiners
var demonstrativePronouns = ['this', 'that', 'these', 'those', 'such', 'none', 'neither'];
var specialDemonstrativePronouns = ['this', 'that'];
var whDeterminers = ['which', 'what', 'whose'];
// other wh-stuff, see http://www.garfixia.nl/k/news/view/1442/15/the-what-why-and-how-of-wh-words.html
var whPronouns = ['which', 'what', 'who', 'whom'];
var whAdverbs = ['how', 'when', 'whence', 'where', 'why'];
// entity substitutions
var entitySubstitutions = ['it', 'he', 'him', 'she', 'her', 'i', 'me', 'we', 'us', 'they', 'them', 'you', 'there', 'here', 'thing', 'stuff', 'fact', 'this', 'that'];
// nominalizations
var nominalizationRe = new RegExp('(?:ion|ions|ism|isms|ty|ties|ment|ments|ness|nesses|ance|ances|ence|ences)$');
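// e.g. (comment only): "information", "criticism", "acceptance" and
// "happiness" end in one of these suffixes; the per-token check further down
// additionally requires more than 7 characters before it counts a token as a
// nominalization.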
// end ^ TODO FIXME - should go to lexicon
// EXTEND ARRAY PROTOTYPE
Array.prototype.average = function() {
  // TODO - in other contexts we MUST handle values other than typeof 'number' !!!
  var r = {mean: 0, variance: 0, deviation: 0}, t = this.length;
  for (var m, s = 0, l = t; l--; s += this[l]);
  for (m = r.mean = s / t, l = t, s = 0; l--; s += Math.pow(this[l] - m, 2));
  return r.deviation = Math.sqrt(r.variance = s / t), r;
};
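// e.g. [2, 4, 4, 4, 5, 5, 7, 9].average()
// -> { mean: 5, variance: 4, deviation: 2 } (population variance)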
Array.prototype.unique = function() {
  return this.reduce(function(p, c) {
    if (p.indexOf(c) < 0) p.push(c);
    return p;
  }, []);
};
Array.prototype.sequences = function() {
  var lastI = -1;
  var results = [[]];
  this.forEach(function(i) {
    if (i != lastI + 1 && lastI > -1) results.push([]);
    results[results.length - 1].push(i);
    lastI = i;
  });
  return results;
};
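// e.g. [1, 2, 3, 7, 8, 10].sequences() -> [[1, 2, 3], [7, 8], [10]]
// (splits a sorted list of token indices into runs of consecutive indices)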
function decimals(f, dec) {
  // TODO, v2
  // stub, currently used for toPercent which will become readable with
  // decimals rounded and percent values and value/unit etc.
  if (!dec) dec = 2;
  return parseFloat(f.toFixed(dec)); // was hard-coded to 2 decimals, ignoring dec
}
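// e.g. decimals(3.14159) -> 3.14; decimals(3.14159, 3) -> 3.142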
function calculateMetrics(txt) {
  var processed = nlp.pos(txt);
  var metrics = {
    sentenceCount: 0,
    wordCount: 0,
    characterCount: 0,
    characterCountTrimmed: 0,
    uselessBoundaries: 0,
    vocabularySize: 0,
    wordsPerSentence: 0,
    wordsPerSentenceStd: -1,
    longSentencesRatio: 0,
    shortSentencesRatio: 0,
    declarativeRatio: 0,
    interrogativeRatio: 0,
    exclamativeRatio: 0,
    charactersPerWords: 0,
    syllablesPerWord: 0,
    negationsPerSentence: 0,
    stopwordRatio: 0,
    nounRatio: 0,
    nounClusterRatio: 0,
    pronounRatio: 0,
    verbRatio: 0,
    adjectiveRatio: 0,
    adverbRatio: 0,
    otherPosRatio: 0,
    modalRatio: 0,
    nominalizationRatio: 0,
    entitySubstitutionRatio: 0,
    weakVerbRatio: 0,
    vulgarWordRatio: 0,
    verbGroupsPerSentence: 0,
    passiveVoicePerSentence: 0,
    fillerRatio: 0,
    readability: 0
  };
  var sentences = processed.sentences;
  // count number of sentences
  // sentenceCount
  metrics.sentenceCount = sentences.length;
  var stems = [];
  var sentencesCounts = [];
  var charactersPerWordsCounts = [];
  var syllablesCount = 0;
  var negationsCount = 0;
  // depends on other nouns
  var nounClusterCount = 0;
  // depends on wordCount
  var tCounts = {
    noun: 0,
    pronoun: 0,
    pronounNonpossesive: 0,
    verb: 0,
    adverb: 0,
    adjective: 0,
    modalVerb: 0,
    weakVerb: 0,
    vulgarWord: 0,
    filler: 0
  };
  // question: we have one minor issue with the TAGS:
  // "CP" is a copula, but still a verb. We think it is e.g. different from the noun/pronoun relation - SHOULD it be called VCP???
  var _types = { N: 'noun', P: 'pronoun', V: 'verb', C: 'verb', R: 'adverb', J: 'adjective', M: 'modalVerb' };
  // for further calculation purposes
  var data = {
    nominalizations: [],
    entitySubstitutions: []
  };
  var nounCluster = function(token, _nounsCount) {
    if (!_nounsCount || _nounsCount < 1) _nounsCount = token.normalised.match(/\S+/g).length;
    // count clustered nouns (3 or more, possibly joined by 'of')
    var n = token.analysis.next;
    if (n && _nounsCount < 10 && (n.pos.tag.slice(0, 1) === 'N' || n.normalised === 'of')) {
      if (n.normalised != 'of') _nounsCount++;
      // walk on from the next token and propagate the result
      // (the original recursed on `token` itself and dropped the return
      // value, so the cluster size never reached the caller)
      return nounCluster(n, _nounsCount);
    } else if (_nounsCount > 2) {
      return _nounsCount;
    } else {
      return 0;
    }
  };
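  // e.g. (comment only, assuming the tagger marks all four tokens as nouns):
  // for "the tax reform committee report", the walk starting at "tax" visits
  // three further nouns and returns 4; for a single noun or a pair it returns 0.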
  var verbGroupBegin = function(o) {
    return (o.hasOwnProperty('pos') && o.analysis.next && (o.pos.parent === 'verb' || auxVerbs.indexOf(o.normalised) > -1));
  };
  var verbGroupEnd = function(o) {
    // parenthesised so that the hasOwnProperty guard covers both alternatives
    return (o.hasOwnProperty('pos') && ((o.pos.parent === 'verb' && auxVerbs.indexOf(o.normalised) < 0) || o.pos_reason === 'copula-adjective'));
  };
  sentences.forEach(function(sentence, sI) {
    //var sText = sentence.text();
    console.log('!s', sentence.text());
    sentences[sI].metrics = {};
    // count number of words
    // wordCount
    data.nominalizations[sI] = [];
    data.entitySubstitutions[sI] = [];
    // (the original tested the sentence object for 'groupTokens', not the freshly created metrics object)
    if (!(sentences[sI].metrics.hasOwnProperty('groupTokens'))) sentences[sI].metrics.groupTokens = [];
    metrics.wordCount = metrics.wordCount + sentence.tokens.length;
    // count verb groups
    // handled rule group id and last group token
    var l = 0;
    var groupId = 0;
    var last = {i: 0};
    stems = stems.concat(sentence.tokens.map(function(token, i) {
      if (!(sentences[sI].metrics.groupTokens.length)) sentences[sI].metrics.groupTokens.push([]);
      l = sentences[sI].metrics.groupTokens.length;
      // count verb groups
      if ((!(last.i) || last.i < i) && verbGroupBegin(token)) {
        // could be a normalized verb group
        // note: does not cover phrasal verbs
        var next = token.analysis.next;
        var iNext = i + 1;
        verbGroups.every(function(group, gI) {
          if (gI >= groupId) {
            if (group.aux.indexOf(next.normalised) > -1 || group.verbs.test(next.normalised) || next.pos_reason === 'copula-adjective') {
              groupId = gI;
              sentences[sI].metrics.groupTokens[l - 1].push(i);
              sentences[sI].metrics.groupTokens[l - 1].push(iNext);
              last = sentence.tokens[iNext];
              last.i = iNext;
              return false;
            }
          }
          return true; // keep scanning the remaining groups; every() stops on falsy
        });
      }
      console.log(last.i, i);
      // separate multiple verb groups TODO TEST - "special clusters"
      l = sentences[sI].metrics.groupTokens.length;
      if (last.i === i && verbGroupEnd(token)) {
        groupId = 0;
        sentences[sI].metrics.groupTokens.push([]);
      } else if (i > 0 && last.i != i && !verbGroupEnd(last)) {
        console.log('hasEnd', verbGroupEnd(last), token.text);
        if (verbGroupEnd(token)) {
          console.log('Could be End: ', token.text);
          sentences[sI].metrics.groupTokens[l - 1].push(i);
          groupId = 0;
          sentences[sI].metrics.groupTokens.push([]);
        }
      }
      //console.log(token.pos.tag, token.normalised, token.pos_reason/*, token*/);
      // TODO - ISSUE with negation: logic_negate only works in one direction FIXME CONTRIB
      // test http://rawgit.com/spencermountain/nlp_compromise/master/client_side/basic_demo/index.html :
      // example: joe never swims in the pool.
      if (token.analysis.negative) negationsCount++;
      //console.log( 'token: ', token );
      data.entitySubstitutions[sI][i] = (token.normalised != 'i' && (entitySubstitutions.indexOf(token.normalised) > -1) && !(token.capitalised));
      if (data.entitySubstitutions[sI][i] && specialDemonstrativePronouns.indexOf(token.normalised) > -1) {
        if (token.analysis.last) {
          var firsttwo = token.analysis.last.pos.tag.slice(0, 2);
          if (['NN', 'PR'].indexOf(firsttwo) > -1) data.entitySubstitutions[sI][i] = false;
        }
        if (token.analysis.next) {
          var firsttwo = token.analysis.next.pos.tag.slice(0, 2);
          if (['NN', 'PR', 'JJ', 'DT'].indexOf(firsttwo) > -1) data.entitySubstitutions[sI][i] = false;
          /*, 'WD', 'WP' // see above TODO, handled below */
          if (whDeterminers.concat(whPronouns).indexOf(token.normalised) > -1) data.entitySubstitutions[sI][i] = false;
        }
      }
      if (data.entitySubstitutions[sI][i]) {
        // tokens don't carry a metrics object by default - create it lazily
        sentences[sI].tokens[i].metrics = sentences[sI].tokens[i].metrics || {};
        sentences[sI].tokens[i].metrics.entitySubstitution = true;
      }
      // count number of different parts of speech
      var typeId = token.pos.tag.slice(0, 1);
      console.log('token: ', token.text, token.pos.tag, token.pos.parent, token.pos_reason);
      //console.log( 'token3: ', typeId, _types[typeId] );
      if (_types.hasOwnProperty(typeId)) tCounts[_types[typeId]]++;
      // count characters per word
      charactersPerWordsCounts.push(token.text.length);
      data.nominalizations[sI][i] = false;
      if (typeId === 'N') {
        // count clustered nouns
        var curClusterCount = nounCluster(token);
        if (curClusterCount) nounClusterCount += curClusterCount;
        // count nominalizations
        // (was: token.pos.tag.indexOf('NNP' === 0), i.e. indexOf(false), which is
        // always -1 and thus truthy; likewise .search() returns -1 for "no match",
        // which is also truthy - both comparisons were missing)
        var isNNP = token.pos.tag.indexOf('NNP') === 0;
        if (isNNP) data.nominalizations[sI][i] = (token.text.length > 7) && (token.normalised.search(nominalizationRe) > -1);
      }
      if (data.nominalizations[sI][i]) {
        sentences[sI].tokens[i].metrics = sentences[sI].tokens[i].metrics || {};
        sentences[sI].tokens[i].metrics.nominalization = true;
      }
      if (typeId === 'V') {
        // count weak verbs
        var check = (token.pos.tense === 'present') ? token.normalised : token.analysis.conjugate().infinitive;
        if (nlp.blacklist.weak.test(check)) tCounts.weakVerb++;
      }
      // count vulgar words, fillers etc.
      if (nlp.blacklist.vulgar.test(token.normalised)) tCounts.vulgarWord++;
      if (nlp.blacklist.filler.test(token.normalised)) tCounts.filler++;
      var syllables = nlp.syllables(token.text);
      if (syllables) syllablesCount = syllablesCount + syllables.length;
      return token.normalised;
    }));
    if (sentences[sI].metrics.groupTokens.length) sentences[sI].metrics.groupTokens = sentences[sI].metrics.groupTokens.filter(function(ts) {
      return (ts.length);
    });
    if (sentences[sI].metrics.groupTokens.length) {
      // we found verb groups ...
      var readableTokens = sentences[sI].metrics.groupTokens.map(function(ts) {
        return ts.map(function(tId) { return sentences[sI].tokens[tId].normalised; }).join(' ');
      });
      sentences[sI].metrics.passiveVoiceTokens = [];
      sentences[sI].metrics.groupTokens.forEach(function(ts, i) {
        var isPassive = -1;
        ts.forEach(function(tId) {
          sentences[sI].tokens[tId].verbGroup = i;
          console.log(sentences[sI].tokens[tId].pos_reason);
          if (passiveVoiceAux.indexOf(sentences[sI].tokens[tId].normalised) > -1 ||
            sentences[sI].tokens[tId].pos_reason === 'copula-adjective' ||
            sentences[sI].tokens[tId].pos_reason === 'ed'
          ) isPassive++;
        });
        if (isPassive > 0) sentences[sI].metrics.passiveVoiceTokens.push(ts);
      });
      console.log('sentence end, verb groups raw: ', sentences[sI].metrics.groupTokens);
      console.log('verb groups: ', readableTokens);
      console.log('passive groups: ', sentences[sI].metrics.passiveVoiceTokens);
    }
    sentencesCounts.push(sentence.tokens.length);
  });
  // TODO - find phrasal verbs
  if (sentencesCounts.length > 0) {
    // count number of words per sentence and its standard deviation
    if (metrics.sentenceCount) {
      // wordsPerSentence
      metrics.wordsPerSentence = (sentencesCounts.reduce(function(a, b) { return a + b; })) / metrics.sentenceCount;
      // negationsPerSentence
      if (negationsCount) metrics.negationsPerSentence = negationsCount / metrics.sentenceCount;
    }
    if (sentences.length >= 10) {
      // wordsPerSentenceStd (only meaningful with enough sentences)
      metrics.wordsPerSentenceStd = sentencesCounts.average().deviation;
    }
  }
  // find extra long and short sentences
  if (sentences.length) {
    // longSentencesRatio
    var _longs = sentencesCounts.filter(function(sCount) {
      return sCount >= c.LONGSENTENCE;
    });
    metrics.longSentencesRatio = _longs.length / sentencesCounts.length;
    // shortSentencesRatio
    var _shorts = sentencesCounts.filter(function(sCount) {
      return sCount <= c.SHORTSENTENCE;
    });
    metrics.shortSentencesRatio = _shorts.length / sentencesCounts.length;
    if (metrics.sentenceCount) {
      // count sentence types based on the ending punctuation mark
      // declarativeRatio, interrogativeRatio, exclamativeRatio
      var types = sentences.map(function(s) { return s.type; });
      ['declarative', 'interrogative', 'exclamative'].forEach(function(type) {
        var typeCount = types.filter(function(v) { return v === type; }).length;
        metrics[type.concat('Ratio')] = typeCount / metrics.sentenceCount;
      });
    }
  }
  // find vocabulary size
  // vocabularySize
  metrics.vocabularySize = stems.unique().length;
  // count number of characters in the whole RAW text
  // characterCount
  var d = txt.trim();
  metrics.characterCount = d.length;
  var uselessBoundaries = d.match(/\s{2,}/g); // runs of redundant whitespace (\s already covers \t)
  if (uselessBoundaries) {
    var ub = uselessBoundaries.map(function(b) { return b.length; });
    metrics.uselessBoundaries = ub.length;
    metrics.characterCountTrimmed = d.length - (ub.reduce(function(a, b) { return a + b; }) - ub.length);
  } else {
    metrics.characterCountTrimmed = d.length;
  }
  // counts per sentence
  if (metrics.sentenceCount) {
    // count verb groups
    // verbGroupsPerSentence
    var groupsCount = sentences.map(function(s) { return s.metrics.groupTokens.length || 0; }).reduce(function(a, b) { return a + b; });
    metrics.verbGroupsPerSentence = groupsCount / metrics.sentenceCount;
    // count passive voice cases
    // passiveVoicePerSentence (special verb groups)
    // passiveVoiceTokens is only set for sentences that have verb groups, so guard it
    var passiveVoiceCount = sentences.map(function(s) { return (s.metrics.passiveVoiceTokens || []).length; }).reduce(function(a, b) { return a + b; });
    metrics.passiveVoicePerSentence = passiveVoiceCount / metrics.sentenceCount;
  }
  // counts per word
  if (metrics.wordCount) {
    // count number of syllables per word
    // syllablesPerWord
    if (syllablesCount) metrics.syllablesPerWord = syllablesCount / metrics.wordCount;
    // count number of characters per word
    // charactersPerWords
    // (an array is always truthy - test its length instead)
    if (charactersPerWordsCounts.length) metrics.charactersPerWords = (charactersPerWordsCounts.reduce(function(a, b) { return a + b; })) / metrics.wordCount;
    // ratios for types of words, weak and vulgar words
    ['noun', 'pronoun', 'verb', 'adverb', 'adjective', 'modalVerb', 'weakVerb', 'vulgarWord', 'filler'].forEach(function(d) {
      if (tCounts[d]) metrics[d.concat('Ratio')] = tCounts[d] / metrics.wordCount;
    });
    metrics.otherPosRatio = 1 - metrics.nounRatio - metrics.pronounRatio - metrics.verbRatio - metrics.adjectiveRatio - metrics.adverbRatio;
  }
  // counts per noun
  if (tCounts.noun) {
    // nounClusterRatio
    if (nounClusterCount) metrics.nounClusterRatio = nounClusterCount / tCounts.noun;
    // nominalizationRatio and entitySubstitutionRatio:
    // TODO - make sure tCounts.noun contains what the Python NLTK original calls "pronoun_nonpossesive"
    var nominCount = 0;
    data.nominalizations.forEach(function(n) { nominCount += n.filter(function(v) { return (v); }).length; });
    metrics.nominalizationRatio = nominCount / tCounts.noun;
    var entitySubCount = 0;
    data.entitySubstitutions.forEach(function(n) { entitySubCount += n.filter(function(v) { return (v); }).length; });
    metrics.entitySubstitutionRatio = entitySubCount / tCounts.noun;
  }
  // estimate text readability using the Flesch-Kincaid Grade Level test
  // TODO short texts ...
  if (/*(metrics.wordCount >= 100) &&*/ metrics.wordsPerSentence && metrics.syllablesPerWord) {
    metrics.readability = 0.39 * metrics.wordsPerSentence + 11.8 * metrics.syllablesPerWord - 15.59;
  }
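  // worked example (comment only): at 10 words per sentence and 1.5 syllables
  // per word, 0.39 * 10 + 11.8 * 1.5 - 15.59 = 6.01, i.e. roughly a US
  // 6th-grade reading level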
  // count number of stopwords
  // stopwordRatio
  /* TODO (Python sketch)
  + Named-Entities (dynamic) !!!
  ? rare words / rareWordsRatio
  //
  # count number of stopwords
  data['stopwords'] = [None] * len(tokens)
  for idx, word in enumerate(words):
      if word in stopset:
          metrics['stopword_ratio'] += 1
          data['stopwords'][word2token_map[idx]] = True
      else:
          data['stopwords'][word2token_map[idx]] = False
  if metrics['wordCount']:
      metrics['stopword_ratio'] /= metrics['wordCount']
  # count rare words
  if len(words):
      metrics['rare_word_ratio'] = data['expected_word_frequencies'].count(0) / len(words)
  else:
      metrics['rare_word_ratio'] = 0
  # count word, bigram, and trigram frequencies
  // ...
  // ???
  # fix some verbs ending in -ing being counted as nouns
  for idx, token in enumerate(tokens):
      if (token[-3:] == 'ing') and (idx < len(tokens)) and (data['parts_of_speech'][idx+1] == 'IN'):
          data['parts_of_speech'][idx] = 'VBG'
  // ??? see below
  # find auxiliary verbs
  for i in range(verb_group_count):
      verb_group_stack = [idx for idx in range(len(tokens)) if data['verb_groups'][idx] == i+1]
      for j in verb_group_stack[:-1]:
          auxiliary_verbs[j] = True
  // ???
  data['weak_verbs'][idx] = (data['parts_of_speech'][idx][:2] == 'VB') and (data['stems'][idx] in dict_weak_verbs)
  if data['weak_verbs'][idx] and auxiliary_verbs[idx]:
      data['weak_verbs'][idx] = False
  */
  stems = null;
  return metrics;
}
var toPercent = function(o) {
  var percentMetrics = {};
  for (var k in o) {
    percentMetrics[k.replace('Ratio', 'Percent')] = decimals((k.indexOf('Ratio') < 0) ? o[k] : ((o[k]) ? o[k] * 100 : 0));
  }
  return percentMetrics;
};
var metrics = calculateMetrics(TEST);
console.log(metrics);
console.log(toPercent(metrics));
/* appendix, reasoning
NN
"before a modal" // if it's before a modal verb, it's a noun -> lkjsdf would
"determiner-verb" // if it's after a determiner, it's not a verb -> the walk
"capitalised" // it has a capital and isn't the first word
"need one verb" // if there is no verb in the sentence, there needs to be one
VB
"after an adverb" // if it's after an adverb, it's not a noun -> quickly acked
"ed" // set ambiguous 'ed' endings as either verb/adjective
RB
"consecutive_adjectives" // no consecutive, unpunctuated adjectives -> real good
JJ
"copula-adjective" // copulas are followed by a determiner ("are a ..") or an adjective ("are good")
"copula-adverb-adjective" // copula, adverb, verb -> copula adverb adjective -> is very lkjsdf
UH
"wordless_string" // punctuation - like ' -- ' etc.
CD
"parsefloat" // see if it's a number
---
lex
"lexicon" // known words list
parts_of_speech[wordnet_suffixes[suffix]]
"wordnet suffix" // suffix pos signals from wordnet
r
"regex suffix" // suffix regexes for words
// + last pass
sentence.tokens = sentence.tokens.map(function(token, i) {
  var next = sentence.tokens[i + 1]
  var prev = sentence.tokens[i - 1]
  if (token.pos) {
    // suggest noun after determiners (a|the), possessive pronouns (her|my|its)
    if (token.pos.tag == "DT" || token.pos.tag == "PP") {
      need = 'NN'
      reason = token.pos.name
    }
    // suggest verb after personal pronouns (he|she|they), modal verbs (would|could|should)
    if (token.pos.tag == "PRP" || token.pos.tag == "MD") {
      need = 'VB'
      reason = token.pos.name
    }
  }
  if (need && !token.pos) {
    token.pos = parts_of_speech[need]
    token.pos_reason = "signal from " + reason
  }
  if (need == 'VB' && token.pos.parent == 'verb') {
    need = null
  }
  if (need == 'NN' && token.pos.parent == 'noun') {
    need = null
  }
  return token
})
*/
// nlp_compromise by @spencermountain in 2014
// most files are self-contained modules that optionally export for nodejs
// this file loads them all together
// if we're server-side, grab the files; otherwise assume they're prepended already
if (typeof module !== "undefined" && module.exports) {
  var parents = require("./src/parents/parents");
  var sentence_parser = require('./src/methods/tokenization/sentence').sentences;
  var tokenize = require('./src/methods/tokenization/tokenize').tokenize;
  var ngram = require('./src/methods/tokenization/ngram').ngram;
  //tokenize
  var normalize = require('./src/methods/transliteration/unicode_normalisation');
  var syllables = require('./src/methods/syllables/syllable');
  //localization
  var local = require('./src/methods/localization/britishize');
  var americanize = local.americanize;
  var britishize = local.britishize;
  //part of speech tagging
  var pos = require('./src/pos');
  //named_entity_recognition
  var spot = require('./src/spot');
  //weak verbs, vulgar words etc. TODO - goes to metrics ...
  var bl = require('./src/data/blacklist');
}
///
// api
var nlp = {
  noun: parents.noun,
  adjective: parents.adjective,
  verb: parents.verb,
  adverb: parents.adverb,
  value: parents.value,
  sentences: sentence_parser,
  ngram: ngram,
  tokenize: tokenize,
  americanize: americanize,
  britishize: britishize,
  syllables: syllables,
  normalize: normalize.normalize,
  denormalize: normalize.denormalize,
  pos: pos,
  spot: spot,
  blacklist: bl
  // tests: tests,
};
//export it for server-side
if (typeof module !== "undefined" && module.exports) {
  module.exports = nlp;
}
// bump bower
// git tag -a v0.3.5 -m "tag bower release"
// git push origin master --tags
// console.log( nlp.pos('she sells seashells by the seashore').sentences[0].negate().text() )
// console.log( nlp.pos('i will slouch').to_past().text() )
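// and, tying in the metrics proposal above (a sketch; assumes the blacklist
// file from this gist is saved at ./src/data/blacklist):
// console.log( nlp.blacklist.filler.test('actually') ) // true
// console.log( nlp.blacklist.weak.test('changed') )    // true (prefix "chang")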