nlp_compromise metrics proposal as standalone example
var blacklist = {
  weaks: [
    "be", "am", "is", "are", "wa", "were", "been", "have", "do", "say", "go",
    "see", "give", "know", "want", "put", "seem", "stay", "speak", "find",
    "come", "think", "leav", "take", "feel", "watch", "begin", "hope", "exist",
    "work", "produc", "occur", "understand", "receiv", "appear", "serv",
    "need", "maintain", "chang", "introduc", "creat", "open", "consider",
    "hear", "finish", "convert", "form", "bring", "achiev", "suppos", "get",
    "got", "reach", "run", "ran", "use", "help", "show", "move", "happen",
    "fix", "set"
  ],
  fillers: [
    "absolutely", "actual", "actually", "anyway", "apparently",
    "approximately", "badly", "basically", "begin", "certainly", "clearly",
    "completely", "definitely", "easily", "effectively", "entirely",
    "especially", "essentially", "exactly", "extremely", "fairly", "frankly",
    "frequently", "fully", "generally", "hardly", "heavily", "highly",
    "hopefully", "just", "largely", "like", "literally", "maybe", "might",
    "most", "mostly", "much", "necessarily", "nicely", "obviously", "ok",
    "okay", "particularly", "perhaps", "possibly", "practically", "primarily",
    "probably", "precisely", "quite", "rather", "real", "really",
    "relatively", "right", "seriously", "significantly", "simply", "slightly",
    "so", "specifically", "start", "strongly", "surely", "too", "totally",
    "truly", "try", "typically", "ultimately", "usually", "very", "virtually",
    "whatever", "well", "whenever", "wherever", "whoever", "widely"
  ],
  vulgars: [
    "anal", "anus", "arabush", "arse", "arsehole", "ass", "asshole",
    "ballsack", "balls", "bastard", "bitch", "biatch", "bloody", "blowjob",
    "blow job", "bluegum", "bollock", "bollok", "boner", "boob", "bugger",
    "bum", "butt", "buttcrack", "buttplug", "chinaman", "clit", "clitoris",
    "cock", "cocksucker", "coon", "crap", "cunt", "damn", "dick", "dickhead",
    "dildo", "dyke", "fag", "feck", "fellate", "fellatio", "felching",
    "fuck", "fuckhead", "f u c k", "fudgepacker", "fudge packer", "flange",
    "goddamn", "gable", "god damn", "handjob", "hell", "homo", "jerk",
    "jizz", "knobend", "knob end", "labia", "lmao", "lmfao", "muff",
    "nigger", "nigga", "niggar", "omg", "penis", "piss", "poop", "prick",
    "pube", "pussy", "queer", "scrotum", "shit", "s hit", "sh1t", "slut",
    "smegma", "spunk", "sucker", "tit", "tosser", "turd", "twat", "vagina",
    "wank", "whore", "wtf"
  ]
};
var main = {};
// matchers built from the lists above:
// - weaks and vulgars are word stems, matched as prefixes ('^stem')
// - fillers are whole words, anchored on both sides ('^word$')
// note: the flag is 'i', not 'gi' - a global RegExp is stateful across
// .test() calls (lastIndex), which would make repeated checks unreliable
main.weak = new RegExp('^'.concat(blacklist.weaks.join('|^')), 'i');
main.filler = new RegExp('^'.concat(blacklist.fillers.join('$|^'), '$'), 'i');
main.vulgar = new RegExp('^'.concat(blacklist.vulgars.join('|^')), 'i');
if (typeof module !== "undefined" && module.exports) {
  module.exports = main;
}
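// A minimal usage sketch (comments only; assumes the file is saved as
// ./src/data/blacklist.js, as required by the loader file below):
// var bl = require('./src/data/blacklist');
// bl.weak.test('changed');     // true  - "chang" matches as a prefix
// bl.filler.test('actually');  // true  - fillers must match the whole word
// bl.filler.test('actuality'); // false - every alternative is anchored ^...$
// bl.vulgar.test('damnation'); // true  - prefix match, so expect false positives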
// TODO - move logic_negate and abbreviations into the lexicon as a resource file (i18n, language aware, separate data and logic)
// the best way might be a dictionary with flags, from which we can easily derive the lexicon via Object.keys and map, like
/* dictionary: {
  "CP": [
    {v: 'is', weak: 1},
    ...
  ],
  ...
};
*/
var nlp = require('nlp_compromise');
var util = require('util');
//var TEST = 'The cats we saw, e.g. tigers or leopards, are nice. I am a perfect second sentence for them. This is actually not. We\'re exclamative! Let us look back. They were beaten.';
var TEST = 'He was told that they have been hardly wounded.';
/* TODO - options, like "optimize metrics for"
// example:
// TICKS, e.g. LONGSENTENCE <:
news / mobile    18
story / desktop  25
longread         30
scientific text  45
*/
/*
TODO - use important rules from the stylebooks of AP, APA (en) and dpa (de)
e.g.:
+ (related to dates): ages:
For ages, always use figures. If the age is used as an adjective or as a substitute for a noun, it should be hyphenated. Don't use apostrophes when describing an age range.
Examples: A 21-year-old student. The student is 21 years old. The girl, 8, has a brother, 11. The contest is for 18-year-olds. He is in his 20s.
Please note that medical and political titles only need to be used on first reference when they appear outside of a direct quote.
For courtesy titles, use these on second reference or when specifically requested.
Other acronyms and abbreviations are acceptable but not required (e.g. FBI, CIA, GOP). The context should govern such decisions. Avoid "alphabet soup" ...
Use quotation marks around the titles of books, songs, television shows, computer games, poems, lectures, speeches and works of art.
Examples: Author Porter Shreve read from his new book, "When the White House Was Ours." They sang "The Star-Spangled Banner" before the game.
Do not use quotation marks around the names of magazines, newspapers, the Bible or books that are catalogues of reference materials.
Examples: The Washington Post first reported the story. He reads the Bible every morning.
When used with a date, abbreviate only the following months: Jan., Feb., Aug., Sept., Oct., Nov. and Dec.
*/
var c = {
  LONGSENTENCE: 40,
  SHORTSENTENCE: 5
};
// TODO FIXME - should go to lexicon
/*
NOTE: better performance when we apply the following additional tagging already when stemming:
'AUX': 'auxiliary verb'
'WDT': 'wh-determiner', // WHICH, WHAT, WHOSE
'WP': 'wh-pronoun', // WHICH, WHAT, WHO, WHOM
'WRB': 'wh-adverb', // HOW, WHEN, WHENCE, WHERE, WHY
'TO': 'to', // ?
'RP': 'Particle', // it would be useful if there were RPP for positive particles and RPN for negative ones
// and if there were an "opposite" mapping ...
// note: currently only "not" is handled, and it stems as a "CC"
'LS': 'List item marker',
'PDT': 'Predeterminer',
'POS': 'Possessive ending',
'SYM': 'Symbol (mathematical or scientific)',
':': 'colon',
'(': 'open parenthesis',
'``': 'open quote',
"''": 'close quote',
'#': 'pound sign (currency marker)',
'$': 'dollar sign (currency marker)',
')': 'close parenthesis',
',': 'comma',
'.': 'period'
// ?
'WP$': 'Possessive wh-pronoun', // how about demonstrativePronouns?
*/
// auxiliary verbs
var auxVerbs = ['do', 'does', 'did', 'have', 'has', 'had', 'having', 'be', 'is', 'am', 'are', 'was', 'were', 'been', 'being', 'shall', 'will', 'should', 'would', 'can', 'could', 'may', 'might', 'must'];
// auxiliary verbs and other verbs in verb groups
var verbGroups = [
  // the first item is already known to be a verb or an auxVerb
  // TODO better: pos_reason VB verb ed
  {
    aux: ['have', 'has', 'had', 'having'],
    verbs: /(en$)|(ed$)/
  },
  {
    aux: ['is', 'am', 'are', 'was', 'were', 'been', 'be', 'being', 'to be'],
    verbs: /ing$/
  },
  {
    aux: ['is', 'am', 'are', 'was', 'were', 'been', 'be', 'to be'],
    verbs: /(en$)|(ed$)/
  }
  // the last item SHOULD be a verb other than an auxVerb, or 'copula-adjective' - TODO - how to express this in the lexicon?
];
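// Worked example (comment only, tracing the loop in calculateMetrics below):
// in "has been wounded", "has" opens a group because its next token "been"
// matches group 0's /(en$)/; "wounded" is then appended by the group-end rule
// (a non-auxiliary verb while a group is open), giving one verb group
// [has, been, wounded]. Irregular participles such as "told" slip through
// these suffix regexes (cf. the TEST sentence above).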
// passive voice
var passiveVoiceAux = ["am", "is", "are", "was", "were", "be", "been", "being"];
// subset of determiners
var demonstrativePronouns = ['this', 'that', 'these', 'those', 'such', 'none', 'neither'];
var specialDemonstrativePronouns = ['this', 'that'];
var whDeterminers = ['which', 'what', 'whose'];
// other wh-stuff, see http://www.garfixia.nl/k/news/view/1442/15/the-what-why-and-how-of-wh-words.html
var whPronouns = ['which', 'what', 'who', 'whom'];
var whAdverbs = ['how', 'when', 'whence', 'where', 'why'];
// entity substitutions
var entitySubstitutions = ['it', 'he', 'him', 'she', 'her', 'i', 'me', 'we', 'us', 'they', 'them', 'you', 'there', 'here', 'thing', 'stuff', 'fact', 'this', 'that'];
// nominalizations
var nominalizationRe = new RegExp('(?:ion|ions|ism|isms|ty|ties|ment|ments|ness|nesses|ance|ances|ence|ences)$');
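// e.g. (comment only): "information", "criticism", "acceptance" and
// "happiness" end in one of these suffixes; the per-token check further down
// additionally requires more than 7 characters before it counts a token as a
// nominalization.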
// end ^ TODO FIXME - should go to lexicon
// EXTEND ARRAY PROTOTYPE
Array.prototype.average = function() {
  // TODO - in other contexts we MUST handle values other than typeof 'number' !!!
  var r = {mean: 0, variance: 0, deviation: 0}, t = this.length;
  for (var m, s = 0, l = t; l--; s += this[l]);
  for (m = r.mean = s / t, l = t, s = 0; l--; s += Math.pow(this[l] - m, 2));
  return r.deviation = Math.sqrt(r.variance = s / t), r;
};
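// e.g. [2, 4, 4, 4, 5, 5, 7, 9].average()
// -> { mean: 5, variance: 4, deviation: 2 } (population variance)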
Array.prototype.unique = function() {
  return this.reduce(function(p, c) {
    if (p.indexOf(c) < 0) p.push(c);
    return p;
  }, []);
};
Array.prototype.sequences = function() {
  var lastI = -1;
  var results = [[]];
  this.forEach(function(i) {
    if (i != lastI + 1 && lastI > -1) results.push([]);
    results[results.length - 1].push(i);
    lastI = i;
  });
  return results;
};
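// e.g. [1, 2, 3, 7, 8, 10].sequences() -> [[1, 2, 3], [7, 8], [10]]
// (splits a sorted list of token indices into runs of consecutive indices)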
function decimals(f, dec) {
  // TODO, v2
  // stub, currently used for toPercent which will become readable with
  // decimals rounded and percent values and value/unit etc.
  if (!dec) dec = 2;
  return parseFloat(f.toFixed(dec)); // was hard-coded to 2 decimals, ignoring dec
}
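// e.g. decimals(3.14159) -> 3.14; decimals(3.14159, 3) -> 3.142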
function calculateMetrics(txt) {
  var processed = nlp.pos(txt);
  var metrics = {
    sentenceCount: 0,
    wordCount: 0,
    characterCount: 0,
    characterCountTrimmed: 0,
    uselessBoundaries: 0,
    vocabularySize: 0,
    wordsPerSentence: 0,
    wordsPerSentenceStd: -1,
    longSentencesRatio: 0,
    shortSentencesRatio: 0,
    declarativeRatio: 0,
    interrogativeRatio: 0,
    exclamativeRatio: 0,
    charactersPerWords: 0,
    syllablesPerWord: 0,
    negationsPerSentence: 0,
    stopwordRatio: 0,
    nounRatio: 0,
    nounClusterRatio: 0,
    pronounRatio: 0,
    verbRatio: 0,
    adjectiveRatio: 0,
    adverbRatio: 0,
    otherPosRatio: 0,
    modalRatio: 0,
    nominalizationRatio: 0,
    entitySubstitutionRatio: 0,
    weakVerbRatio: 0,
    vulgarWordRatio: 0,
    verbGroupsPerSentence: 0,
    passiveVoicePerSentence: 0,
    fillerRatio: 0,
    readability: 0
  };
  var sentences = processed.sentences;
  // count number of sentences
  // sentenceCount
  metrics.sentenceCount = sentences.length;
  var stems = [];
  var sentencesCounts = [];
  var charactersPerWordsCounts = [];
  var syllablesCount = 0;
  var negationsCount = 0;
  // depends on other nouns
  var nounClusterCount = 0;
  // depends on wordCount
  var tCounts = {
    noun: 0,
    pronoun: 0,
    pronounNonpossesive: 0,
    verb: 0,
    adverb: 0,
    adjective: 0,
    modalVerb: 0,
    weakVerb: 0,
    vulgarWord: 0,
    filler: 0
  };
  // question: we have one minor issue with the TAGS:
  // "CP" is a copula, but still a verb. We think it is e.g. different from the noun/pronoun relation - SHOULD it be called VCP???
  var _types = { N: 'noun', P: 'pronoun', V: 'verb', C: 'verb', R: 'adverb', J: 'adjective', M: 'modalVerb' };
  // for further calculation purposes
  var data = {
    nominalizations: [],
    entitySubstitutions: []
  };
  var nounCluster = function(token, _nounsCount) {
    if (!_nounsCount || _nounsCount < 1) _nounsCount = token.normalised.match(/\S+/g).length;
    // count clustered nouns (3 or more, possibly joined by 'of')
    var n = token.analysis.next;
    if (n && _nounsCount < 10 && (n.pos.tag.slice(0, 1) === 'N' || n.normalised === 'of')) {
      if (n.normalised != 'of') _nounsCount++;
      // walk on from the next token and propagate the result
      // (the original recursed on `token` itself and dropped the return
      // value, so the cluster size never reached the caller)
      return nounCluster(n, _nounsCount);
    } else if (_nounsCount > 2) {
      return _nounsCount;
    } else {
      return 0;
    }
  };
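  // e.g. (comment only, assuming the tagger marks all four tokens as nouns):
  // for "the tax reform committee report", the walk starting at "tax" visits
  // three further nouns and returns 4; for a single noun or a pair it returns 0.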
  var verbGroupBegin = function(o) {
    return (o.hasOwnProperty('pos') && o.analysis.next && (o.pos.parent === 'verb' || auxVerbs.indexOf(o.normalised) > -1));
  };
  var verbGroupEnd = function(o) {
    // parenthesised so that the hasOwnProperty guard covers both alternatives
    return (o.hasOwnProperty('pos') && ((o.pos.parent === 'verb' && auxVerbs.indexOf(o.normalised) < 0) || o.pos_reason === 'copula-adjective'));
  };
  sentences.forEach(function(sentence, sI) {
    //var sText = sentence.text();
    console.log('!s', sentence.text());
    sentences[sI].metrics = {};
    // count number of words
    // wordCount
    data.nominalizations[sI] = [];
    data.entitySubstitutions[sI] = [];
    // (the original tested the sentence object for 'groupTokens', not the freshly created metrics object)
    if (!(sentences[sI].metrics.hasOwnProperty('groupTokens'))) sentences[sI].metrics.groupTokens = [];
    metrics.wordCount = metrics.wordCount + sentence.tokens.length;
    // count verb groups
    // handled rule group id and last group token
    var l = 0;
    var groupId = 0;
    var last = {i: 0};
    stems = stems.concat(sentence.tokens.map(function(token, i) {
      if (!(sentences[sI].metrics.groupTokens.length)) sentences[sI].metrics.groupTokens.push([]);
      l = sentences[sI].metrics.groupTokens.length;
      // count verb groups
      if ((!(last.i) || last.i < i) && verbGroupBegin(token)) {
        // could be a normalized verb group
        // note: does not cover phrasal verbs
        var next = token.analysis.next;
        var iNext = i + 1;
        verbGroups.every(function(group, gI) {
          if (gI >= groupId) {
            if (group.aux.indexOf(next.normalised) > -1 || group.verbs.test(next.normalised) || next.pos_reason === 'copula-adjective') {
              groupId = gI;
              sentences[sI].metrics.groupTokens[l - 1].push(i);
              sentences[sI].metrics.groupTokens[l - 1].push(iNext);
              last = sentence.tokens[iNext];
              last.i = iNext;
              return false;
            }
          }
          return true; // keep scanning the remaining groups; every() stops on falsy
        });
      }
      console.log(last.i, i);
      // separate multiple verb groups TODO TEST - "special clusters"
      l = sentences[sI].metrics.groupTokens.length;
      if (last.i === i && verbGroupEnd(token)) {
        groupId = 0;
        sentences[sI].metrics.groupTokens.push([]);
      } else if (i > 0 && last.i != i && !verbGroupEnd(last)) {
        console.log('hasEnd', verbGroupEnd(last), token.text);
        if (verbGroupEnd(token)) {
          console.log('Could be End: ', token.text);
          sentences[sI].metrics.groupTokens[l - 1].push(i);
          groupId = 0;
          sentences[sI].metrics.groupTokens.push([]);
        }
      }
      //console.log(token.pos.tag, token.normalised, token.pos_reason/*, token*/);
      // TODO - ISSUE with negation: logic_negate only works in one direction FIXME CONTRIB
      // test http://rawgit.com/spencermountain/nlp_compromise/master/client_side/basic_demo/index.html :
      // example: joe never swims in the pool.
      if (token.analysis.negative) negationsCount++;
      //console.log( 'token: ', token );
      data.entitySubstitutions[sI][i] = (token.normalised != 'i' && (entitySubstitutions.indexOf(token.normalised) > -1) && !(token.capitalised));
      if (data.entitySubstitutions[sI][i] && specialDemonstrativePronouns.indexOf(token.normalised) > -1) {
        if (token.analysis.last) {
          var firsttwo = token.analysis.last.pos.tag.slice(0, 2);
          if (['NN', 'PR'].indexOf(firsttwo) > -1) data.entitySubstitutions[sI][i] = false;
        }
        if (token.analysis.next) {
          var firsttwo = token.analysis.next.pos.tag.slice(0, 2);
          if (['NN', 'PR', 'JJ', 'DT'].indexOf(firsttwo) > -1) data.entitySubstitutions[sI][i] = false;
          /*, 'WD', 'WP' // see above TODO, handled below */
          if (whDeterminers.concat(whPronouns).indexOf(token.normalised) > -1) data.entitySubstitutions[sI][i] = false;
        }
      }
      if (data.entitySubstitutions[sI][i]) {
        // tokens don't carry a metrics object by default - create it lazily
        sentences[sI].tokens[i].metrics = sentences[sI].tokens[i].metrics || {};
        sentences[sI].tokens[i].metrics.entitySubstitution = true;
      }
      // count number of different parts of speech
      var typeId = token.pos.tag.slice(0, 1);
      console.log('token: ', token.text, token.pos.tag, token.pos.parent, token.pos_reason);
      //console.log( 'token3: ', typeId, _types[typeId] );
      if (_types.hasOwnProperty(typeId)) tCounts[_types[typeId]]++;
      // count characters per word
      charactersPerWordsCounts.push(token.text.length);
      data.nominalizations[sI][i] = false;
      if (typeId === 'N') {
        // count clustered nouns
        var curClusterCount = nounCluster(token);
        if (curClusterCount) nounClusterCount += curClusterCount;
        // count nominalizations
        // (was: token.pos.tag.indexOf('NNP' === 0), i.e. indexOf(false), which is
        // always -1 and thus truthy; likewise .search() returns -1 for "no match",
        // which is also truthy - both comparisons were missing)
        var isNNP = token.pos.tag.indexOf('NNP') === 0;
        if (isNNP) data.nominalizations[sI][i] = (token.text.length > 7) && (token.normalised.search(nominalizationRe) > -1);
      }
      if (data.nominalizations[sI][i]) {
        sentences[sI].tokens[i].metrics = sentences[sI].tokens[i].metrics || {};
        sentences[sI].tokens[i].metrics.nominalization = true;
      }
      if (typeId === 'V') {
        // count weak verbs
        var check = (token.pos.tense === 'present') ? token.normalised : token.analysis.conjugate().infinitive;
        if (nlp.blacklist.weak.test(check)) tCounts.weakVerb++;
      }
      // count vulgar words, fillers etc.
      if (nlp.blacklist.vulgar.test(token.normalised)) tCounts.vulgarWord++;
      if (nlp.blacklist.filler.test(token.normalised)) tCounts.filler++;
      var syllables = nlp.syllables(token.text);
      if (syllables) syllablesCount = syllablesCount + syllables.length;
      return token.normalised;
    }));
    if (sentences[sI].metrics.groupTokens.length) sentences[sI].metrics.groupTokens = sentences[sI].metrics.groupTokens.filter(function(ts) {
      return (ts.length);
    });
    if (sentences[sI].metrics.groupTokens.length) {
      // we found verb groups ...
      var readableTokens = sentences[sI].metrics.groupTokens.map(function(ts) {
        return ts.map(function(tId) { return sentences[sI].tokens[tId].normalised; }).join(' ');
      });
      sentences[sI].metrics.passiveVoiceTokens = [];
      sentences[sI].metrics.groupTokens.forEach(function(ts, i) {
        var isPassive = -1;
        ts.forEach(function(tId) {
          sentences[sI].tokens[tId].verbGroup = i;
          console.log(sentences[sI].tokens[tId].pos_reason);
          if (passiveVoiceAux.indexOf(sentences[sI].tokens[tId].normalised) > -1 ||
            sentences[sI].tokens[tId].pos_reason === 'copula-adjective' ||
            sentences[sI].tokens[tId].pos_reason === 'ed'
          ) isPassive++;
        });
        if (isPassive > 0) sentences[sI].metrics.passiveVoiceTokens.push(ts);
      });
      console.log('sentence end, verb groups raw: ', sentences[sI].metrics.groupTokens);
      console.log('verb groups: ', readableTokens);
      console.log('passive groups: ', sentences[sI].metrics.passiveVoiceTokens);
    }
    sentencesCounts.push(sentence.tokens.length);
  });
  // TODO - find phrasal verbs
  if (sentencesCounts.length > 0) {
    // count number of words per sentence and its standard deviation
    if (metrics.sentenceCount) {
      // wordsPerSentence
      metrics.wordsPerSentence = (sentencesCounts.reduce(function(a, b) { return a + b; })) / metrics.sentenceCount;
      // negationsPerSentence
      if (negationsCount) metrics.negationsPerSentence = negationsCount / metrics.sentenceCount;
    }
    if (sentences.length >= 10) {
      // wordsPerSentenceStd (only meaningful with enough sentences)
      metrics.wordsPerSentenceStd = sentencesCounts.average().deviation;
    }
  }
  // find extra long and short sentences
  if (sentences.length) {
    // longSentencesRatio
    var _longs = sentencesCounts.filter(function(sCount) {
      return sCount >= c.LONGSENTENCE;
    });
    metrics.longSentencesRatio = _longs.length / sentencesCounts.length;
    // shortSentencesRatio
    var _shorts = sentencesCounts.filter(function(sCount) {
      return sCount <= c.SHORTSENTENCE;
    });
    metrics.shortSentencesRatio = _shorts.length / sentencesCounts.length;
    if (metrics.sentenceCount) {
      // count sentence types based on the ending punctuation mark
      // declarativeRatio, interrogativeRatio, exclamativeRatio
      var types = sentences.map(function(s) { return s.type; });
      ['declarative', 'interrogative', 'exclamative'].forEach(function(type) {
        var typeCount = types.filter(function(v) { return v === type; }).length;
        metrics[type.concat('Ratio')] = typeCount / metrics.sentenceCount;
      });
    }
  }
  // find vocabulary size
  // vocabularySize
  metrics.vocabularySize = stems.unique().length;
  // count number of characters in the whole RAW text
  // characterCount
  var d = txt.trim();
  metrics.characterCount = d.length;
  var uselessBoundaries = d.match(/\s{2,}/g); // runs of redundant whitespace (\s already covers \t)
  if (uselessBoundaries) {
    var ub = uselessBoundaries.map(function(b) { return b.length; });
    metrics.uselessBoundaries = ub.length;
    metrics.characterCountTrimmed = d.length - (ub.reduce(function(a, b) { return a + b; }) - ub.length);
  } else {
    metrics.characterCountTrimmed = d.length;
  }
  // counts per sentence
  if (metrics.sentenceCount) {
    // count verb groups
    // verbGroupsPerSentence
    var groupsCount = sentences.map(function(s) { return s.metrics.groupTokens.length || 0; }).reduce(function(a, b) { return a + b; });
    metrics.verbGroupsPerSentence = groupsCount / metrics.sentenceCount;
    // count passive voice cases
    // passiveVoicePerSentence (special verb groups)
    // passiveVoiceTokens is only set for sentences that have verb groups, so guard it
    var passiveVoiceCount = sentences.map(function(s) { return (s.metrics.passiveVoiceTokens || []).length; }).reduce(function(a, b) { return a + b; });
    metrics.passiveVoicePerSentence = passiveVoiceCount / metrics.sentenceCount;
  }
  // counts per word
  if (metrics.wordCount) {
    // count number of syllables per word
    // syllablesPerWord
    if (syllablesCount) metrics.syllablesPerWord = syllablesCount / metrics.wordCount;
    // count number of characters per word
    // charactersPerWords
    // (an array is always truthy - test its length instead)
    if (charactersPerWordsCounts.length) metrics.charactersPerWords = (charactersPerWordsCounts.reduce(function(a, b) { return a + b; })) / metrics.wordCount;
    // ratios for types of words, weak and vulgar words
    ['noun', 'pronoun', 'verb', 'adverb', 'adjective', 'modalVerb', 'weakVerb', 'vulgarWord', 'filler'].forEach(function(d) {
      if (tCounts[d]) metrics[d.concat('Ratio')] = tCounts[d] / metrics.wordCount;
    });
    metrics.otherPosRatio = 1 - metrics.nounRatio - metrics.pronounRatio - metrics.verbRatio - metrics.adjectiveRatio - metrics.adverbRatio;
  }
  // counts per noun
  if (tCounts.noun) {
    // nounClusterRatio
    if (nounClusterCount) metrics.nounClusterRatio = nounClusterCount / tCounts.noun;
    // nominalizationRatio and entitySubstitutionRatio:
    // TODO - make sure tCounts.noun contains what the Python NLTK original calls "pronoun_nonpossesive"
    var nominCount = 0;
    data.nominalizations.forEach(function(n) { nominCount += n.filter(function(v) { return (v); }).length; });
    metrics.nominalizationRatio = nominCount / tCounts.noun;
    var entitySubCount = 0;
    data.entitySubstitutions.forEach(function(n) { entitySubCount += n.filter(function(v) { return (v); }).length; });
    metrics.entitySubstitutionRatio = entitySubCount / tCounts.noun;
  }
  // estimate text readability using the Flesch-Kincaid Grade Level test
  // TODO short texts ...
  if (/*(metrics.wordCount >= 100) &&*/ metrics.wordsPerSentence && metrics.syllablesPerWord) {
    metrics.readability = 0.39 * metrics.wordsPerSentence + 11.8 * metrics.syllablesPerWord - 15.59;
  }
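  // worked example (comment only): at 10 words per sentence and 1.5 syllables
  // per word, 0.39 * 10 + 11.8 * 1.5 - 15.59 = 6.01, i.e. roughly a US
  // 6th-grade reading level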
  // count number of stopwords
  // stopwordRatio
  /* TODO (Python sketch)
  + Named-Entities (dynamic) !!!
  ? rare words / rareWordsRatio
  //
  # count number of stopwords
  data['stopwords'] = [None] * len(tokens)
  for idx, word in enumerate(words):
      if word in stopset:
          metrics['stopword_ratio'] += 1
          data['stopwords'][word2token_map[idx]] = True
      else:
          data['stopwords'][word2token_map[idx]] = False
  if metrics['wordCount']:
      metrics['stopword_ratio'] /= metrics['wordCount']
  # count rare words
  if len(words):
      metrics['rare_word_ratio'] = data['expected_word_frequencies'].count(0) / len(words)
  else:
      metrics['rare_word_ratio'] = 0
  # count word, bigram, and trigram frequencies
  // ...
  // ???
  # fix some verbs ending in -ing being counted as nouns
  for idx, token in enumerate(tokens):
      if (token[-3:] == 'ing') and (idx < len(tokens)) and (data['parts_of_speech'][idx+1] == 'IN'):
          data['parts_of_speech'][idx] = 'VBG'
  // ??? see below
  # find auxiliary verbs
  for i in range(verb_group_count):
      verb_group_stack = [idx for idx in range(len(tokens)) if data['verb_groups'][idx] == i+1]
      for j in verb_group_stack[:-1]:
          auxiliary_verbs[j] = True
  // ???
  data['weak_verbs'][idx] = (data['parts_of_speech'][idx][:2] == 'VB') and (data['stems'][idx] in dict_weak_verbs)
  if data['weak_verbs'][idx] and auxiliary_verbs[idx]:
      data['weak_verbs'][idx] = False
  */
  stems = null;
  return metrics;
}
var toPercent = function(o) {
  var percentMetrics = {};
  for (var k in o) {
    percentMetrics[k.replace('Ratio', 'Percent')] = decimals((k.indexOf('Ratio') < 0) ? o[k] : ((o[k]) ? o[k] * 100 : 0));
  }
  return percentMetrics;
};
var metrics = calculateMetrics(TEST);
console.log(metrics);
console.log(toPercent(metrics));
/* appendix, reasoning
NN
"before a modal" // if it's before a modal verb, it's a noun -> lkjsdf would
"determiner-verb" // if it's after a determiner, it's not a verb -> the walk
"capitalised" // it has a capital and isn't the first word
"need one verb" // if there is no verb in the sentence, there needs to be one
VB
"after an adverb" // if it's after an adverb, it's not a noun -> quickly acked
"ed" // set ambiguous 'ed' endings as either verb/adjective
RB
"consecutive_adjectives" // no consecutive, unpunctuated adjectives -> real good
JJ
"copula-adjective" // copulas are followed by a determiner ("are a ..") or an adjective ("are good")
"copula-adverb-adjective" // copula, adverb, verb -> copula adverb adjective -> is very lkjsdf
UH
"wordless_string" // punctuation - like ' -- ' etc.
CD
"parsefloat" // see if it's a number
---
lex
"lexicon" // known words list
parts_of_speech[wordnet_suffixes[suffix]]
"wordnet suffix" // suffix pos signals from wordnet
r
"regex suffix" // suffix regexes for words
// + last pass
sentence.tokens = sentence.tokens.map(function(token, i) {
  var next = sentence.tokens[i + 1]
  var prev = sentence.tokens[i - 1]
  if (token.pos) {
    // suggest noun after determiners (a|the), possessive pronouns (her|my|its)
    if (token.pos.tag == "DT" || token.pos.tag == "PP") {
      need = 'NN'
      reason = token.pos.name
    }
    // suggest verb after personal pronouns (he|she|they), modal verbs (would|could|should)
    if (token.pos.tag == "PRP" || token.pos.tag == "MD") {
      need = 'VB'
      reason = token.pos.name
    }
  }
  if (need && !token.pos) {
    token.pos = parts_of_speech[need]
    token.pos_reason = "signal from " + reason
  }
  if (need == 'VB' && token.pos.parent == 'verb') {
    need = null
  }
  if (need == 'NN' && token.pos.parent == 'noun') {
    need = null
  }
  return token
})
*/
// nlp_compromise by @spencermountain in 2014
// most files are self-contained modules that optionally export for nodejs
// this file loads them all together
// if we're server-side, grab the files; otherwise assume they're prepended already
if (typeof module !== "undefined" && module.exports) {
  var parents = require("./src/parents/parents");
  var sentence_parser = require('./src/methods/tokenization/sentence').sentences;
  var tokenize = require('./src/methods/tokenization/tokenize').tokenize;
  var ngram = require('./src/methods/tokenization/ngram').ngram;
  //tokenize
  var normalize = require('./src/methods/transliteration/unicode_normalisation');
  var syllables = require('./src/methods/syllables/syllable');
  //localization
  var local = require('./src/methods/localization/britishize');
  var americanize = local.americanize;
  var britishize = local.britishize;
  //part of speech tagging
  var pos = require('./src/pos');
  //named_entity_recognition
  var spot = require('./src/spot');
  //weak verbs, vulgar words etc. TODO - goes to metrics ...
  var bl = require('./src/data/blacklist');
}
///
// api
var nlp = {
  noun: parents.noun,
  adjective: parents.adjective,
  verb: parents.verb,
  adverb: parents.adverb,
  value: parents.value,
  sentences: sentence_parser,
  ngram: ngram,
  tokenize: tokenize,
  americanize: americanize,
  britishize: britishize,
  syllables: syllables,
  normalize: normalize.normalize,
  denormalize: normalize.denormalize,
  pos: pos,
  spot: spot,
  blacklist: bl
  // tests: tests,
};
//export it for server-side
if (typeof module !== "undefined" && module.exports) {
  module.exports = nlp;
}
// bump bower
// git tag -a v0.3.5 -m "tag bower release"
// git push origin master --tags
// console.log( nlp.pos('she sells seashells by the seashore').sentences[0].negate().text() )
// console.log( nlp.pos('i will slouch').to_past().text() )
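// and, tying in the metrics proposal above (a sketch; assumes the blacklist
// file from this gist is saved at ./src/data/blacklist):
// console.log( nlp.blacklist.filler.test('actually') ) // true
// console.log( nlp.blacklist.weak.test('changed') )    // true (prefix "chang")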