Regular Expression Sentence Tokenizer (English)
// tokenize(str)
// extracts semantically useful tokens from a string containing English-language sentences
// @param {String} str the string to tokenize
// @returns {Array} contains extracted tokens
function tokenize(str) {
  var punct='\\['+ '\\!'+ '\\"'+ '\\#'+ '\\$'+  // since JavaScript does not
            '\\%'+ '\\&'+ '\\\''+ '\\('+ '\\)'+ // support POSIX character
            '\\*'+ '\\+'+ '\\,'+ '\\\\'+ '\\-'+ // classes, we'll need our
            '\\.'+ '\\/'+ '\\:'+ '\\;'+ '\\<'+  // own version of [:punct:]
            '\\='+ '\\>'+ '\\?'+ '\\@'+ '\\['+
            '\\]'+ '\\^'+ '\\_'+ '\\`'+ '\\{'+
            '\\|'+ '\\}'+ '\\~'+ '\\]',
      re=new RegExp(           // tokenizer
        '\\s*'+                // discard possible leading whitespace
        '('+                   // start capture group #1
          '\\.{3}'+            // ellipsis (must appear before punct)
          '|'+                 // alternator
          '\\w+\\-\\w+'+       // hyphenated words (must appear before punct)
          '|'+                 // alternator
          '\\w+\'(?:\\w+)?'+   // contractions/possessives, e.g. "Here's" (must appear before punct)
          '|'+                 // alternator
          '\\w+'+              // other words
          '|'+                 // alternator
          '['+punct+']'+       // punct
        ')'                    // end capture group
      );

  // grep(ary[,filt]) - filters an array
  // note: could use jQuery.grep() instead
  // @param {Array} ary array of members to filter
  // @param {Function} filt function to test truthiness of member;
  //    if omitted, members are kept whenever they are themselves truthy
  // @returns {Array} all members of ary for which the filter is truthy
  function grep(ary,filt) {
    var result=[];
    for(var i=0,len=ary.length;i<len;i++) {
      var member=ary[i]||'';
      if((filt && typeof filt === 'function') ? filt(member) : member) {
        result.push(member);
      }
    }
    return result;
  }

  return grep( str.split(re) ); // note: filter function omitted
                                // since all we need to test
                                // for is truthiness
} // end tokenize()
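Why no filter is needed: when split() is handed a regex containing a capture group, the captured separators are spliced into the result alongside the (frequently empty) substrings between matches, so testing each member for truthiness is enough to throw the empties away. A small illustration (the pattern below is a simplified stand-in for the tokenizer's regex, not part of the gist):

'a, b'.split(/\s*(,|\w+)/)
// => ["", "a", "", ",", "", "b", ""]  -- grep() then discards the empty strings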
Also available as a fiddle.
How would you change this to include punctuation with the word token? So:
[
'Here\'s',
'a',
'(good,',
'bad,',
'indifferent,',
'...)',
'example',
'sentence',
'to',
'be',
'used',
'in',
'this',
'test',
'of',
'English',
'language',
'"token-extraction".'
]
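One way to get output like the list above (a sketch, not from the gist author; the name tokenizeKeepPunct is made up here): keep punctuation attached to its neighbouring word by splitting on runs of whitespace instead of breaking punctuation out into separate tokens.

function tokenizeKeepPunct(str) {
  return str.trim().split(/\s+/)  // each whitespace-delimited chunk keeps its punctuation
            .filter(Boolean);     // drop the lone empty string an all-whitespace input would leave
}
// tokenizeKeepPunct('Here\'s a (good, bad, indifferent, ...) example sentence ...')
// => [ 'Here\'s', 'a', '(good,', 'bad,', 'indifferent,', '...)', 'example', 'sentence', ... ]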
Technically this is a word tokenizer not a sentence tokenizer...
Usage:
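For example, applied to the sample sentence quoted in the comment above (illustrative; any English text will do):

tokenize('Here\'s a (good, bad, indifferent, ...) example sentence to be used in this test of English language "token-extraction".');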
Produces:
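Something along these lines: whitespace is discarded, each punctuation character becomes its own token, the ellipsis stays intact, and the hyphenated word keeps its hyphen.

[ 'Here\'s', 'a', '(', 'good', ',', 'bad', ',', 'indifferent', ',', '...', ')',
  'example', 'sentence', 'to', 'be', 'used', 'in', 'this', 'test', 'of',
  'English', 'language', '"', 'token-extraction', '"', '.' ]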