Created
December 3, 2016 05:12
-
-
Save Daniel-Hug/8691324ff2b5c1f31e234f8b7370839a to your computer and use it in GitHub Desktop.
JS function: get words from sentence (handles much more than normal split on space)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Edge cases and known limitations:
/*
  Known limitations:
    - Punctuation missing from the separator set (interrobang, double quotes,
      square brackets, ...) stays attached to the neighbouring word.
    - "Dr. Drake": the period of a short abbreviation is not kept ("dr").
  Handled:
    - "Tennis, soccer, baseball, etc., are outdoor games."
    - "fifty-five"                    (dash inside a word is kept)
    - "He will win- it's obvious!"    (dangling dash is dropped)
    - "He will win - it's obvious!"   (free-standing dash yields no empty word)
    - " (Hello) world"                (leading separators yield no empty word)
*/
var getWords = (function() {
  // Word separators: any whitespace plus punctuation other than dash, period
  // and apostrophe. Those three may legitimately appear inside a word
  // ("fifty-five", "u.s.a.", "it's"), so dash and period are handled per word.
  var separators = /[,\/#!?$%\^&\*;:{}=\_—`~()\s]+/;

  /**
   * Removes dashes and a sentence-ending period that cling to one token.
   *
   * @param {string} word - a single token produced by splitting on separators
   * @returns {string} the cleaned token; may be '' if nothing but dashes or a
   *   lone period was left
   */
  function cleanWord(word) {
    // Dashes only carry meaning between characters, so trim them on both ends.
    var trimmed = word.replace(/^-+|-+$/g, '');

    // More than two periods: treated as a dotted abbreviation and kept as is.
    if (trimmed.split('.').length > 3) {
      return trimmed;
    }

    // Otherwise a trailing period is sentence punctuation, not part of the word.
    return trimmed.slice(-1) === '.' ? trimmed.slice(0, -1) : trimmed;
  }

  /**
   * Splits a phrase into its lower-cased words.
   *
   * @param {string} phrase - the text to split
   * @returns {string[]} the words in order of appearance; never contains
   *   empty strings, and is empty when the phrase holds no words
   */
  return function getWords(phrase) {
    return phrase
      .toLowerCase()
      .split(separators)
      .map(function(word) {
        return cleanWord(word);
      })
      // Leading/trailing separators and tokens that were pure punctuation
      // leave empty strings behind; they are not words.
      .filter(function(word) {
        return word !== '';
      });
  };
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment