Created
March 3, 2018 18:18
-
-
Save trungkak/f505396b755e54c40094a7386aa75c1f to your computer and use it in GitHub Desktop.
Sentence-splitting regex pattern
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Splits `text` into sentences and appends each one to `sentences` as a
// new Sentence. `text`, `sentences`, and `Sentence` are defined outside this
// snippet.
//
// The regex is written in free-spacing form: Pattern.COMMENTS makes the engine
// ignore whitespace and everything from '#' to the end of a line, which is why
// each fragment carries its own inline explanation and ends with "\n".
// Pattern.MULTILINE lets '$' match at the end of every line rather than only
// at the end of the input, so a line break can also terminate a sentence.
//
// NOTE(review): the pattern is recompiled each time this code runs; if it sits
// on a hot path, consider hoisting it into a static final constant.
Pattern pattern = Pattern.compile(
        "# Match a sentence ending in punctuation or EOS.\n" +
        "[^.!?\\s] # First char is non-punct, non-ws\n" +
        "[^.!?]* # Greedily consume up to punctuation.\n" +
        "(?: # Group for unrolling the loop.\n" +
        " [.!?] # (special) inner punctuation ok if\n" +
        " (?!['\"]?\\s|$) # not followed by ws or EOS.\n" +
        " [^.!?]* # Greedily consume up to punctuation.\n" +
        ")* # Zero or more (special normal*)\n" +
        "[.!?]? # Optional ending punctuation.\n" +
        "['\"]? # Optional closing quote.\n" +
        "(?=\\s|$)",
        Pattern.MULTILINE | Pattern.COMMENTS);

// Each successful find() yields one sentence; group() is the full matched text
// (trailing whitespace is excluded because the pattern ends in a lookahead).
Matcher reMatcher = pattern.matcher(text);
while (reMatcher.find()) {
    sentences.add(new Sentence(reMatcher.group()));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment