This script reads lines of UTF-8 text from stdin, applies the NLTK English sentence tokenizer to the text, and prints the split sentences to stdout, preserving runs of two or more newline characters. N.B. It assumes the NLTK English Punkt model is already installed in an appropriate location in the environment.
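If the Punkt model is missing, it can be fetched once with NLTK's downloader (a one-time setup step, separate from the script itself):

import nltk
nltk.download('punkt')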
#!/usr/bin/env python
"""
This script reads lines of UTF-8 text from stdin,
applies the NLTK English sentence tokenizer to the text, and
prints the split sentences to stdout (in UTF-8), preserving runs of two or
more newline characters.

N.B. This assumes the NLTK English Punkt model is already installed in an
appropriate location in the environment.
"""
from __future__ import print_function

import re
import sys

import nltk

# The Punkt tokenizer's bound tokenize method, loaded once at import time.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
def mangle(text):
    # Temporarily replace the space after the honorifics Mr./Mrs./Dr./Drs.
    # and Gov. (when followed by a capitalized name) with an underscore, so
    # that Punkt does not treat the abbreviation's period as a sentence end.
    text = re.sub(r'([MD]rs?[.]) ([A-Z])', r'\1_\2', text)
    text = re.sub(r'(Gov[.]) ([A-Z])', r'\1_\2', text)
    return text


def unmangle(text):
    # Restore the spaces that mangle() replaced with underscores.
    text = re.sub(r'([MD]rs?[.])_([A-Z])', r'\1 \2', text)
    text = re.sub(r'(Gov[.])_([A-Z])', r'\1 \2', text)
    return text


def split_sentences(text):
    return [
        unmangle(sentence)
        for sentence
        in sentence_tokenizer(mangle(text), realign_boundaries=True)
    ]
def pre_process_paragraphs(text):
    """
    Join consecutive non-blank lines with a space; leave blank lines alone.
    """
    result = []
    paragraph_lines = []
    lines = text.split('\n')
    # Set aside leading blank lines so the paragraph loop never sees them.
    beg_blanks = []
    while lines and lines[0] == '':
        beg_blanks.append(lines.pop(0))
    # Likewise set aside trailing blank lines.
    end_blanks = []
    while lines and lines[-1] == '':
        end_blanks.append(lines.pop())
    # Join consecutive non-blank lines with a space character.
    for line in lines:
        if not line:
            if paragraph_lines:
                result.append(' '.join(paragraph_lines))
                paragraph_lines = []
            result.append('')
        else:
            paragraph_lines.append(line)
    result.append(' '.join(paragraph_lines))
    # Reattach the leading and trailing blank lines.
    result = beg_blanks + result + end_blanks
    return '\n'.join(result)
if __name__ == '__main__':
    """
    Read in lines of UTF-8 text from stdin.
    Apply the NLTK English sentence tokenizer to the text.
    Print out the split lines to stdout, preserving runs of two or more
    newline characters.
    """
    # Python 2 I/O: decode stdin bytes to unicode, encode back for stdout.
    text = sys.stdin.read().decode('utf-8')
    text = pre_process_paragraphs(text)
    lines = text.split('\n')
    result = []
    for line in lines:
        if line:
            result.extend(split_sentences(line))
        else:
            result.append(line)
    print('\n'.join(result).encode('utf-8'), end='')
# Unit tests #

def test_pre_process_paragraphs_1():
    text = "1\n2\n\n3"
    expect = "1 2\n\n3"
    actual = pre_process_paragraphs(text)
    assert expect == actual


def test_pre_process_paragraphs_beg_end_blanks():
    text = "\n1\n2\n\n3\n\n\n\n4\n\n"
    expect = "\n1 2\n\n3\n\n\n\n4\n\n"
    actual = pre_process_paragraphs(text)
    assert expect == actual


def test_split_sentences():
    text = '''"A clam for supper? a cold clam; is THAT what you mean, Mrs. Hussey?" says I, "but that's a rather cold and clammy reception in the winter time, ain't it, Gov. Hussey?"'''
    expect = ['"A clam for supper?', 'a cold clam; is THAT what you mean, Mrs. Hussey?"', 'says I, "but that\'s a rather cold and clammy reception in the winter time, ain\'t it, Gov. Hussey?"']
    actual = split_sentences(text)
    assert expect == actual
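For a quick sanity check of the honorific handling, the mangle/unmangle round trip can be exercised on its own (a hypothetical interactive session; the regexes are deterministic, so no Punkt model is needed):

>>> mangle('Is that you, Mrs. Hussey?')
'Is that you, Mrs._Hussey?'
>>> unmangle(mangle('Is that you, Mrs. Hussey?'))
'Is that you, Mrs. Hussey?'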
I'd be fine adding this as an option to https://github.com/hltcoe/anno-pipeline, though it'd need to handle SGML tags (see https://github.com/hltcoe/anno-pipeline/blob/master/scripts/split_sentences.py#L41)
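One possible shape for that (a rough sketch under assumptions of my own, not the anno-pipeline implementation): treat lines that consist solely of an SGML tag as opaque and pass them through unsplit, e.g.

SGML_TAG = re.compile(r'^\s*</?[A-Za-z][^>]*>\s*$')  # assumed tag shape

def split_unless_sgml(line):
    # Hypothetical helper: leave bare SGML tag lines intact,
    # split everything else as before.
    if SGML_TAG.match(line):
        return [line]
    return split_sentences(line)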