Skip to content

Instantly share code, notes, and snippets.

@lukeorland
Last active December 19, 2015 01:29
Show Gist options
  • Save lukeorland/5876303 to your computer and use it in GitHub Desktop.
This script reads in lines of UTF-8 text from stdin, applies the NLTK English sentence tokenizer to the text, and prints out the split lines to stdout, preserving runs of two or more newline characters. N.B. An assumption being made is that the NLTK English Punkt model is already installed in an appropriate location in the environment.
#!/usr/bin/env python
"""
This script reads in lines of UTF-8 text from stdin,
applies the NLTK English sentence tokenizer to the text, and
prints out the split lines to stdout (in UTF-8), preserving runs of two or more
newline characters.
N.B. An assumption being made is that the NLTK English Punkt model is already
installed in an appropriate location in the environment.
"""
from __future__ import print_function
import re
import sys
import nltk
# Load the pre-trained English Punkt model once at import time and bind its
# tokenize method; assumes the NLTK punkt data is already installed (see the
# module docstring). Raises LookupError at import if the model is missing.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
def mangle(text):
    """Protect abbreviation/name pairs from the sentence splitter.

    Replaces the space between an honorific ending in a period
    (Mr./Mrs./Dr./Drs. or Gov.) and a following capitalized word with an
    underscore, so the period is not mistaken for a sentence boundary.
    Reversed by unmangle().
    """
    patterns = (r'([MD]rs?[.]) ([A-Z])', r'(Gov[.]) ([A-Z])')
    for pattern in patterns:
        text = re.sub(pattern, r'\1_\2', text)
    return text
def unmangle(text):
    """Undo mangle(): restore the space after protected honorifics.

    Turns "Mrs._Hussey" / "Gov._Hussey" back into "Mrs. Hussey" /
    "Gov. Hussey".
    """
    patterns = (r'([MD]rs?[.])_([A-Z])', r'(Gov[.])_([A-Z])')
    for pattern in patterns:
        text = re.sub(pattern, r'\1 \2', text)
    return text
def split_sentences(text):
    """Split *text* into a list of sentences with the Punkt tokenizer.

    Honorific abbreviations are temporarily protected via mangle() so that
    e.g. "Mrs. Hussey" is not split after the period, then restored with
    unmangle() in each resulting sentence.
    """
    protected = mangle(text)
    sentences = sentence_tokenizer(protected, realign_boundaries=True)
    return [unmangle(sentence) for sentence in sentences]
def pre_process_paragraphs(text):
    """Join consecutive non-blank lines of *text* with a single space.

    Blank lines act as paragraph separators and are passed through
    unchanged, as are runs of blank lines at the start and end of the
    input, so the overall newline structure is preserved.

    Fix: the original version unconditionally appended the final joined
    paragraph, so an empty or all-blank input gained a spurious extra
    blank line (e.g. "\\n\\n" became "\\n\\n\\n" and "" became "\\n").
    The final append is now guarded by `if paragraph_lines`.
    """
    lines = text.split('\n')
    # Peel off leading blank lines; re-attached verbatim at the end.
    beg_blanks = []
    while lines and lines[0] == '':
        beg_blanks.append(lines.pop(0))
    # Peel off trailing blank lines likewise.
    end_blanks = []
    while lines and lines[-1] == '':
        end_blanks.append(lines.pop())
    # Join each run of consecutive non-blank lines with a space; keep
    # interior blank lines as-is.
    result = []
    paragraph_lines = []
    for line in lines:
        if line:
            paragraph_lines.append(line)
        else:
            if paragraph_lines:
                result.append(' '.join(paragraph_lines))
                paragraph_lines = []
            result.append('')
    # Only emit the trailing paragraph if one was actually accumulated.
    if paragraph_lines:
        result.append(' '.join(paragraph_lines))
    return '\n'.join(beg_blanks + result + end_blanks)
if __name__ == '__main__':
    # Read all of stdin as UTF-8, merge each paragraph's lines, then split
    # every non-blank line into sentences; blank lines pass through so runs
    # of two or more newlines survive unchanged.
    # NOTE(review): the .decode()/.encode() calls on stdin/print output mean
    # this script targets Python 2 — confirm before running under Python 3.
    raw = sys.stdin.read().decode('utf-8')
    normalized = pre_process_paragraphs(raw)
    output_lines = []
    for line in normalized.split('\n'):
        if line:
            output_lines.extend(split_sentences(line))
        else:
            output_lines.append(line)
    print('\n'.join(output_lines).encode('utf-8'), end='')
# Unit tests #
def test_pre_process_paragraphs_1():
    """Consecutive lines join with a space; a blank line separates paragraphs."""
    assert pre_process_paragraphs("1\n2\n\n3") == "1 2\n\n3"
def test_pre_process_paragraphs_beg_end_blanks():
    """Leading, trailing, and interior runs of blank lines are preserved."""
    source = "\n1\n2\n\n3\n\n\n\n4\n\n"
    assert pre_process_paragraphs(source) == "\n1 2\n\n3\n\n\n\n4\n\n"
def test_split_sentences():
    """Splitting keeps honorific abbreviations (Mrs., Gov.) inside sentences."""
    text = '''"A clam for supper? a cold clam; is THAT what you mean, Mrs. Hussey?" says I, "but that's a rather cold and clammy reception in the winter time, ain't it, Gov. Hussey?"'''
    expected = [
        '"A clam for supper?',
        'a cold clam; is THAT what you mean, Mrs. Hussey?"',
        'says I, "but that\'s a rather cold and clammy reception in the winter time, ain\'t it, Gov. Hussey?"',
    ]
    assert split_sentences(text) == expected
@fmof
Copy link

fmof commented Jun 27, 2013

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment