This script reads lines of UTF-8 text from stdin, applies the NLTK English sentence tokenizer to the text, and prints the split sentences to stdout, preserving runs of two or more newline characters. N.B. It assumes the NLTK English Punkt model is already installed in an appropriate location in the environment.
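If the Punkt model is missing, it can be fetched once with NLTK's downloader (a one-time setup step, separate from the script itself):

import nltk
nltk.download('punkt')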
#!/usr/bin/env python
"""
This script reads lines of UTF-8 text from stdin,
applies the NLTK English sentence tokenizer to the text, and
prints the split sentences to stdout (in UTF-8), preserving runs of two or
more newline characters.

N.B. This assumes the NLTK English Punkt model is already installed in an
appropriate location in the environment.
"""
from __future__ import print_function

import re
import sys

import nltk

# The Punkt tokenizer's bound tokenize method, loaded once at import time.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
def mangle(text):
    # Temporarily replace the space after the honorifics Mr./Mrs./Dr./Drs.
    # and Gov. (when followed by a capitalized name) with an underscore, so
    # that Punkt does not treat the abbreviation's period as a sentence end.
    text = re.sub(r'([MD]rs?[.]) ([A-Z])', r'\1_\2', text)
    text = re.sub(r'(Gov[.]) ([A-Z])', r'\1_\2', text)
    return text


def unmangle(text):
    # Restore the spaces that mangle() replaced with underscores.
    text = re.sub(r'([MD]rs?[.])_([A-Z])', r'\1 \2', text)
    text = re.sub(r'(Gov[.])_([A-Z])', r'\1 \2', text)
    return text


def split_sentences(text):
    return [
        unmangle(sentence)
        for sentence
        in sentence_tokenizer(mangle(text), realign_boundaries=True)
    ]
def pre_process_paragraphs(text):
    """
    Join consecutive non-blank lines with a space; leave blank lines alone.
    """
    result = []
    paragraph_lines = []
    lines = text.split('\n')
    # Set aside leading blank lines so the paragraph loop never sees them.
    beg_blanks = []
    while lines and lines[0] == '':
        beg_blanks.append(lines.pop(0))
    # Likewise set aside trailing blank lines.
    end_blanks = []
    while lines and lines[-1] == '':
        end_blanks.append(lines.pop())
    # Join consecutive non-blank lines with a space character.
    for line in lines:
        if not line:
            if paragraph_lines:
                result.append(' '.join(paragraph_lines))
                paragraph_lines = []
            result.append('')
        else:
            paragraph_lines.append(line)
    result.append(' '.join(paragraph_lines))
    # Reattach the leading and trailing blank lines.
    result = beg_blanks + result + end_blanks
    return '\n'.join(result)
if __name__ == '__main__':
    """
    Read in lines of UTF-8 text from stdin.
    Apply the NLTK English sentence tokenizer to the text.
    Print out the split lines to stdout, preserving runs of two or more
    newline characters.
    """
    # Python 2 I/O: decode stdin bytes to unicode, encode back for stdout.
    text = sys.stdin.read().decode('utf-8')
    text = pre_process_paragraphs(text)
    lines = text.split('\n')
    result = []
    for line in lines:
        if line:
            result.extend(split_sentences(line))
        else:
            result.append(line)
    print('\n'.join(result).encode('utf-8'), end='')
# Unit tests #

def test_pre_process_paragraphs_1():
    text = "1\n2\n\n3"
    expect = "1 2\n\n3"
    actual = pre_process_paragraphs(text)
    assert expect == actual


def test_pre_process_paragraphs_beg_end_blanks():
    text = "\n1\n2\n\n3\n\n\n\n4\n\n"
    expect = "\n1 2\n\n3\n\n\n\n4\n\n"
    actual = pre_process_paragraphs(text)
    assert expect == actual


def test_split_sentences():
    text = '''"A clam for supper? a cold clam; is THAT what you mean, Mrs. Hussey?" says I, "but that's a rather cold and clammy reception in the winter time, ain't it, Gov. Hussey?"'''
    expect = ['"A clam for supper?', 'a cold clam; is THAT what you mean, Mrs. Hussey?"', 'says I, "but that\'s a rather cold and clammy reception in the winter time, ain\'t it, Gov. Hussey?"']
    actual = split_sentences(text)
    assert expect == actual
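For a quick sanity check of the honorific handling, the mangle/unmangle round trip can be exercised on its own (a hypothetical interactive session; the regexes are deterministic, so no Punkt model is needed):

>>> mangle('Is that you, Mrs. Hussey?')
'Is that you, Mrs._Hussey?'
>>> unmangle(mangle('Is that you, Mrs. Hussey?'))
'Is that you, Mrs. Hussey?'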
I'd be fine adding this as an option to https://github.com/hltcoe/anno-pipeline, though it'd need to handle SGML tags (see https://github.com/hltcoe/anno-pipeline/blob/master/scripts/split_sentences.py#L41)
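One possible shape for that (a rough sketch under assumptions of my own, not the anno-pipeline implementation): treat lines that consist solely of an SGML tag as opaque and pass them through unsplit, e.g.

SGML_TAG = re.compile(r'^\s*</?[A-Za-z][^>]*>\s*$')  # assumed tag shape

def split_unless_sgml(line):
    # Hypothetical helper: leave bare SGML tag lines intact,
    # split everything else as before.
    if SGML_TAG.match(line):
        return [line]
    return split_sentences(line)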