Created
March 24, 2010 18:47
-
-
Save zach-klippenstein/342625 to your computer and use it in GitHub Desktop.
Class for parsing sentences from various sources (strings, streams, etc.)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demonstrates how to use the SentenceParser class.
require 'sentenceparser'

parser = SentenceParser.new

# String sources: one string may hold several complete sentences...
parser << "This is a string. It has multiple sentences."

# ...and a single sentence may be spread over several strings that are
# pushed one after another.
["These are", "a few strings, splitting", "sentences across them."].each do |fragment|
  parser << fragment
end

# Streams are accepted as sources too.
parser << $stdin

# Print every parsed sentence. Further sources could still be added to the
# parser from inside this block while it is iterating.
parser.each { |sentence| puts "sentence='#{sentence}'" }
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby
# Prints, one per line, every sentence found in the files named on the
# command line. The argument "-" stands for standard input.
require 'pathname'
require 'sentenceparser'

parser = SentenceParser.new

# Without at least one argument there is nothing to parse: show usage and
# exit with a failure status.
if ARGV.empty?
  puts "usage: parsefiles.rb file ..."
  puts " If - is specified, uses stdin."
  exit 1
end

# Queue every argument as a source, in command-line order.
ARGV.each do |arg|
  parser << (arg == '-' ? $stdin : Pathname.new(arg))
end

parser.each do |sentence|
  puts sentence
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Class for parsing sentences from various sources.
require 'pathname'

# Splits text pulled from a queue of sources into individual sentences.
#
# Sources are consumed in the order in which they were added with #<<:
# * String   - parsed directly; consecutive strings are joined with a space
#              so that a sentence may be split across several of them.
# * Pathname - opened lazily as a file and then read like a stream. Files
#              opened this way are closed again once they reach EOF.
# * Stream   - any object responding to +gets+ (IO, File, StringIO, ...),
#              read line by line until a sentence end shows up.
# * anything else is converted with +to_s+ and parsed as a string.
#
# A sentence ends at '.', '?' or '!' (optionally followed by a quote) that
# is followed by whitespace or the end of a line. Text left over without
# such a terminator is completed with text from the following source, or
# returned as-is once all sources are exhausted.
class SentenceParser
  include Enumerable

  # Group 1 spans the text of one sentence up to and including its
  # terminator and an optional closing quote.
  SENTENCE_REGEX=/\s?([^\.\?!]*[\.\?!]["']?)(\s|$)/

  # A "sentence" ending in '.', one word character and one further character
  # (e.g. "e.g.") is assumed to have stopped at an abbreviation.
  ABBREVIATION_REGEX = /\.\w.$/

  def initialize
    @sources = []      # Queue of pending sources
    @opened_files = [] # Files opened here for Pathname sources
  end

  # Add a source to the parser.
  #
  # Strings are copied, so the caller's object is never modified and frozen
  # strings are accepted. Returns self, which allows chaining:
  #   parser << "one" << "two"
  def <<(source)
    @sources << (source.is_a?(String) ? source.dup : source)
    self
  end

  # Yields every remaining sentence, consuming the sources while doing so.
  # Without a block an Enumerator is returned instead.
  def each
    return enum_for(:each) unless block_given?

    while (sentence = self.next)
      yield sentence
    end
  end

  # Returns the next sentence with surrounding whitespace stripped, or nil
  # once every source has been used up.
  def next
    sentence = pull_sentence
    return nil if sentence.nil?

    # The split happened at an abbreviation rather than at a real sentence
    # end, so what follows still belongs to this sentence.
    if sentence =~ ABBREVIATION_REGEX
      continuation = self.next
      sentence = "#{sentence} #{continuation}" unless continuation.nil?
    end
    sentence
  end

  private

  # Works through the source queue until a sentence can be produced.
  # Every pass either removes the front source, turns it into a kind of
  # source that is handled directly, or returns. Returns nil when the queue
  # runs empty.
  def pull_sentence
    until @sources.empty?
      source = @sources.first
      case source
      when nil
        @sources.shift
      when String
        if source.empty?
          @sources.shift
        else
          return sentence_from_string
        end
      when Pathname
        open_path_source
      else
        if stream_source?(source)
          buffer_stream
        else
          # Unknown type: parse its string form on the next pass. The copy
          # guarantees a mutable string even if to_s returns a frozen one.
          @sources[0] = source.to_s.dup
        end
      end
    end
    nil
  end

  # Cuts one sentence off the string at the front of the queue.
  def sentence_from_string
    text = @sources.shift

    # Aggregate the directly following string sources to increase the
    # probability of getting a whole sentence.
    until contains_eos?(text) || !@sources.first.is_a?(String)
      text << ' ' << @sources.shift
    end

    # Collapse every run of whitespace, newlines included, into one space.
    text.gsub!(/\s+/, ' ')

    match = text.match(SENTENCE_REGEX)
    if match
      # Take everything up to the end of the sentence and keep the rest
      # queued for the following call.
      sentence = text.slice!(0, match.end(1))
      @sources.unshift(text) unless text.empty?
      return sentence.strip
    end

    # No sentence end in the available strings: the fragment is the start
    # of whatever the next source yields. A whitespace-only fragment
    # contributes nothing and must not surface as an empty sentence.
    fragment = text.strip
    remainder = self.next
    return remainder if fragment.empty?
    return fragment if remainder.nil?

    "#{fragment} #{remainder}"
  end

  # Reads lines from the stream at the front of the queue, moving on to
  # directly following streams at EOF, until the collected text contains a
  # sentence end or no stream is left. The text is then queued as a string
  # source in front of whatever stream is still open.
  def buffer_stream
    buffer = ''.dup
    stream = @sources.first
    while stream_source?(stream) && !contains_eos?(buffer)
      line = stream.gets
      if line.nil?
        # EOF, get next source
        release(@sources.shift)
        # Keep the last word of this stream apart from the first word of
        # the next one when the final line had no trailing newline.
        buffer << ' ' unless buffer.empty?
        stream = @sources.first
      else
        buffer << line
      end
    end
    @sources.unshift(buffer) unless buffer.empty?
  end

  # Replaces the Pathname at the front of the queue with the opened file.
  # The queue is only touched after the file was opened successfully;
  # errors raised by File.new propagate to the caller.
  def open_path_source
    file = File.new(@sources.first, 'r')
    @opened_files << file
    @sources[0] = file
  end

  # Closes a stream that has reached EOF, but only if this parser opened
  # it. Streams handed in by the caller (e.g. $stdin) stay open.
  def release(stream)
    stream.close if @opened_files.delete(stream)
  end

  # True if the source can be read line by line with +gets+.
  def stream_source?(source)
    !source.is_a?(String) && !source.is_a?(Pathname) && source.respond_to?(:gets)
  end

  # True if the string contains at least one sentence terminator that is
  # followed by whitespace or the end of a line.
  def contains_eos?(str)
    !(str =~ SENTENCE_REGEX).nil?
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment