Created
March 24, 2010 19:32
-
-
Save fosskers/342684 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Slightly modified.
# Made it part of a module for your convenience.
# Mixin that splits text into sentences, treating '.', '!' and '?' as
# sentence terminators.
module SentenceParser
  # Matches any single character that ends a sentence.
  SENTENCE_END = /[.!?]/

  # Finds the earliest sentence-ending punctuation in the given String.
  #
  # @param line [String] text to scan
  # @return [Integer, nil] index of the first '.', '!' or '?', or nil when
  #   the text contains none of them
  def find_partition_pos(line)
    # One scan over all terminators at once yields the earliest match and is
    # nil-safe when there is no match.
    line.index(SENTENCE_END)
  end

  # Reads a text file and splits its contents into sentences. A sentence may
  # span several lines of the file; the pieces are joined with a single
  # space. Text left over after the last terminator is kept as a final entry.
  # The resulting sentences are also printed to standard output.
  #
  # @param filename [String] path of the file to read
  # @return [Array<String>] the sentences in file order; empty when the path
  #   does not name a regular file (a message is printed in that case)
  def read_file(filename)
    lines = [] # the parsed sentences
    unless File.file?(filename)
      puts "#{filename} is an invalid file."
      return lines
    end

    pending = "" # text read so far that does not yet form a full sentence
    # File.foreach opens, iterates and closes the file, so no handle leaks.
    File.foreach(filename) do |raw|
      # Join with one space; strip so blank lines and the very first line
      # do not introduce leading, trailing or doubled whitespace.
      pending = "#{pending} #{raw.strip}".strip
      pending = take_complete_sentences(pending, lines)
    end
    # Keep unterminated trailing text, but never add an empty "sentence".
    lines << pending unless pending.empty?

    puts lines # print a copy of the input text, split by sentences
    lines
  end

  private

  # Moves every complete sentence at the front of +text+ into +sentences+
  # (terminator included) and returns the unterminated remainder.
  def take_complete_sentences(text, sentences)
    while (pos = find_partition_pos(text))
      sentences << text[0..pos]
      text = text[(pos + 1)..-1].lstrip
    end
    text
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment