ashaw · July 26, 2012 15:37
diff --git a/tokenize.rb b/tokenize.rb
 # Break +text+ into lowercase word tokens, leaving out common stopwords.
 #
 # Typographic apostrophes (’) are converted to ASCII ones before matching.
 # A token must begin and end with a letter a-z and may contain apostrophes
 # or hyphens in between. The pattern needs at least three characters, so
 # one- and two-letter words are never returned.
 #
 # @param text [String] raw input text
 # @return [Array<String>] tokens in order of appearance, duplicates kept
 def tokenize(text)
   token_pattern = /([a-z][a-z'\-]+[a-z]+)/
   ignored = %w[
     the and to of a i in was he that it his her you as had with for she not
     at but be my on have him is said me which by so this all from they no
     were if would or when what there been one could very an
   ]

   normalized = text.tr('’', "'").downcase

   # The capture group makes scan yield one-element arrays; flatten unwraps them.
   candidates = normalized.scan(token_pattern).flatten

   # Array difference removes every occurrence of each stopword.
   candidates - ignored
 end
	# Splits +text+ into lowercase word tokens and removes common stopwords.
	#
	# Curly apostrophes (’) are normalized to ASCII apostrophes first. A token
	# starts and ends with a letter a-z and may contain apostrophes or hyphens
	# in between; the pattern requires at least three characters, so one- and
	# two-letter words are never produced.
	#
	# @param text [String] raw input text
	# @return [Array<String>] tokens in order of appearance, duplicates kept
	def tokenize(text)
		word = /([a-z][a-z'\-]+[a-z]+)/
		stopwords = ["the","and","to","of","a","i","in","was","he","that","it","his","her","you","as","had","with","for","she","not","at","but","be","my","on","have","him","is","said","me","which","by","so","this","all","from","they","no","were","if","would","or","when","what","there","been","one","could","very","an"]

		text.tr('’', "'").  # Normalize apostrophe
			downcase.         # Normalize case
			scan(word).       # Tokens (one-element arrays, due to the capture group)
			flatten.          # Flat array of matches
			reject { |w| stopwords.include?(w) } # Remove stopwords
	end