Eunoia · January 3, 2013 01:19
diff --git a/word_hash.rb b/word_hash.rb
 # Author::    Lucas Carlson  (mailto:[email protected])
 # Copyright:: Copyright (c) 2005 Lucas Carlson
 # License::   LGPL

 # These are extensions to the String class to provide convenience 
 # methods for the Classifier package.
 class String
  
  # Removes common punctuation symbols, returning a new string. 
  # E.g.,
  #   "Hello (greeting's), with {braces} < >...?".without_punctuation
  #   => "Hello  greetings   with  braces         "
  def without_punctuation
    tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
  end
  
  # Return a Hash of strings => ints. Each word in the string is stemmed,
  # interned, and indexes to its frequency in the document.  
  def word_hash
 	  #word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
 	  ls = self.split
 	  len = ls.length
 	  x=0
 	  arr = []
 	  while(x<len-1) do
 	    arr<<(ls[x]+" "+ls[x+1])
 	    x+=1;
    end
    word_hash_for_words(arr+gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
 	end

 	# Return a word hash without extra punctuation or short symbols, just stemmed words
 	def clean_word_hash
 		word_hash_for_words gsub(/[^\w\s]/,"").split
 	end
 	
 	private
 	
 	def word_hash_for_words(words)
 		d = Hash.new
 		words.each do |word|
 			word.downcase! if word =~ /[\w]+/
 			key = word.intern
 		#	key = word.stem.intern
 			if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
 				d[key] ||= 0
 				d[key] += 1
 			end
 		end
 		return d
 	end
 	CORPUS_SKIP_WORDS = [
 =begin
      "a",
      "again",
      "all",
      "along",
      "are",
      "also",
      "an",
      "and",
      "as",
      "at",
      "but",
      "by",
      "came",
      "can",
      "cant",
      "couldnt",
      "did",
      "didn",
      "didnt",
      "do",
      "doesnt",
      "dont",
      "ever",
      "first",
      "from",
      "have",
      "her",
      "here",
      "him",
      "how",
      "i",
      "if",
      "in",
      "into",
      "is",
      "isnt",
      "it",
      "itll",
      "just",
      "last",
      "least",
      "like",
      "most",
      "my",
      "new",
      "no",
      "not",
      "now",
      "of",
      "on",
      "or",
      "should",
      "sinc",
      "so",
      "some",
      "th",
      "than",
      "this",
      "that",
      "the",
      "their",
      "then",
      "those",
      "to",
      "told",
      "too",
      "true",
      "try",
      "until",
      "url",
      "us",
      "were",
      "when",
      "whether",
      "while",
      "with",
      "within",
      "yes",
      "you",
      "youll",
 =end

      ]
 end
	# Author:: Lucas Carlson (mailto:[email protected])
	# Copyright:: Copyright (c) 2005 Lucas Carlson
	# License:: LGPL

	# These are extensions to the String class to provide convenience
	# methods for the Classifier package.
	class String

	# Removes common punctuation symbols, returning a new string.
	# E.g.,
	# "Hello (greeting's), with {braces} < >...?".without_punctuation
	# => "Hello greetings with braces "
	def without_punctuation
	tr( ',?.!;:"@#$%^&*()_=+[]{}\\|<>/`~', " " ) .tr( "'\-", "")
	end

	# Return a Hash of strings => ints. Each word in the string is stemmed,
	# interned, and indexes to its frequency in the document.
	def word_hash
	#word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
	ls = self.split
	len = ls.length
	x=0
	arr = []
	while(x<len-1) do
	arr<<(ls[x]+" "+ls[x+1])
	x+=1;
	end
	word_hash_for_words(arr+gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
	end

	# Return a word hash without extra punctuation or short symbols, just stemmed words
	def clean_word_hash
	word_hash_for_words gsub(/[^\w\s]/,"").split
	end

	private

	def word_hash_for_words(words)
	d = Hash.new
	words.each do \|word\|
	word.downcase! if word =~ /[\w]+/
	key = word.intern
	# key = word.stem.intern
	if word =~ /[^\w]/ \|\| ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
	d[key] \|\|= 0
	d[key] += 1
	end
	end
	return d
	end
	CORPUS_SKIP_WORDS = [
	=begin
	"a",
	"again",
	"all",
	"along",
	"are",
	"also",
	"an",
	"and",
	"as",
	"at",
	"but",
	"by",
	"came",
	"can",
	"cant",
	"couldnt",
	"did",
	"didn",
	"didnt",
	"do",
	"doesnt",
	"dont",
	"ever",
	"first",
	"from",
	"have",
	"her",
	"here",
	"him",
	"how",
	"i",
	"if",
	"in",
	"into",
	"is",
	"isnt",
	"it",
	"itll",
	"just",
	"last",
	"least",
	"like",
	"most",
	"my",
	"new",
	"no",
	"not",
	"now",
	"of",
	"on",
	"or",
	"should",
	"sinc",
	"so",
	"some",
	"th",
	"than",
	"this",
	"that",
	"the",
	"their",
	"then",
	"those",
	"to",
	"told",
	"too",
	"true",
	"try",
	"until",
	"url",
	"us",
	"were",
	"when",
	"whether",
	"while",
	"with",
	"within",
	"yes",
	"you",
	"youll",
	=end

	]
	end