Created
January 3, 2013 01:19
-
-
Save Eunoia/4440017 to your computer and use it in GitHub Desktop.
My version of word_hash.rb, which I use with the classifier gem (https://github.com/cardmagic/classifier). It does not remove stop words, nor does it stem words. It also adds bigrams to the bag of words (lines 21-28). This file lets the classifier look at the corpus more holistically, acknowledging that a corpus is not necessarily a bag of words, but that wo…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author:: Lucas Carlson (mailto:[email protected])
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL
# These are extensions to the String class to provide convenience
# methods for the Classifier package.
class String
  # Words that word_hash_for_words refuses to count. The list is currently
  # EMPTY: every candidate stop word below is commented out, so no stop words
  # are removed. Move entries into the array literal to re-enable them.
  # The array is left unfrozen so any existing code that appends to it keeps
  # working.
  #
  # Disabled entries:
  #   a again all along are also an and as at but by came can cant couldnt
  #   did didn didnt do doesnt dont ever first from have her here him how i
  #   if in into is isnt it itll just last least like most my new no not now
  #   of on or should sinc so some th than this that the their then those to
  #   told too true try until url us were when whether while with within yes
  #   you youll
  CORPUS_SKIP_WORDS = []

  # Removes common punctuation symbols, returning a new string.
  # Most symbols are replaced by a single space; apostrophes and hyphens are
  # deleted outright, so contractions and hyphenated words are joined.
  # E.g.,
  #   "Hello (greeting's), with {braces} < >...?".without_punctuation
  #   => "Hello  greetings   with  braces         "
  def without_punctuation
    tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " ").tr("'\-", "")
  end

  # Returns a Hash of Symbol => Integer mapping each token of the string to
  # its frequency. Tokens are NOT stemmed. Three kinds of token are counted:
  #
  # * bigrams: each pair of adjacent whitespace-separated words, joined by a
  #   single space (punctuation attached to the words is kept);
  # * plain words: the string with every non-word, non-space character
  #   removed, split on whitespace;
  # * symbols: whatever is left after blanking out every word character
  #   (i.e. runs of punctuation).
  def word_hash
    bigrams = split.each_cons(2).map { |first, second| first + " " + second }
    plain_words = gsub(/[^\w\s]/, "").split
    symbols = gsub(/[\w]/, " ").split
    word_hash_for_words(bigrams + plain_words + symbols)
  end

  # Returns a word hash built only from the plain words of the string
  # (punctuation stripped, no bigrams, no symbol tokens, no stemming).
  def clean_word_hash
    word_hash_for_words gsub(/[^\w\s]/, "").split
  end

  private

  # Counts the given tokens into a plain Hash of Symbol => Integer.
  #
  # A token containing at least one word character is downcased first.
  # A token is counted when it contains any non-word character (bigrams and
  # punctuation always qualify), or when it is longer than two characters
  # and not listed in CORPUS_SKIP_WORDS. Everything else is dropped.
  # The input strings are not modified.
  def word_hash_for_words(words)
    words.each_with_object({}) do |word, counts|
      word = word.downcase if word =~ /[\w]+/
      next unless word =~ /[^\w]/ || (!CORPUS_SKIP_WORDS.include?(word) && word.length > 2)

      # Intern only tokens that are actually counted.
      # key = word.stem.intern   # stemming intentionally disabled
      key = word.intern
      counts[key] = counts.fetch(key, 0) + 1
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment