Skip to content

Instantly share code, notes, and snippets.

@melborne
Created December 10, 2011 06:09
Show Gist options
  • Select an option

  • Save melborne/1454681 to your computer and use it in GitHub Desktop.

Select an option

Save melborne/1454681 to your computer and use it in GitHub Desktop.
Class for building a word dictionary
require_relative "word_dictionary"
# Build one dictionary per text file; each file name doubles as the
# dictionary's name.
texts = %w[
  alices_adventures_in_wonderland.txt
  pride_and_prejudice.txt
  the_adventures_of_sherlock_holmes.txt
  frankenstein.txt
  hamlet.txt
  peter_pan.txt
].map { |path| WordDictionary.new(path, path) }

# Merge the reference texts into a single dictionary that is passed to
# uniq_words as the base to compare against.
bases = %w[english_literature.txt analyze_people_on_sight.txt]
base = bases.map { |path| WordDictionary.new(path, path) }.inject(:+)

# For every text, print a title derived from its file name followed by the
# 40 words returned by uniq_words, one "word:frequency" entry per line.
texts.each do |dictionary|
  title = dictionary.name[/\w+/].split('_').map(&:capitalize).join(' ')
  entries = dictionary.uniq_words(40, base).map { |word, freq| "#{word}:#{freq}" }
  puts "---- #{title} ----"
  puts entries
end
require "open-uri"
module Enumerable
  # Sorts the elements in ascending order of the value the block returns for
  # each one, then returns the leading elements selected by the range
  # 0...nth of that sorted array.
  def take_by(nth)
    ranked = sort_by { |item| yield item }
    ranked[0...nth]
  end
end
# Word-frequency dictionary built from a text string, a file, an http(s) URL
# or ARGF.
#
# The source text is lower-cased and split into tokens consisting of the
# letters a-z and apostrophes; every other character acts as a separator.
# Iterating (directly or through Enumerable) yields [word, frequency] pairs.
class WordDictionary
  include Enumerable

  # Only a string that *starts* with an explicit http(s) scheme is fetched
  # over the network. Anchoring with \A (instead of ^, which matches at the
  # start of every line) keeps multi-line text that merely contains a line
  # beginning with "http" from being mistaken for a URL.
  URL_PATTERN = %r{\Ahttps?://}i

  # Minimum frequency a word needs in a base dictionary before uniq_words
  # removes it from the receiver.
  COMMON_WORD_THRESHOLD = 10

  attr_reader :name, :words

  # Builds a dictionary directly from already-tokenised words, without
  # treating anything as a file name or URL. Used internally for derived
  # dictionaries (select and the set-like operators).
  #
  # @param words [Array<String>] tokens, expected to be lower-case already
  # @param name [String] label for the new dictionary
  # @return [WordDictionary]
  def self.from_words(words, name = 'none')
    dictionary = allocate
    dictionary.send(:store_words, words, name)
    dictionary
  end

  # @param input [String, ARGF] an http(s) URL, a readable file path, raw
  #   text (used as-is when it cannot be read as a file), or ARGF
  # @param name [String] label for the dictionary
  # @param inner_call [Boolean] when true, suppresses the warning printed
  #   when a string is not a readable file and is used as raw text
  # @raise [RuntimeError] if +input+ is none of the accepted kinds
  def initialize(input, name = 'none', inner_call = false)
    text = input_to_string(input, inner_call)
    store_words(text.downcase.scan(/[a-z']+/), name)
  end

  # Yields every [word, frequency] pair; returns an Enumerator when called
  # without a block.
  def each
    return enum_for(:each) unless block_given?

    @freq_dic.each { |pair| yield pair }
  end

  # The +nth+ most frequent entries as [word, frequency] pairs. An optional
  # block receives each frequency and keeps the entry when it returns a
  # truthy value.
  def top_by_frequency(nth, &blk)
    ranked_entries(nth, blk) { |_word, freq| -freq }
  end

  # The +nth+ least frequent entries; the optional block filters by
  # frequency exactly as in top_by_frequency.
  def bottom_by_frequency(nth, &blk)
    ranked_entries(nth, blk) { |_word, freq| freq }
  end

  # The +nth+ longest words as [word, frequency, length] triples; the
  # optional block filters by frequency.
  def top_by_length(nth, &blk)
    longest = ranked_entries(nth, blk) { |word, _freq| -word.length }
    longest.map { |word, freq| [word, freq, word.length] }
  end

  # Returns a new dictionary (same name) holding only the words that match
  # +regexp+ and, when a block is given, whose frequency makes the block
  # return a truthy value. Note that this intentionally differs from
  # Enumerable#select: it takes a pattern and returns a WordDictionary.
  def select(regexp)
    matched = @freq_dic.select do |word, freq|
      word =~ regexp && (block_given? ? yield(freq) : true)
    end
    # Expand each entry back into +freq+ occurrences so the derived
    # dictionary keeps the original counts.
    occurrences = matched.flat_map { |word, freq| [word] * freq }
    WordDictionary.from_words(occurrences, @name)
  end

  # String form of the underlying word => frequency hash.
  def to_s
    @freq_dic.to_s
  end

  # Number of distinct words.
  def size
    @freq_dic.length
  end

  # The four operators below apply the corresponding Array operator to the
  # two dictionaries' word lists and wrap the result in a new, unnamed
  # dictionary. As with Array, - removes every occurrence of the other
  # dictionary's words, while & and | drop duplicate words.
  def +(other)
    arithmetics(:+, other)
  end

  def -(other)
    arithmetics(:-, other)
  end

  def &(other)
    arithmetics(:&, other)
  end

  def |(other)
    arithmetics(:|, other)
  end

  # The +nth+ most frequent words left after removing, for every dictionary
  # in +base+, the words that occur at least COMMON_WORD_THRESHOLD times in
  # that dictionary.
  def uniq_words(nth, *base)
    remaining = base.inject(self) do |dictionary, reference|
      dictionary - reference.select(/./) { |freq| freq >= COMMON_WORD_THRESHOLD }
    end
    remaining.top_by_frequency(nth)
  end

  protected :words

  private

  # Stores the token list and derives the word => frequency table from it.
  def store_words(words, name)
    @words = words
    @freq_dic = words.each_with_object(Hash.new(0)) { |word, dic| dic[word] += 1 }
    @name = name
  end

  # Resolves the constructor argument to the text to be tokenised.
  def input_to_string(input, inner_call)
    case input
    when URL_PATTERN then read_url(input)
    when String then read_file_or_text(input, inner_call)
    when ARGF.class then input.read
    else raise "Wrong argument. ARGF, file or string are acceptable."
    end
  end

  # Downloads +url+ through open-uri. URI#open is used because open-uri no
  # longer overrides Kernel#open on Ruby 3.0 and later. On failure the error
  # is reported on standard error and the process exits with status 1; only
  # StandardError is rescued so interrupts and exit requests pass through.
  def read_url(url)
    URI.parse(url).open(&:read)
  rescue StandardError => e
    warn e.message
    exit 1
  end

  # Reads +input+ as a file; if that fails for any reason the string itself
  # is taken to be the text (with a warning unless +inner_call+ is true).
  def read_file_or_text(input, inner_call)
    File.read(input)
  rescue StandardError
    STDERR.puts "Argument has assumed as a text string" unless inner_call
    input
  end

  # Entries (optionally filtered by frequency through +filter+) sorted in
  # ascending order of the value the block returns for (word, frequency),
  # cut down to the range 0...nth.
  def ranked_entries(nth, filter)
    entries = filter ? @freq_dic.select { |_word, freq| filter.call(freq) } : @freq_dic
    sorted = entries.sort_by { |word, freq| yield(word, freq) }
    sorted[0...nth]
  end

  # Applies the Array operator +op+ to both word lists. The result is built
  # from the tokens directly, so generated text is never re-interpreted as a
  # file name or URL.
  def arithmetics(op, other)
    WordDictionary.from_words(@words.public_send(op, other.words), '')
  end
end
# Prints one line per [word, frequency] entry of +data+ in the form
# "<frequency>:<word> <stars>", where the bar of stars is scaled so that the
# largest frequency gets 60 stars and every other bar is rounded up.
#
# Returns +data+. Empty data prints nothing, and when the largest frequency
# is zero every bar is empty (instead of dividing by zero).
def pretty_print(data)
  max_stars = 60
  top_entry = data.max_by { |_word, freq| freq }
  return data if top_entry.nil? # nothing to print

  max_value = top_entry[1]
  data.each do |word, freq|
    ratio = max_value.zero? ? 0 : freq / max_value.to_f
    stars = "*" * (max_stars * ratio).ceil
    printf "%5d:%-5s %s\n", freq, word, stars
  end
end
# Demo, executed only when this file is run directly rather than required:
# prints the 40 words uniq_words returns for each sample text relative to
# the base text.
if __FILE__ == $PROGRAM_NAME
  base = WordDictionary.new('public/base.txt')
  samples = [
    WordDictionary.new('public/alice.txt', "Alice's Adventures in Wonderland"),
    WordDictionary.new('public/japanese_history.txt')
  ]
  samples.each { |sample| p sample.uniq_words(40, base) }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment