Created
December 10, 2011 06:09
-
-
Save melborne/1454681 to your computer and use it in GitHub Desktop.
Class for building a word dictionary
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require_relative "word_dictionary" | |
# Report the 40 most frequent words of each novel that remain after removing
# the common words of the combined reference corpus.
novel_files = %w(
  alices_adventures_in_wonderland.txt
  pride_and_prejudice.txt
  the_adventures_of_sherlock_holmes.txt
  frankenstein.txt
  hamlet.txt
  peter_pan.txt
)
reference_files = %w(english_literature.txt analyze_people_on_sight.txt)

# Each dictionary is named after the file it was built from.
novels = novel_files.map { |path| WordDictionary.new(path, path) }

# The reference dictionaries are merged into a single one with WordDictionary#+.
reference = reference_files
  .map { |path| WordDictionary.new(path, path) }
  .inject(:+)

novels.each do |novel|
  # "peter_pan.txt" -> "Peter Pan": take the leading \w+ run and title-case it.
  title = novel.name[/\w+/].split('_').map(&:capitalize).join(' ')
  puts "---- #{title} ----"
  puts novel.uniq_words(40, reference).map { |word, freq| "#{word}:#{freq}" }
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require "open-uri" | |
module Enumerable
  # Sorts the elements by the value the block returns for each of them and
  # returns the leading +nth+ elements of that ordering as an Array.
  def take_by(nth)
    ranked = sort_by { |item| yield(item) }
    ranked[0...nth]
  end
end
# Word-frequency dictionary built from literal text, a file, an http(s) URL
# or ARGF.
#
# The source text is lower-cased and split into words (runs of the letters
# a-z and apostrophes). Both the raw word list and a word => count table are
# kept. The class is Enumerable over [word, count] pairs.
class WordDictionary
  include Enumerable

  attr_reader :name, :words

  # input      - an http(s) URL, a file path, literal text, or ARGF.
  # name       - label stored with the dictionary.
  # inner_call - true when +input+ is text produced by this class itself;
  #              such text is used verbatim and is never looked up as a file
  #              or URL.
  #
  # Raises RuntimeError when +input+ is none of the accepted kinds.
  def initialize(input, name = 'none', inner_call = false)
    text = input_to_string(input, inner_call)
    @words = text.downcase.scan(/[a-z']+/)
    @freq_dic = @words.each_with_object(Hash.new(0)) { |word, dic| dic[word] += 1 }
    @name = name
  end

  # Yields every [word, count] pair. Without a block an Enumerator is
  # returned instead of raising LocalJumpError.
  def each
    return enum_for(:each) unless block_given?

    @freq_dic.each { |pair| yield pair }
  end

  # The +nth+ most frequent [word, count] pairs. An optional block receives a
  # count and filters out the pairs for which it returns a falsy value.
  def top_by_frequency(nth, &blk)
    take_by_value(nth, lambda { |count| -count }, &blk)
  end

  # The +nth+ least frequent [word, count] pairs; the block filters by count
  # as in #top_by_frequency.
  def bottom_by_frequency(nth, &blk)
    take_by_value(nth, lambda { |count| count }, &blk)
  end

  # The +nth+ longest words as [word, count, length] triples; the block
  # filters by count as in #top_by_frequency.
  def top_by_length(nth, &blk)
    list = take_by_key(nth, lambda { |word| -word.length }, &blk)
    list.map { |word, freq| [word, freq, word.length] }
  end

  # Returns a new WordDictionary (same name) holding only the words that match
  # +regexp+ and, when a block is given, whose count the block accepts.
  # Note: this intentionally replaces Enumerable#select.
  def select(regexp)
    matching = @freq_dic.select { |word, _freq| word =~ regexp }
    accepted = matching.select { |_word, freq| block_given? ? yield(freq) : freq }
    # Rebuild a text in which every word appears as often as its count.
    text = accepted.map { |word, freq| "#{word} " * freq }.join(" ")
    WordDictionary.new(text, @name, true)
  end

  def to_s
    @freq_dic.to_s
  end

  # Number of distinct words.
  def size
    @freq_dic.length
  end

  # The four operators apply the Array operator of the same name to the two
  # word lists and build a new, unnamed dictionary from the result.
  def +(other)
    arithmetics(:+, other)
  end

  def -(other)
    arithmetics(:-, other)
  end

  def &(other)
    arithmetics(:&, other)
  end

  def |(other)
    arithmetics(:|, other)
  end

  # The +nth+ most frequent words left after removing, for every dictionary
  # in +base+, the words that occur at least 10 times in it.
  def uniq_words(nth, *base)
    remaining = base.inject(self) do |dict, reference|
      dict - reference.select(/./) { |freq| freq >= 10 }
    end
    remaining.top_by_frequency(nth)
  end

  protected :words

  private

  # Resolves +input+ to the text that should be analysed.
  def input_to_string(input, inner_call)
    # Text generated by this class must never be mistaken for a path or URL
    # (a result such as "readme" could otherwise load a file of that name).
    return input if inner_call && input.is_a?(String)

    case input
    when %r{\Ahttps?://} # \A, not ^: only the start of the whole string counts
      begin
        # Kernel#open stopped handling URLs in Ruby 3.0; URI.open is the
        # open-uri entry point.
        URI.open(input, &:read)
      rescue StandardError => e
        puts e
        exit
      end
    when String
      begin
        File.open(input, "r", &:read)
      rescue StandardError
        # Not a readable file: fall back to treating the argument as text.
        STDERR.puts "Argument has assumed as a text string"
        input
      end
    when ARGF.class
      input.read
    else
      raise "Wrong argument. ARGF, file or string are acceptable."
    end
  end

  # Ranks pairs by their count, transformed through +sort_opt+.
  def take_by_value(nth, sort_opt, &blk)
    take_by_key_or_val(nth, sort_opt, lambda { |_word, count| count }, &blk)
  end

  # Ranks pairs by their word, transformed through +sort_opt+.
  def take_by_key(nth, sort_opt, &blk)
    take_by_key_or_val(nth, sort_opt, lambda { |word, _count| word }, &blk)
  end

  # Filters the pairs by count with the optional block, then returns the
  # first +nth+ pairs ordered by sort_opt[by[word, count]].
  # Relies on Enumerable#take_by.
  def take_by_key_or_val(nth, sort_opt, by)
    candidates = @freq_dic.select { |_key, val| block_given? ? yield(val) : val }
    candidates.take_by(nth) { |key, val| sort_opt[by[key, val]] }
  end

  # Applies the Array operator +op+ to both word lists and wraps the result.
  def arithmetics(op, other)
    result = @words.public_send(op, other.words).join(" ")
    WordDictionary.new(result, '', true)
  end
end
# Prints one "count:word stars" line per [word, count] entry of +data+.
# The star bar of the entry with the highest count is 60 stars long; the
# other bars are scaled proportionally (rounded up).
#
# data - any Enumerable yielding [word, count, ...] entries.
#
# Empty data prints nothing, and an all-zero maximum prints empty bars
# instead of raising.
def pretty_print(data)
  max_stars = 60
  top = data.max_by { |_word, freq| freq }
  return data if top.nil? # nothing to print

  # Index 1 is the count even when entries carry extra fields.
  max_value = top[1]
  data.each do |word, freq|
    ratio = max_value.zero? ? 0.0 : freq / max_value.to_f
    stars = "*" * (max_stars * ratio).ceil
    printf "%5d:%-5s %s\n", freq, word, stars
  end
end
# Demo, executed only when this file is run directly: print the 40 most
# frequent words of two texts after removing the common words of a base text.
if __FILE__ == $PROGRAM_NAME
  base_dict = WordDictionary.new('public/base.txt')
  alice_dict = WordDictionary.new('public/alice.txt', "Alice's Adventures in Wonderland")
  history_dict = WordDictionary.new('public/japanese_history.txt')

  [alice_dict, history_dict].each do |dict|
    p dict.uniq_words(40, base_dict)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment