melborne · December 10, 2011 06:09
diff --git a/uniq_words_for_wordle.rb b/uniq_words_for_wordle.rb
 require_relative "word_dictionary"

 # Texts to analyse, and the reference corpora whose words are subtracted.
 texts = %w(alices_adventures_in_wonderland.txt
            pride_and_prejudice.txt
            the_adventures_of_sherlock_holmes.txt
            frankenstein.txt
            hamlet.txt
            peter_pan.txt)
 bases = %w(english_literature.txt analyze_people_on_sight.txt)

 # Each dictionary is named after its file; the corpora are merged into one.
 texts = texts.map { |path| WordDictionary.new(path, path) }
 base = bases.map { |path| WordDictionary.new(path, path) }.inject(:+)

 # For every text: a title banner, then "word:frequency" lines for the
 # 40 entries returned by uniq_words.
 texts.each do |dictionary|
   title = dictionary.name[/\w+/].split('_').map(&:capitalize).join(' ')
   puts "---- #{title} ----"
   ranked = dictionary.uniq_words(40, base)
   puts ranked.map { |word, freq| "#{word}:#{freq}" }
 end
diff --git a/word_dictionary.rb b/word_dictionary.rb
 require "open-uri"
 # Adds a "leading n elements by sort key" helper to every Enumerable.
 module Enumerable
   # Sorts the elements by the key the block returns and keeps the elements
   # at positions 0...nth of that ordering (fewer if the collection is
   # shorter).
   def take_by(nth)
     ordered = sort_by { |item| yield(item) }
     ordered[0...nth]
   end
 end

 # Word-frequency dictionary built from raw text, a file, a URL or ARGF.
 #
 # The input is lower-cased and split into words matching /[a-z']+/; the
 # number of occurrences of every word is kept in an internal Hash.
 # Iterating (through Enumerable) yields [word, frequency] pairs.
 class WordDictionary
   include Enumerable
   attr_reader :name, :words

   # input      - an http(s) URL, a file path, a text string, or ARGF
   # name       - label stored with the dictionary
   # inner_call - true when +input+ is text generated by this class itself;
   #              such text is used verbatim and never looked up as a
   #              file or URL
   def initialize(input, name= 'none', inner_call=false)
     text = input_to_string(input, inner_call)
     @words = text.downcase.scan(/[a-z']+/)
     @freq_dic = @words.each_with_object(Hash.new(0)) { |word, dic| dic[word] += 1 }
     @name = name
   end

   # Yields every [word, frequency] pair.
   def each
     @freq_dic.each { |elem| yield elem }
   end

   # Returns up to +nth+ [word, frequency] pairs, most frequent first.
   # An optional block receives each frequency and filters the entries.
   def top_by_frequency(nth, &blk)
     take_by_value(nth, lambda { |v| -v }, &blk)
   end

   # Returns up to +nth+ [word, frequency] pairs, least frequent first.
   # An optional block receives each frequency and filters the entries.
   def bottom_by_frequency(nth, &blk)
     take_by_value(nth, lambda { |v| v }, &blk)
   end

   # Returns up to +nth+ [word, frequency, length] triples, longest word
   # first. An optional block receives each frequency and filters the entries.
   def top_by_length(nth, &blk)
     list = take_by_key(nth, lambda { |word| -word.length }, &blk)
     list.map { |word, freq| [word, freq, word.length] }
   end

   # Returns a new WordDictionary (same name) holding only the words that
   # match +regexp+, with their frequencies preserved. An optional block
   # receives each frequency and further filters the entries.
   def select(regexp)
     matched = @freq_dic.select { |word, _freq| word =~ regexp }
     matched = matched.select { |_word, freq| yield(freq) } if block_given?
     # Repeat each word as many times as it occurred so that the new
     # dictionary recounts the same frequencies.
     text = matched.map { |word, freq| "#{word} " * freq }.join(" ")
     WordDictionary.new(text, @name, true)
   end

   # String form of the internal frequency Hash.
   def to_s
     @freq_dic.to_s
   end

   # Number of distinct words.
   def size
     @freq_dic.length
   end

   # The four operators below apply the Array operator of the same name to
   # the two word lists and return the result as a new, unnamed dictionary.
   def +(other)
     arithmetics(:+, other)
   end

   def -(other)
     arithmetics(:-, other)
   end

   def &(other)
     arithmetics(:&, other)
   end

   def |(other)
     arithmetics(:|, other)
   end

   # Removes from this dictionary every word that occurs at least 10 times
   # in any of the +base+ dictionaries, then returns the +nth+ most frequent
   # remaining [word, frequency] pairs.
   def uniq_words(nth, *base)
     base.inject(self) { |dict, b| dict - b.select(/./) { |freq| freq >= 10 } }.top_by_frequency(nth)
   end

   protected :words
   private

   # Resolves +input+ to the text that should be analysed.
   #
   # Raises RuntimeError for unsupported argument types. A failed URL fetch
   # prints the error and terminates the process.
   def input_to_string(input, inner_call)
     # Text generated internally is already a word list. Probing it as a
     # path could silently swap it for the contents of a same-named file.
     return input if inner_call && input.is_a?(String)

     case input
     when %r{\Ahttps?://}
       # \A (not ^) so that only a string that *starts* with a URL scheme is
       # fetched; multi-line text containing "http" is treated as text.
       begin
         # URI#open comes from open-uri and works on old and new Rubies,
         # unlike Kernel#open, which stopped opening URLs in Ruby 3.0.
         URI.parse(input).open { |f| f.read }
       rescue StandardError => e
         puts e
         exit
       end
     when String
       begin
         File.open(input, "r") { |f| f.read }
       rescue
         # Not a readable file: fall back to using the argument as the text.
         STDERR.puts "Argument has assumed as a text string"
         input
       end
     when ARGF.class
       input.read
     else
       raise "Wrong argument. ARGF, file or string are acceptable."
     end
   end

   # Ranks entries by a key derived from the frequency.
   def take_by_value(nth, sort_opt, &blk)
     take_by_key_or_val(nth, sort_opt, lambda { |_key, val| val }, &blk)
   end

   # Ranks entries by a key derived from the word.
   def take_by_key(nth, sort_opt, &blk)
     take_by_key_or_val(nth, sort_opt, lambda { |key, _val| key }, &blk)
   end

   # Filters entries by frequency (when a block is given), sorts them by
   # sort_opt applied to the part picked by +by+, and keeps the first +nth+.
   def take_by_key_or_val(nth, sort_opt, by)
     entries = block_given? ? @freq_dic.select { |_key, val| yield(val) } : @freq_dic
     entries.take_by(nth) { |key, val| sort_opt[by[key, val]] }
   end

   # Applies the Array operator +op+ to both word lists.
   def arithmetics(op, other)
     result = @words.send(op, other.words).join(" ")
     WordDictionary.new(result, '', true)
   end
 end

 # Prints a horizontal star chart of [word, frequency] pairs to standard
 # output, one "freq:word stars" line per pair.
 #
 # data      - enumerable of [word, frequency] pairs
 # max_stars - number of stars given to the largest frequency (default 60);
 #             other bars are scaled proportionally and rounded up
 #
 # Returns the result of data.each. Prints nothing when data is empty.
 def pretty_print(data, max_stars = 60)
   top = data.max_by { |_word, freq| freq }
   return data if top.nil?

   max_value = top[1]
   data.each do |word, freq|
     # Guard against a zero maximum, which would otherwise produce NaN.
     ratio = max_value.zero? ? 0 : freq / max_value.to_f
     stars = "*" * (max_stars * ratio).ceil
     printf "%5d:%-5s %s\n", freq, word, stars
   end
 end
 
 # Demo, executed only when this file is run directly: loads a base corpus
 # and two sample texts, then prints uniq_words(40, base) for each text.
 if __FILE__ == $0
   base = WordDictionary.new('public/base.txt')
   alice = WordDictionary.new('public/alice.txt', "Alice's Adventures in Wonderland")
   jp_history = WordDictionary.new('public/japanese_history.txt')
   [alice, jp_history].each { |dictionary| p dictionary.uniq_words(40, base) }
 end
	require_relative "word_dictionary"

	# Texts to analyse, and the reference corpora whose words are subtracted.
	texts = %w(alices_adventures_in_wonderland.txt
	           pride_and_prejudice.txt
	           the_adventures_of_sherlock_holmes.txt
	           frankenstein.txt
	           hamlet.txt
	           peter_pan.txt)
	bases = %w(english_literature.txt analyze_people_on_sight.txt)

	# Each dictionary is named after its file; the corpora are merged into one.
	texts = texts.map { |path| WordDictionary.new(path, path) }
	base = bases.map { |path| WordDictionary.new(path, path) }.inject(:+)

	# For every text: a title banner, then "word:frequency" lines for the
	# 40 entries returned by uniq_words.
	texts.each do |dictionary|
	  title = dictionary.name[/\w+/].split('_').map(&:capitalize).join(' ')
	  puts "---- #{title} ----"
	  ranked = dictionary.uniq_words(40, base)
	  puts ranked.map { |word, freq| "#{word}:#{freq}" }
	end
	require "open-uri"
	# Adds a "leading n elements by sort key" helper to every Enumerable.
	module Enumerable
	  # Sorts the elements by the key the block returns and keeps the elements
	  # at positions 0...nth of that ordering (fewer if the collection is
	  # shorter).
	  def take_by(nth)
	    sort_by { |elem| yield elem }.slice(0...nth)
	  end
	end

	# Word-frequency dictionary built from raw text, a file, a URL or ARGF.
	#
	# The input is lower-cased and split into words matching /[a-z']+/; the
	# number of occurrences of every word is kept in an internal Hash.
	# Iterating (through Enumerable) yields [word, frequency] pairs.
	class WordDictionary
	  include Enumerable
	  attr_reader :name, :words

	  # input      - an http(s) URL, a file path, a text string, or ARGF
	  # name       - label stored with the dictionary
	  # inner_call - true when +input+ is text generated by this class itself;
	  #              such text is used verbatim and never looked up as a
	  #              file or URL
	  def initialize(input, name= 'none', inner_call=false)
	    text = input_to_string(input, inner_call)
	    @words = text.downcase.scan(/[a-z']+/)
	    @freq_dic = @words.each_with_object(Hash.new(0)) { |word, dic| dic[word] += 1 }
	    @name = name
	  end

	  # Yields every [word, frequency] pair.
	  def each
	    @freq_dic.each { |elem| yield elem }
	  end

	  # Returns up to +nth+ [word, frequency] pairs, most frequent first.
	  # An optional block receives each frequency and filters the entries.
	  def top_by_frequency(nth, &blk)
	    take_by_value(nth, lambda { |v| -v }, &blk)
	  end

	  # Returns up to +nth+ [word, frequency] pairs, least frequent first.
	  # An optional block receives each frequency and filters the entries.
	  def bottom_by_frequency(nth, &blk)
	    take_by_value(nth, lambda { |v| v }, &blk)
	  end

	  # Returns up to +nth+ [word, frequency, length] triples, longest word
	  # first. An optional block receives each frequency and filters the entries.
	  def top_by_length(nth, &blk)
	    list = take_by_key(nth, lambda { |word| -word.length }, &blk)
	    list.map { |word, freq| [word, freq, word.length] }
	  end

	  # Returns a new WordDictionary (same name) holding only the words that
	  # match +regexp+, with their frequencies preserved. An optional block
	  # receives each frequency and further filters the entries.
	  def select(regexp)
	    matched = @freq_dic.select { |word, _freq| word =~ regexp }
	    matched = matched.select { |_word, freq| yield(freq) } if block_given?
	    # Repeat each word as many times as it occurred so that the new
	    # dictionary recounts the same frequencies.
	    text = matched.map { |word, freq| "#{word} " * freq }.join(" ")
	    WordDictionary.new(text, @name, true)
	  end

	  # String form of the internal frequency Hash.
	  def to_s
	    @freq_dic.to_s
	  end

	  # Number of distinct words.
	  def size
	    @freq_dic.length
	  end

	  # The four operators below apply the Array operator of the same name to
	  # the two word lists and return the result as a new, unnamed dictionary.
	  def +(other)
	    arithmetics(:+, other)
	  end

	  def -(other)
	    arithmetics(:-, other)
	  end

	  def &(other)
	    arithmetics(:&, other)
	  end

	  def |(other)
	    arithmetics(:|, other)
	  end

	  # Removes from this dictionary every word that occurs at least 10 times
	  # in any of the +base+ dictionaries, then returns the +nth+ most frequent
	  # remaining [word, frequency] pairs.
	  def uniq_words(nth, *base)
	    base.inject(self) { |dict, b| dict - b.select(/./) { |freq| freq >= 10 } }.top_by_frequency(nth)
	  end

	  protected :words
	  private

	  # Resolves +input+ to the text that should be analysed.
	  #
	  # Raises RuntimeError for unsupported argument types. A failed URL fetch
	  # prints the error and terminates the process.
	  def input_to_string(input, inner_call)
	    # Text generated internally is already a word list. Probing it as a
	    # path could silently swap it for the contents of a same-named file.
	    return input if inner_call && input.is_a?(String)

	    case input
	    when %r{\Ahttps?://}
	      # \A (not ^) so that only a string that *starts* with a URL scheme is
	      # fetched; multi-line text containing "http" is treated as text.
	      begin
	        # URI#open comes from open-uri and works on old and new Rubies,
	        # unlike Kernel#open, which stopped opening URLs in Ruby 3.0.
	        URI.parse(input).open { |f| f.read }
	      rescue StandardError => e
	        puts e
	        exit
	      end
	    when String
	      begin
	        File.open(input, "r") { |f| f.read }
	      rescue
	        # Not a readable file: fall back to using the argument as the text.
	        STDERR.puts "Argument has assumed as a text string"
	        input
	      end
	    when ARGF.class
	      input.read
	    else
	      raise "Wrong argument. ARGF, file or string are acceptable."
	    end
	  end

	  # Ranks entries by a key derived from the frequency.
	  def take_by_value(nth, sort_opt, &blk)
	    take_by_key_or_val(nth, sort_opt, lambda { |_key, val| val }, &blk)
	  end

	  # Ranks entries by a key derived from the word.
	  def take_by_key(nth, sort_opt, &blk)
	    take_by_key_or_val(nth, sort_opt, lambda { |key, _val| key }, &blk)
	  end

	  # Filters entries by frequency (when a block is given), sorts them by
	  # sort_opt applied to the part picked by +by+, and keeps the first +nth+.
	  def take_by_key_or_val(nth, sort_opt, by)
	    entries = block_given? ? @freq_dic.select { |_key, val| yield(val) } : @freq_dic
	    entries.take_by(nth) { |key, val| sort_opt[by[key, val]] }
	  end

	  # Applies the Array operator +op+ to both word lists.
	  def arithmetics(op, other)
	    result = @words.send(op, other.words).join(" ")
	    WordDictionary.new(result, '', true)
	  end
	end

	# Prints a horizontal star chart of [word, frequency] pairs to standard
	# output, one "freq:word stars" line per pair.
	#
	# data      - enumerable of [word, frequency] pairs
	# max_stars - number of stars given to the largest frequency (default 60);
	#             other bars are scaled proportionally and rounded up
	#
	# Returns the result of data.each. Prints nothing when data is empty.
	def pretty_print(data, max_stars = 60)
	  top = data.max_by { |_word, freq| freq }
	  return data if top.nil?

	  max_value = top[1]
	  data.each do |word, freq|
	    # Guard against a zero maximum, which would otherwise produce NaN.
	    ratio = max_value.zero? ? 0 : freq / max_value.to_f
	    stars = "*" * (max_stars * ratio).ceil
	    printf "%5d:%-5s %s\n", freq, word, stars
	  end
	end

	# Demo, executed only when this file is run directly: loads a base corpus
	# and two sample texts, then prints uniq_words(40, base) for each text.
	if __FILE__ == $0
	  base = WordDictionary.new('public/base.txt')
	  alice = WordDictionary.new('public/alice.txt', "Alice's Adventures in Wonderland")
	  jp_history = WordDictionary.new('public/japanese_history.txt')
	  [alice, jp_history].each { |dictionary| p dictionary.uniq_words(40, base) }
	end