fasiha · August 29, 2015 14:23
diff --git a/freq.rb b/freq.rb
 # Lemma Frequency Report
 # Gem Depends: ve
 # System Depends: mecab, mecab-ipadic-utf-8
 # Usage: ruby freq.rb [FILE]
 require 've'

 # Entry point: builds and prints a lemma frequency report.
 #
 # Reads the whole input through ARGF (files named on the command line, or
 # stdin when none are given), removes ruby annotations, parses the text with
 # Ve and passes the filtered words on to count_lemmas and show_count.
 def main
   # Input from stdin or args, UTF-8 required
   contents = ARGF.read
   # Aozora Bunko text marks ruby (furigana) as 《...》, which must go.
   # The quantifier is non-greedy on purpose: a greedy `.*` would also eat
   # the ordinary text sitting between two annotations on the same line.
   plain = contents.gsub(/《.*?》/, "")
   # Process the text, this might take a while
   parsed = Ve.in(:ja).words(plain)
   # Get frequency of words not in the blacklist
   freq = count_lemmas(filter_blacklisted(parsed))
   # Show it
   show_count(freq)
 end

 # Tallies how often each lemma occurs in +words+.
 #
 # words - a collection of objects responding to #lemma.
 #
 # Returns a Hash (default value 0) mapping lemma => number of occurrences.
 # Words whose lemma is "*" are left out.
 def count_lemmas(words)
   words.each_with_object(Hash.new(0)) do |entry, tally|
     lemma = entry.lemma
     # "*" means the lemma could not be found, so there is nothing to count
     next if lemma == "*"

     tally[lemma] += 1
   end
 end

 # Drops the words whose part of speech should not appear in the report.
 #
 # words - a collection of objects responding to #part_of_speech.
 #
 # Returns the words whose part of speech is neither
 # Ve::PartOfSpeech::Symbol nor Ve::PartOfSpeech::ProperNoun.
 def filter_blacklisted(words)
   excluded = [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::ProperNoun]
   words.reject { |entry| excluded.include?(entry.part_of_speech) }
 end

 # Prints the frequency table to stdout, highest count first.
 #
 # lemma_counts - a Hash of lemma => count.
 #
 # Every output line has the form "<count>\t<lemma>".
 def show_count(lemma_counts)
   ranked = lemma_counts.sort_by { |_lemma, occurrences| occurrences }.reverse
   ranked.each { |lemma, count| puts "#{count}\t#{lemma}" }
 end

 # Script entry point: the report runs as soon as this file is loaded.
 main
	# Lemma Frequency Report
	# Gem Depends: ve
	# System Depends: mecab, mecab-ipadic-utf-8
	# Usage: ruby freq.rb [FILE]
	require 've'

	# Entry point: builds and prints a lemma frequency report.
	#
	# Reads the whole input through ARGF (files named on the command line, or
	# stdin when none are given), removes ruby annotations, parses the text with
	# Ve and passes the filtered words on to count_lemmas and show_count.
	def main
	  # Input from stdin or args, UTF-8 required
	  contents = ARGF.read
	  # Aozora Bunko text marks ruby (furigana) as 《...》, which must go.
	  # The quantifier is non-greedy on purpose: a greedy `.*` would also eat
	  # the ordinary text sitting between two annotations on the same line.
	  plain = contents.gsub(/《.*?》/, "")
	  # Process the text, this might take a while
	  parsed = Ve.in(:ja).words(plain)
	  # Get frequency of words not in the blacklist
	  freq = count_lemmas(filter_blacklisted(parsed))
	  # Show it
	  show_count(freq)
	end

	# Tallies how often each lemma occurs in +words+.
	#
	# words - a collection of objects responding to #lemma.
	#
	# Returns a Hash (default value 0) mapping lemma => number of occurrences.
	# Words whose lemma is "*" are left out.
	def count_lemmas(words)
	  words.each_with_object(Hash.new(0)) do |entry, tally|
	    lemma = entry.lemma
	    # "*" means the lemma could not be found, so there is nothing to count
	    next if lemma == "*"

	    tally[lemma] += 1
	  end
	end

	# Drops the words whose part of speech should not appear in the report.
	#
	# words - a collection of objects responding to #part_of_speech.
	#
	# Returns the words whose part of speech is neither
	# Ve::PartOfSpeech::Symbol nor Ve::PartOfSpeech::ProperNoun.
	def filter_blacklisted(words)
	  excluded = [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::ProperNoun]
	  words.reject { |entry| excluded.include?(entry.part_of_speech) }
	end

	# Prints the frequency table to stdout, highest count first.
	#
	# lemma_counts - a Hash of lemma => count.
	#
	# Every output line has the form "<count>\t<lemma>".
	def show_count(lemma_counts)
	  ranked = lemma_counts.sort_by { |_lemma, occurrences| occurrences }.reverse
	  ranked.each { |lemma, count| puts "#{count}\t#{lemma}" }
	end

	# Script entry point: the report runs as soon as this file is loaded.
	main