-
-
Save fasiha/d825e32bf1c7d8602a13 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Lemma Frequency Report
# Gem Depends: ve
# System Depends: mecab, mecab-ipadic-utf-8
# Usage: ruby freq.rb [FILE]
require 've'
# Entry point: reads Japanese text, strips Aozora Bunko ruby annotations,
# parses it with Ve, and prints a lemma frequency table.
def main
  # Input from stdin or from the files named in the arguments; UTF-8 required.
  contents = ARGF.read

  # Aozora Bunko text marks ruby (furigana) as 《reading》, which has to be
  # removed before parsing. The match is lazy (.*?) so each annotation is
  # removed on its own; a greedy .* would also delete the real text sitting
  # between the first 《 and the last 》 on a line.
  plain = contents.gsub(/《.*?》/, "")

  # Process the text; this might take a while.
  parsed = Ve.in(:ja).words(plain)

  # Count the lemmas of the words whose part of speech is not blacklisted.
  freq = count_lemmas(filter_blacklisted(parsed))

  show_count(freq)
end
# Counts how many times each lemma occurs in a list of words.
#
# @param words [Enumerable] objects responding to #lemma
# @return [Hash] lemma => occurrence count; missing keys default to 0
def count_lemmas(words)
  words.each_with_object(Hash.new(0)) do |word, lemma_counts|
    lemma = word.lemma
    # A lemma of "*" means no lemma could be found, so it is not counted.
    next if lemma == "*"

    lemma_counts[lemma] += 1
  end
end
# Removes words whose part of speech is on the blacklist
# (Ve::PartOfSpeech::Symbol and Ve::PartOfSpeech::ProperNoun).
#
# @param words [Enumerable] objects responding to #part_of_speech
# @return [Array] the words that are not blacklisted
def filter_blacklisted(words)
  pos_blacklist = [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::ProperNoun]
  words.reject { |word| pos_blacklist.include?(word.part_of_speech) }
end
# Prints one "COUNT<TAB>LEMMA" line per entry to stdout, highest count first.
#
# @param lemma_counts [Hash] lemma => occurrence count
def show_count(lemma_counts)
  # Sort ascending by count and walk the result backwards: the same order as
  # sort_by(...).reverse.each, without building a reversed copy.
  lemma_counts.sort_by { |_, count| count }.reverse_each do |lemma, count|
    puts "#{count}\t#{lemma}"
  end
end
# Run the report only when this file is executed directly, not when required.
main if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment