Last active
August 13, 2016 20:34
-
-
Save BaseCase/7e7a38d45557fc012e5a8fa0ade410c3 to your computer and use it in GitHub Desktop.
Quick and dirty word frequency count for (more or less) just the text part of an HTML file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# Counts how often each word occurs in the text portion of an HTML-ish string.
#
# Everything between a "<" and the next ">" is treated as markup and skipped;
# the remaining characters are split into words on DELIMITERS and tallied
# case-insensitively (words are downcased before counting).
class WordFrequencyCounter
  # Characters that terminate the word currently being collected.
  # "<" and ">" additionally switch tag-skipping on and off in #tokenized.
  DELIMITERS = [
    " ",
    "\n",
    "<",
    ">",
    ".",
    "?",
    "!",
    ",",
    ";",
    ":",
    "—",
    "(",
    ")",
    "{",
    "}",
    "@",
  ].freeze

  # @param input_string [String] raw text, optionally containing HTML tags
  def initialize(input_string)
    @input = input_string
    @counts = Hash.new(0)
  end

  # Tallies the downcased words found in the input.
  #
  # The tally is rebuilt from scratch on every call, so calling this method
  # more than once returns the same numbers instead of accumulating them.
  #
  # @return [Hash{String => Integer}] word => number of occurrences; looking
  #   up a word that never occurred yields 0
  def count
    @counts = tokenized.each_with_object(Hash.new(0)) do |word, tally|
      tally[word.downcase] += 1
    end
  end

  private

  # Splits the input into words, ignoring characters inside <...> tags.
  #
  # @return [Array<String>] words in order of appearance, original casing
  def tokenized
    inside_tag = false
    current_word = []
    words = []

    @input.each_char do |char|
      if DELIMITERS.include?(char)
        inside_tag = true if char == "<"
        inside_tag = false if char == ">"
        # A word interrupted by "<" stays buffered until the tag closes.
        next if inside_tag || current_word.empty?

        words << current_word.join
        current_word.clear
      elsif !inside_tag
        current_word << char
      end
    end

    # Flush the word still buffered at end of input; without this, text that
    # does not end in a delimiter (or ends in an unclosed tag) loses its
    # final word.
    words << current_word.join unless current_word.empty?
    words
  end
end
# Script entry point: when this file is executed directly, read all input
# (files named on the command line, or stdin) via ARGF and print one
# "<count> <word>" line per distinct word.
if $PROGRAM_NAME == __FILE__
  frequencies = WordFrequencyCounter.new(ARGF.read).count
  frequencies.each_pair { |word, occurrences| $stdout.write "#{occurrences} #{word}\n" }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
When I run something like this:
then I get output like this (truncated from actual):