Skip to content

Instantly share code, notes, and snippets.

@rickhull
Forked from stuckinthecloud/gist:1227555
Created September 21, 2011 18:50
Show Gist options
  • Save rickhull/1232962 to your computer and use it in GitHub Desktop.
Save rickhull/1232962 to your computer and use it in GitHub Desktop.
Refactored TextAnalyzer
#!/usr/bin/ruby
# Text Analyzer
# Author: Zach
# Purpose: Utilize Ruby to analyze Text files
# and generate statistical information therein.
require 'cgi'
# Words excluded from the "non-fluff" statistics. Every line of
# conf/stop_words.txt (resolved against the current working directory)
# becomes one entry with its line terminator removed.
# Frozen so the shared list cannot be mutated by accident.
STOPWORDS = File.readlines('conf/stop_words.txt').map(&:chomp).freeze
#### LIBRARY SECTION ####
# Number of characters in +text+, whitespace included.
def count_characters(text)
  text.size
end
# Number of characters in +text+ once every run of whitespace
# (spaces, tabs, newlines, ...) has been removed.
def count_characters_less_spaces(text)
  without_whitespace = text.gsub(/\s+/, '')
  count_characters(without_whitespace)
end
# Number of whitespace-separated tokens in +text+.
# Leading and trailing whitespace never produce empty tokens.
def count_words(text)
  tokens = text.split
  tokens.size
end
# Number of sentences in +text+.
#
# A sentence is a non-blank run of text delimited by one or more of
# '.', '?' or '!'. Blank fragments are ignored, so neither the whitespace
# that follows the final terminator (e.g. a trailing newline) nor repeated
# punctuation such as "..." or "?!" inflates the count.
# Returns 0 for empty or whitespace-only text.
def count_sentences(text)
  text.split(/[.?!]+/).count { |fragment| !fragment.strip.empty? }
end
# Number of paragraphs in +text+.
#
# Paragraphs are separated by one or more blank lines. A blank line may
# contain spaces or tabs, and line endings may be "\n" or "\r\n".
# Blank chunks (e.g. from leading blank lines) are not counted, so any
# number of consecutive blank lines still separates exactly two paragraphs.
# Returns 0 for empty or whitespace-only text.
def count_paragraphs(text)
  text.split(/\r?\n(?:[ \t]*\r?\n)+/).count { |chunk| !chunk.strip.empty? }
end
# Number of "\n"-separated lines in +text+.
# Interior blank lines are counted; trailing empty lines are not,
# because String#split discards trailing empty fields.
def count_lines(text)
  rows = text.split("\n")
  rows.size
end
# Whitespace-separated words of +text+ that are not listed in STOPWORDS,
# in their original order (duplicates kept).
#
# Splitting on /\s+/ produces an empty first token when the text starts
# with whitespace; that token is dropped so it is not mistaken for a word
# and the result stays consistent with count_words.
# Matching against STOPWORDS is exact (case- and punctuation-sensitive).
def non_stopwords(text)
  text.split(/\s+/).reject { |word| word.empty? || STOPWORDS.include?(word) }
end
# Percentage (truncated to an Integer) of the words in +text+ that are
# not stop words.
#
# Returns 0 when the text contains no words; without this guard the
# division yields NaN and NaN.to_i raises FloatDomainError.
def calc_word_percent(text)
  total_words = count_words(text)
  return 0 if total_words.zero?

  ((non_stopwords(text).length.to_f / total_words) * 100).to_i
end
# The (up to) ten most frequent non-stop-words of +text+, most frequent
# first, joined with '-- '.
#
# Words with equal frequency keep their order of first appearance, which
# makes the result deterministic. Returns an empty string when the text
# contains no non-stop-words.
def most_common_words(text)
  non_stopwords(text)
    .group_by { |word| word }
    .each_with_index
    # Sort by descending hit count; the index breaks ties by first appearance.
    .sort_by { |(_word, hits), position| [-hits.length, position] }
    # first(10) starts at the top entry and is safe on an empty list.
    .first(10)
    .map { |(word, _hits), _position| word }
    .join('-- ')
end
# Selects "ideal" sentences from +sentences+ and returns them as an Array.
#
# The sentences are ordered by length, a window is taken that starts one
# sixth of the way into that ordering and spans (offset + 1) entries, and
# only window entries containing " is" or " are" followed by a non-word
# character are kept.
def ideal_sentences(sentences)
  by_length = sentences.sort_by(&:length)
  offset = by_length.length / 6
  window = by_length[offset, offset + 1]
  window.select { |candidate| candidate =~ /\sis\W|\sare\W/ }
end
#### END LIBRARY SECTION ####
#### DRIVER SECTION ####
# Integer average of +total+ over +parts+ (truncating division).
# Returns 0 when +parts+ is zero instead of raising ZeroDivisionError.
def whole_average(total, parts)
  parts.zero? ? 0 : total / parts
end

# Builds one plain-text report per entry of +stats+.
#
# stats - Array of Hashes with the keys :text, :lines, :chars, :chrspc,
#         :words, :sent, :para, :pcnt, :bsent (Array of Strings) and
#         :mcwords.
#
# Returns an Array of Strings in the same order as +stats+. Each report
# ends with a blank line. The two averages are whole numbers and are
# reported as 0 when the divisor (paragraph or sentence count) is 0.
def build_reports(stats)
  stats.map do |stat|
    sentences_per_paragraph = whole_average(stat[:sent], stat[:para])
    words_per_sentence = whole_average(stat[:words], stat[:sent])

    [
      "#{stat[:text]} Inaugural Speech - Analysis Results",
      "Lines: #{stat[:lines]}",
      "Characters: #{stat[:chars]}.",
      "Characters - White Space: #{stat[:chrspc]}",
      "Words: #{stat[:words]}.",
      "Sentences: #{stat[:sent]}",
      "Paragraphs: #{stat[:para]}",
      "Average Sentences / Paragraph: #{sentences_per_paragraph}",
      "Average Words / Sentence: #{words_per_sentence}",
      "#{stat[:pcnt]} % of all words in the text are non-fluff words",
      "Ideal sentences include: #{stat[:bsent].join('-- ')}",
      "The top 10 most common words are: #{stat[:mcwords]}",
      '' # blank line separating consecutive reports
    ].join("\n") + "\n"
  end
end
# Reads every files/*.txt (relative to the current working directory) and
# computes the statistics for each one.
#
# Files are processed in sorted path order so that the result does not
# depend on the order in which the platform lists directory entries.
#
# Returns an Array with one Hash per file, using the keys :chars, :chrspc,
# :words, :sent, :para, :kword, :pcnt, :lines, :mcwords, :bsent and :text.
def collect_stats
  Dir.glob('files/*.txt').sort.map do |text_file|
    text = File.read(text_file)
    # Display name: file name without extension, capitalized,
    # with underscores turned into spaces.
    basename = File.basename(text_file, '.txt').capitalize.gsub('_', ' ')
    # Collapse all whitespace runs before splitting so that sentences
    # spanning several lines become single-line fragments.
    sentences = text.gsub(/\s+/, ' ').strip.split(/\.|\?|!/)

    {
      :chars   => count_characters(text),
      :chrspc  => count_characters_less_spaces(text),
      :words   => count_words(text),
      :sent    => count_sentences(text),
      :para    => count_paragraphs(text),
      :kword   => non_stopwords(text),
      :pcnt    => calc_word_percent(text),
      :lines   => count_lines(text),
      :mcwords => most_common_words(text),
      :bsent   => ideal_sentences(sentences),
      :text    => basename
    }
  end
end
# Collect the statistics for every text file found by collect_stats.
stats = collect_stats
# Turn each statistics hash into a plain-text report.
reports = build_reports(stats)

# Opening markup; the reports are shown verbatim inside <pre>.
header = <<HTML
<html>
<body>
<pre>
CS 132A Lab3
Inaugural Speech Analysis
HTML

# Closing markup: tags are closed in the reverse order they were opened.
footer = <<HTML
</pre>
</body>
</html>
HTML

# reports is an Array; join it explicitly, because interpolating the Array
# itself emits its inspect form (brackets, quotes, escaped newlines) on
# Ruby 1.9 and later.
output = <<OUT
#{header}
#{reports.join}
#{footer}
OUT

# Send the page as the CGI response body.
cgi = CGI.new
cgi.out do
  output
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment