-
-
Save rickhull/1232962 to your computer and use it in GitHub Desktop.
Refactored TextAnalyzer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby | |
# Text Analyzer
# Author: Zach
# Purpose: Use Ruby to analyze text files
# and generate statistical information about them.
require 'cgi' | |
# Stop words, one per line in conf/stop_words.txt, with line endings removed.
# The path is relative, so it is resolved against the current working directory.
STOPWORDS = File.readlines('conf/stop_words.txt').map(&:chomp)
#### LIBRARY SECTION #### | |
# Returns the total number of characters in +text+, whitespace included.
def count_characters(text)
  text.size
end
# Returns the number of characters in +text+ once every run of whitespace
# (spaces, tabs, newlines) has been removed.
def count_characters_less_spaces(text)
  without_whitespace = text.gsub(/\s+/, '')
  count_characters(without_whitespace)
end
# Returns the number of whitespace-separated tokens in +text+.
# Leading and trailing whitespace never produce empty tokens.
def count_words(text)
  text.split.size
end
# Returns the number of sentences in +text+.
#
# A sentence is a non-blank run of text delimited by '.', '?' or '!'.
# Consecutive terminators (e.g. "...") count as one delimiter, and
# whitespace-only fragments -- such as the newline that follows the final
# period of a file -- are not counted.
def count_sentences(text)
  text.split(/[.?!]+/).count { |fragment| !fragment.strip.empty? }
end
# Returns the number of paragraphs in +text+.
#
# Paragraphs are separated by one or more blank lines. A "blank" line may
# contain whitespace (including the "\r" of a CRLF line ending). Leading or
# repeated blank lines do not create empty paragraphs.
def count_paragraphs(text)
  text.split(/\n\s*\n/).count { |paragraph| !paragraph.strip.empty? }
end
# Returns the number of newline-separated lines in +text+.
# Blank lines in the middle are counted; trailing empty lines are not.
def count_lines(text)
  text.split("\n").size
end
# Returns the words of +text+ (in order, duplicates kept) that are not listed
# in STOPWORDS. Matching is exact, so it is case- and punctuation-sensitive.
#
# Tokenises with a bare String#split, the same way count_words does, so text
# that starts with whitespace does not yield a spurious empty-string "word".
def non_stopwords(text)
  text.split.reject { |word| STOPWORDS.include?(word) }
end
# Returns the whole-number percentage (rounded down) of words in +text+ that
# are not stop words.
#
# Returns 0 for text without any words instead of raising FloatDomainError
# (0.0 / 0 is NaN, which cannot be converted to an Integer). Uses integer
# arithmetic so float rounding cannot under-report by one: 29 of 100 is 29,
# whereas (29.0 / 100) * 100 truncates to 28.
def calc_word_percent(text)
  total_words = count_words(text)
  return 0 if total_words.zero?

  non_stopwords(text).length * 100 / total_words
end
# Returns the (up to) ten most frequent non-stop words of +text+, most
# frequent first, joined with '-- '.
#
# The ranking starts at the most frequent word; the previous [1..10] slice
# skipped it and raised NoMethodError when the text had no usable words.
# Words with equal counts keep their order of first appearance. An empty
# string is returned when there is nothing to rank.
def most_common_words(text)
  counts = Hash.new(0)
  non_stopwords(text).each { |word| counts[word] += 1 }

  # Sort by descending count; the insertion index breaks ties deterministically.
  ranked = counts.each_with_index.sort_by { |(_word, hits), position| [-hits, position] }
  ranked.first(10).map { |(word, _hits), _position| word }.join('-- ')
end
# Picks candidate "ideal" sentences from +sentences+ and returns them as an
# Array.
#
# The sentences are ordered by length, a window of (n / 6) + 1 sentences
# starting at index n / 6 is taken, and only those containing " is" or " are"
# followed by a non-word character are kept.
def ideal_sentences(sentences)
  by_length = sentences.sort_by(&:length)
  offset = by_length.length / 6
  window = by_length.slice(offset, offset + 1)

  linking_verb = /\sis\W|\sare\W/
  window.select { |candidate| candidate =~ linking_verb }
end
#### END LIBRARY SECTION #### | |
#### DRIVER SECTION #### | |
# Builds one plain-text report per entry of +stats+.
#
# stats - Array of Hashes with the keys :text, :lines, :chars, :chrspc,
#         :words, :sent, :para, :pcnt, :bsent (Array of Strings) and :mcwords.
#
# Returns an Array of report Strings, in the same order as +stats+.
#
# Averages use integer division. When a text has no sentences or no
# paragraphs the corresponding average is reported as 0 instead of raising
# ZeroDivisionError.
def build_reports(stats)
  stats.map do |stat|
    sentences_per_paragraph = stat[:para].zero? ? 0 : stat[:sent] / stat[:para]
    words_per_sentence = stat[:sent].zero? ? 0 : stat[:words] / stat[:sent]

    # The heredoc body stays flush-left so no indentation leaks into the report.
    <<-REPORT
#{stat[:text]} Inaugural Speech - Analysis Results
Lines: #{stat[:lines]}
Characters: #{stat[:chars]}.
Characters - White Space: #{stat[:chrspc]}
Words: #{stat[:words]}.
Sentences: #{stat[:sent]}
Paragraphs: #{stat[:para]}
Average Sentences / Paragraph: #{sentences_per_paragraph}
Average Words / Sentence: #{words_per_sentence}
#{stat[:pcnt]} % of all words in the text are non-fluff words
Ideal sentences include: #{stat[:bsent].join("-- ")}
The top 10 most common words are: #{stat[:mcwords]}\n
    REPORT
  end
end
# Analyses every "files/*.txt" file (path relative to the working directory)
# and returns an Array with one Hash of statistics per file.
#
# Each Hash holds the raw counts, the non-stop-word list (:kword), the
# candidate ideal sentences (:bsent) and a display name (:text) derived from
# the file name: extension dropped, capitalised, underscores turned to spaces.
def collect_stats
  Dir.glob("files/*.txt").map do |path|
    contents = File.read(path)
    title = File.basename(path, '.txt').capitalize.gsub('_', ' ')
    # Collapse all whitespace first so sentences spanning lines stay intact.
    sentence_list = contents.gsub(/\s+/, ' ').strip.split(/\.|\?|!/)

    {
      :chars   => count_characters(contents),
      :chrspc  => count_characters_less_spaces(contents),
      :words   => count_words(contents),
      :sent    => count_sentences(contents),
      :para    => count_paragraphs(contents),
      :kword   => non_stopwords(contents),
      :pcnt    => calc_word_percent(contents),
      :lines   => count_lines(contents),
      :mcwords => most_common_words(contents),
      :bsent   => ideal_sentences(sentence_list),
      :text    => title,
    }
  end
end
# Gather the statistics for every text file; collect_stats runs each analysis
# helper on each file it finds.
stats = collect_stats
# Render one plain-text report per analysed file.
reports = build_reports(stats)

header = <<HTML
<html>
<body>
<pre>
CS 132A Lab3
Inaugural Speech Analysis
HTML

# Close the tags in the reverse of the order in which the header opened them.
footer = <<HTML
</pre>
</body>
</html>
HTML

# Join the reports explicitly: interpolating the Array itself prints its
# inspect form (brackets, quotes, escaped newlines) on Ruby 1.9 and later.
# The reports are plain text taken from the input files, so they are
# HTML-escaped before being embedded in the page.
output = <<OUT
#{header}
#{CGI.escapeHTML(reports.join)}
#{footer}
OUT

# Emit the page through CGI, which adds the HTTP headers.
cgi = CGI.new
cgi.out do
  output
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment