Created
September 19, 2011 20:29
-
-
Save rickhull/1227523 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby
# Text Analyzer
# Author: Zach
# Purpose: Use Ruby to analyze text files
# and generate statistical information about them.
require 'cgi'

# Words to ignore when picking keywords: each line of conf/stop_words.txt
# becomes one entry, with its line ending removed.
# File.readlines is used because String stopped being Enumerable in Ruby 1.9,
# so File.read(...).map raises NoMethodError on any current Ruby.
STOPWORDS = File.readlines('conf/stop_words.txt').map { |line| line.chomp }.freeze
# Count every character in the text, whitespace included.
#
# text - the String to measure.
#
# Returns the character count as an Integer.
def total_chars(text)
  text.length
end
# Count the characters in the text that are not whitespace.
#
# text - the String to measure.
#
# Returns the count as reported by total_chars once all whitespace
# (spaces, tabs, newlines) has been removed.
def total_chars_minus_spaces(text)
  total_chars(text.gsub(/\s+/, ''))
end
# Count the words, sentences, paragraphs and lines of a text and pick out
# its keywords.
#
# text - the String to analyze.
#
# Returns a Hash with:
#   :word_count - number of whitespace-separated words
#   :sent_count - number of non-blank pieces between runs of '.', '?' and '!'
#   :para_count - number of non-blank chunks separated by blank lines
#   :line_count - number of lines
#   :keywords   - Array of the words that are not listed in STOPWORDS
def count_chunks(text)
  words = text.split
  # A run of terminators ("..." or "?!") ends a single sentence, and the
  # whitespace-only leftovers after the last terminator are not sentences.
  sentences = text.split(/[.?!]+/).reject { |piece| piece.strip.empty? }
  # Any amount of blank space between two newlines separates paragraphs.
  paragraphs = text.split(/\n\s*\n/).reject { |chunk| chunk.strip.empty? }
  {
    :word_count => words.length,
    :sent_count => sentences.length,
    :para_count => paragraphs.length,
    # Count lines here; text.size would be the character count.
    :line_count => text.lines.count,
    # Reuse the word list from :word_count; splitting on /\s+/ would add an
    # empty "word" whenever the text starts with whitespace.
    :keywords => words.reject { |word| STOPWORDS.include?(word) }
  }
end
# Summarize how much of the text is made of keywords and which ones occur most.
#
# word_count - total number of words in the text.
# keywords   - Array of the words that survived stop-word filtering.
#
# Returns a Hash with:
#   :pgw               - keywords as a whole-number percentage of word_count
#                        (0 when word_count is zero)
#   :most_common_words - up to ten keywords, most frequent first, joined
#                        with '-- '
def useful_words(word_count, keywords)
  total = word_count.to_f
  # Guard the empty-text case: 0.0 / 0.0 is NaN and NaN.to_i raises
  # FloatDomainError.
  percent = total.zero? ? 0 : ((keywords.length / total) * 100).to_i

  occurrences = (keywords - STOPWORDS).group_by { |word| word }
  # sort_by is not stable, so break frequency ties by first appearance to
  # keep the ranking deterministic.
  ranked = occurrences.sort_by.with_index { |(_word, hits), index| [-hits.length, index] }

  {
    :pgw => percent,
    :most_common_words => ranked.first(10).map { |word, _hits| word }.join('-- ')
  }
end
# Pick the "ideal" sentences out of a list of sentences.
#
# The sentences are ordered from shortest to longest, a window near the
# short end of that ordering is taken, and only sentences containing the
# word "is" or "are" (preceded by whitespace, followed by a non-word
# character) are kept.
#
# sentences - Array of sentence Strings.
#
# Returns a Hash with the single key :ideal_sentences mapping to the
# selected Array of Strings.
def ideal_sentences(sentences)
  by_length = sentences.sort_by { |sentence| sentence.length }
  seventh = by_length.length / 7
  # slice(start, length): start one seventh of the way in and take
  # seventh + 1 sentences from there.
  candidates = by_length.slice(seventh, seventh + 1)
  { :ideal_sentences => candidates.select { |sentence| sentence =~ /\sis\W|\sare\W/ } }
end
# Collect the :text entry of every stats hash.
#
# stats - Array of Hashes, each carrying a :text key.
#
# Returns a Hash with the single key :names mapping to the Array of
# :text values, in the same order as stats.
def file_names(stats)
  { :names => stats.map { |stat| stat[:text] } }
end
# Integer quotient used for the report averages; 0 when there is nothing
# to divide by, so an empty text does not raise ZeroDivisionError.
def report_average(total, count)
  count.zero? ? 0 : total / count
end

# Render one plain-text report per stats hash.
#
# stats - Array of Hashes with the keys :text, :chars (:tot_chars,
#         :tot_chars_no_space), :chunks (:word_count, :sent_count,
#         :para_count), :words (:pgw, :most_common_words) and
#         :sent (:ideal_sentences).
#
# Returns an Array of report Strings, in the same order as stats.
def build_reports(stats)
  stats.map do |stat|
    chars = stat[:chars]
    chunks = stat[:chunks]
    words = stat[:words]
    sentences_per_paragraph = report_average(chunks[:sent_count], chunks[:para_count])
    words_per_sentence = report_average(chunks[:word_count], chunks[:sent_count])

    # <<- only allows the terminator to be indented; the report lines stay
    # at column 0 so no leading whitespace ends up in the output.
    <<-REPORT
#{stat[:text]} Inaugural Speech - Analysis Results
Total number of characters is: #{chars[:tot_chars]}.
Total number of characters less whitespace is: #{chars[:tot_chars_no_space]}
Total number of words is: #{chunks[:word_count]}.
Total number of sentences is: #{chunks[:sent_count]}
Total number of paragraphs is: #{chunks[:para_count]}
The average sentences per paragraph is: #{sentences_per_paragraph}
The average words per sentence is: #{words_per_sentence}
#{words[:pgw]} % of all words in the text are non-fluff words.
The ideal sentences are: #{stat[:sent][:ideal_sentences].join("-- ")}
The top 10 most common words are: #{words[:most_common_words]}
    REPORT
  end
end
# Analyze every .txt file in the files/ directory.
#
# Returns an Array with one Hash per file, in file-name order:
#   :chars  - { :tot_chars, :tot_chars_no_space } character counts
#   :chunks - result of count_chunks for the file's text
#   :words  - result of useful_words for the file's word count and keywords
#   :sent   - result of ideal_sentences for the file's sentences
#   :text   - the file's name without directory or .txt extension
def collect_stats
  # Dir.glob makes no ordering promise; sort so reports come out in a
  # stable order.
  Dir.glob('files/*.txt').sort.map do |text_file|
    # The content is analyzed untouched; the directory and extension are
    # stripped from the file name only.
    text = File.read(text_file)
    name = File.basename(text_file, '.txt')

    # Collapse whitespace runs, split on sentence terminators, and drop the
    # empty pieces left behind by trailing or repeated terminators.
    sentences = text.gsub(/\s+/, ' ').split(/[.?!]+/).map { |piece| piece.strip }
    sentences = sentences.reject { |piece| piece.empty? }

    # Keys match what build_reports reads from stat[:chars].
    char_stats = {
      :tot_chars => total_chars(text),
      :tot_chars_no_space => total_chars_minus_spaces(text)
    }
    chunk_stats = count_chunks(text)
    word_stats = useful_words(chunk_stats[:word_count], chunk_stats[:keywords])
    best_sentences = ideal_sentences(sentences)

    {
      :chars => char_stats,
      :chunks => chunk_stats,
      :words => word_stats,
      :sent => best_sentences,
      :text => name
    }
  end
end
# Gather the statistics for every text file, then turn them into
# plain-text reports.
stats = collect_stats
reports = build_reports(stats)

header = <<HTML
<html>
<body>
<pre>
CS 132A Lab3
Inaugural Speech Analysis
HTML

# Close the tags in the reverse of the order the header opened them.
footer = <<HTML
</pre>
</body>
</html>
HTML

# Join the reports explicitly: interpolating the Array itself prints its
# inspect form (brackets, quotes, escaped newlines) on Ruby 1.9 and later.
# The report text comes from the analyzed files, so it is HTML-escaped
# before being placed inside the <pre> element.
output = <<OUT
#{header}
#{CGI.escapeHTML(reports.join("\n"))}
#{footer}
OUT

cgi = CGI.new
cgi.out { output }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment