rickhull · September 21, 2011 18:50
diff --git a/TextAnalyzer-Ref b/TextAnalyzer-Ref
 #!/usr/bin/ruby
 # Text Analyzer
 # Author: Zach
 # Purpose: Utilize Ruby to analyze Text files
 # and generate statistical information therein.

 require 'cgi'

 # Stop words, one per line of conf/stop_words.txt (the path is resolved
 # against the process's working directory); line terminators are removed.
 STOPWORDS = File.readlines('conf/stop_words.txt').map(&:chomp)

 ####   LIBRARY SECTION   ####

 # Returns the total number of characters in +text+, whitespace included.
 def count_characters(text)
   text.size
 end

 # Returns the number of characters left in +text+ once every whitespace
 # character has been removed.
 def count_characters_less_spaces(text)
   without_whitespace = text.gsub(/\s/, '')
   count_characters(without_whitespace)
 end

 # Returns the number of whitespace-separated tokens in +text+.
 def count_words(text)
   tokens = text.split
   tokens.size
 end

 # Returns the number of sentences in +text+.
 #
 # A sentence ends at a run of '.', '?' or '!' characters, so "..." and "?!"
 # each close a single sentence. Fragments that hold only whitespace (for
 # example the newline after the final full stop) are not counted.
 def count_sentences(text)
   fragments = text.split(/[.?!]+/)
   fragments.reject { |fragment| fragment.strip.empty? }.length
 end

 def count_paragraphs(text)
 text.split(/\n\n/).length
 end

 # Returns the number of "\n"-separated lines in +text+. Empty lines at the
 # very end are not counted because String#split drops trailing empty fields.
 def count_lines(text)
   rows = text.split("\n")
   rows.size
 end

 # Returns the whitespace-separated words of +text+ that are not listed in
 # STOPWORDS, in their original order (duplicates kept).
 #
 # Tokenises with a bare String#split, the same way count_words does, so text
 # that starts with whitespace does not produce an empty-string "word".
 def non_stopwords(text)
   text.split.reject { |word| STOPWORDS.include?(word) }
 end

 # Returns the percentage (Integer, rounded down) of words in +text+ that are
 # not stop words.
 #
 # Returns 0 for text without any words; dividing by a zero word count would
 # otherwise yield NaN and NaN#to_i raises FloatDomainError. Integer
 # arithmetic is used so an exact ratio such as 29/100 is reported as 29
 # rather than 28 (0.29 * 100 is 28.999... in floating point).
 def calc_word_percent(text)
   total = count_words(text)
   return 0 if total.zero?

   non_stopwords(text).length * 100 / total
 end

 # Returns up to ten of the most frequent non-stop words of +text+, most
 # frequent first, joined with '-- '.
 #
 # Words with equal counts are ordered alphabetically so the result is
 # deterministic. Returns an empty String when there are no words.
 def most_common_words(text)
   # non_stopwords already excludes STOPWORDS; empty tokens are never words.
   occurrences = non_stopwords(text).reject(&:empty?).group_by { |word| word }
   ranked = occurrences.sort_by { |word, hits| [-hits.length, word] }
   # first(10) keeps the top entry; the previous [1..10] slice skipped it and
   # returned nil (then raised) when there were no words at all.
   ranked.first(10).map(&:first).join('-- ')
 end

 # Picks candidate "ideal" sentences from +sentences+ (an Array of Strings)
 # and returns them as an Array.
 #
 # The sentences are ordered by length, a window that starts one sixth of the
 # way into that ordering and spans (start + 1) entries is taken, and only
 # sentences containing whitespace + "is" or "are" + a non-word character are
 # kept.
 def ideal_sentences(sentences)
   by_length = sentences.sort_by(&:length)
   start = by_length.length / 6
   window = by_length[start, start + 1]
   window.select { |candidate| candidate =~ /\sis\W|\sare\W/ }
 end

 ####   END LIBRARY SECTION   ####

 ####   DRIVER SECTION   ####

 # Returns +total+ / +count+ using integer division, or 0 when +count+ is
 # zero (an empty input file has no sentences or paragraphs).
 def safe_average(total, count)
   count.zero? ? 0 : total / count
 end

 # Builds one plain-text report per entry of +stats+.
 #
 # stats:: Array of Hashes with the keys :text, :lines, :chars, :chrspc,
 #         :words, :sent, :para, :pcnt, :bsent (Array of Strings) and
 #         :mcwords.
 #
 # Returns an Array of report Strings in the same order as +stats+. Averages
 # are whole numbers; a zero sentence or paragraph count yields 0 instead of
 # raising ZeroDivisionError.
 def build_reports(stats)
   stats.map do |stat|
     <<-REPORT
    #{stat[:text]} Inaugural Speech - Analysis Results
    Lines:  #{stat[:lines]}
    Characters:  #{stat[:chars]}.
    Characters - White Space: #{stat[:chrspc]}
    Words: #{stat[:words]}.
    Sentences: #{stat[:sent]}
    Paragraphs: #{stat[:para]}
    Average Sentences / Paragraph: #{safe_average(stat[:sent], stat[:para])}
    Average Words / Sentence:  #{safe_average(stat[:words], stat[:sent])}
    #{stat[:pcnt]} % of all words in the text are non-fluff words
    Ideal sentences include: #{stat[:bsent].join("-- ")}
    The top 10 most common words are: #{stat[:mcwords]}\n
     REPORT
   end
 end

 # Gathers statistics for every .txt file directly under files/ (relative to
 # the working directory).
 #
 # Returns an Array with one Hash per file. Each Hash carries the raw counts
 # (:chars, :chrspc, :words, :sent, :para, :lines), the non-stop words
 # (:kword), their percentage (:pcnt), the most common words (:mcwords), the
 # selected sentences (:bsent) and a display title (:text).
 def collect_stats
   Dir.glob("files/*.txt").map do |path|
     contents = File.read(path)
     # "some_name.txt" becomes the title "Some name".
     title = File.basename(path, '.txt').capitalize.gsub('_', ' ')
     # Collapse all whitespace runs before cutting the text into sentences.
     flattened = contents.gsub(/\s+/, ' ').strip
     sentence_list = flattened.split(/\.|\?|!/)

     {
       :chars   => count_characters(contents),
       :chrspc  => count_characters_less_spaces(contents),
       :words   => count_words(contents),
       :sent    => count_sentences(contents),
       :para    => count_paragraphs(contents),
       :kword   => non_stopwords(contents),
       :pcnt    => calc_word_percent(contents),
       :lines   => count_lines(contents),
       :mcwords => most_common_words(contents),
       :bsent   => ideal_sentences(sentence_list),
       :text    => title
     }
   end
 end
 # Collect per-file statistics, then render one plain-text report per file.
 stats = collect_stats
 reports = build_reports(stats)

 # Page prologue. Everything is emitted inside <pre> so the plain-text
 # reports keep their line breaks. Built from arrays rather than <<HTML
 # heredocs, whose terminator must sit at column 0 to be recognised.
 header = [
   '<html>',
   '<body>',
   '<pre>',
   'CS 132A Lab3',
   'Inaugural Speech Analysis'
 ].join("\n")

 # Closing tags, in the reverse of the order in which they were opened.
 footer = [
   '</pre>',
   '</body>',
   '</html>'
 ].join("\n")

 # Join the reports into a single String: interpolating the Array itself
 # prints its inspect form (brackets, quotes and literal "\n" sequences).
 # The text is HTML-escaped so '<', '>' and '&' from the input files are
 # rendered literally instead of being parsed as markup.
 body = CGI.escapeHTML(reports.join)

 output = "#{header}\n#{body}\n#{footer}\n"

 cgi = CGI.new
 cgi.out do
   output
 end
	#!/usr/bin/ruby
	# Text Analyzer
	# Author: Zach
	# Purpose: Utilize Ruby to analyze Text files
	# and generate statistical information therein.

	require 'cgi'

	# Stop words, one per line of conf/stop_words.txt (the path is resolved
	# against the process's working directory); line terminators are removed.
	STOPWORDS = File.readlines('conf/stop_words.txt').map { |line| line.chomp }

	#### LIBRARY SECTION ####

	# Returns the total number of characters in +text+, whitespace included.
	def count_characters(text)
	  text.size
	end

	# Returns the number of characters left in +text+ once every whitespace
	# character has been removed.
	def count_characters_less_spaces(text)
	  without_whitespace = text.gsub(/\s/, '')
	  count_characters(without_whitespace)
	end

	# Returns the number of whitespace-separated tokens in +text+.
	def count_words(text)
	  tokens = text.split
	  tokens.size
	end

	# Returns the number of sentences in +text+.
	#
	# A sentence ends at a run of '.', '?' or '!' characters, so "..." and "?!"
	# each close a single sentence. Fragments that hold only whitespace (for
	# example the newline after the final full stop) are not counted.
	def count_sentences(text)
	  # A character class is used: an escaped "\|" inside a regex matches a
	  # literal pipe character instead of acting as alternation.
	  fragments = text.split(/[.?!]+/)
	  fragments.reject { |fragment| fragment.strip.empty? }.length
	end

	# Returns the number of paragraphs in +text+.
	#
	# Paragraphs are separated by one or more blank lines; a blank line may
	# contain whitespace (including "\r" from Windows line endings). Empty or
	# whitespace-only chunks are not counted.
	def count_paragraphs(text)
	  chunks = text.split(/\n\s*\n/)
	  chunks.reject { |chunk| chunk.strip.empty? }.length
	end

	# Returns the number of "\n"-separated lines in +text+. Empty lines at the
	# very end are not counted because String#split drops trailing empty fields.
	def count_lines(text)
	  rows = text.split("\n")
	  rows.size
	end

	# Returns the whitespace-separated words of +text+ that are not listed in
	# STOPWORDS, in their original order (duplicates kept).
	#
	# Tokenises with a bare String#split, the same way count_words does, so text
	# that starts with whitespace does not produce an empty-string "word".
	def non_stopwords(text)
	  text.split.reject { |word| STOPWORDS.include?(word) }
	end

	# Returns the percentage (Integer, rounded down) of words in +text+ that are
	# not stop words.
	#
	# Returns 0 for text without any words; dividing by a zero word count would
	# otherwise yield NaN and NaN#to_i raises FloatDomainError. Integer
	# arithmetic is used so an exact ratio such as 29/100 is reported as 29
	# rather than 28 (0.29 * 100 is 28.999... in floating point).
	def calc_word_percent(text)
	  total = count_words(text)
	  return 0 if total.zero?

	  non_stopwords(text).length * 100 / total
	end

	# Returns up to ten of the most frequent non-stop words of +text+, most
	# frequent first, joined with '-- '.
	#
	# Words with equal counts are ordered alphabetically so the result is
	# deterministic. Returns an empty String when there are no words.
	def most_common_words(text)
	  # non_stopwords already excludes STOPWORDS; empty tokens are never words.
	  occurrences = non_stopwords(text).reject(&:empty?).group_by { |word| word }
	  ranked = occurrences.sort_by { |word, hits| [-hits.length, word] }
	  # first(10) keeps the top entry; the previous [1..10] slice skipped it and
	  # returned nil (then raised) when there were no words at all.
	  ranked.first(10).map(&:first).join('-- ')
	end

	# Picks candidate "ideal" sentences from +sentences+ (an Array of Strings)
	# and returns them as an Array.
	#
	# The sentences are ordered by length, a window that starts one sixth of the
	# way into that ordering and spans (start + 1) entries is taken, and only
	# sentences containing whitespace + "is" or "are" + a non-word character are
	# kept.
	def ideal_sentences(sentences)
	  by_length = sentences.sort_by { |sentence| sentence.length }
	  start = by_length.length / 6
	  window = by_length.slice(start, start + 1)
	  # Plain "|" alternation: an escaped "\|" would match a literal pipe.
	  window.select { |sentence| sentence =~ /\sis\W|\sare\W/ }
	end

	#### END LIBRARY SECTION ####

	#### DRIVER SECTION ####

	# Returns +total+ / +count+ using integer division, or 0 when +count+ is
	# zero (an empty input file has no sentences or paragraphs).
	def safe_average(total, count)
	  count.zero? ? 0 : total / count
	end

	# Builds one plain-text report per entry of +stats+.
	#
	# stats:: Array of Hashes with the keys :text, :lines, :chars, :chrspc,
	#         :words, :sent, :para, :pcnt, :bsent (Array of Strings) and
	#         :mcwords.
	#
	# Returns an Array of report Strings in the same order as +stats+. Averages
	# are whole numbers; a zero sentence or paragraph count yields 0 instead of
	# raising ZeroDivisionError.
	def build_reports(stats)
	  stats.map do |stat|
	    <<-REPORT
	#{stat[:text]} Inaugural Speech - Analysis Results
	Lines: #{stat[:lines]}
	Characters: #{stat[:chars]}.
	Characters - White Space: #{stat[:chrspc]}
	Words: #{stat[:words]}.
	Sentences: #{stat[:sent]}
	Paragraphs: #{stat[:para]}
	Average Sentences / Paragraph: #{safe_average(stat[:sent], stat[:para])}
	Average Words / Sentence: #{safe_average(stat[:words], stat[:sent])}
	#{stat[:pcnt]} % of all words in the text are non-fluff words
	Ideal sentences include: #{stat[:bsent].join("-- ")}
	The top 10 most common words are: #{stat[:mcwords]}\n
	    REPORT
	  end
	end

	# Gathers statistics for every .txt file directly under files/ (relative to
	# the working directory).
	#
	# Returns an Array with one Hash per file. Each Hash carries the raw counts
	# (:chars, :chrspc, :words, :sent, :para, :lines), the non-stop words
	# (:kword), their percentage (:pcnt), the most common words (:mcwords), the
	# selected sentences (:bsent) and a display title (:text).
	def collect_stats
	  Dir.glob("files/*.txt").map do |text_file|
	    text = File.read(text_file)
	    # "some_name.txt" becomes the title "Some name".
	    basename = File.basename(text_file, '.txt').capitalize.gsub('_', ' ')
	    # Collapse whitespace runs, then cut at '.', '?' or '!'. Plain "|"
	    # alternation is required: an escaped "\|" matches a literal pipe.
	    sentences = text.gsub(/\s+/, ' ').strip.split(/\.|\?|!/)

	    {
	      :chars   => count_characters(text),
	      :chrspc  => count_characters_less_spaces(text),
	      :words   => count_words(text),
	      :sent    => count_sentences(text),
	      :para    => count_paragraphs(text),
	      :kword   => non_stopwords(text),
	      :pcnt    => calc_word_percent(text),
	      :lines   => count_lines(text),
	      :mcwords => most_common_words(text),
	      :bsent   => ideal_sentences(sentences),
	      :text    => basename
	    }
	  end
	end
	# Collect per-file statistics, then render one plain-text report per file.
	stats = collect_stats
	reports = build_reports(stats)

	# Page prologue. Everything is emitted inside <pre> so the plain-text
	# reports keep their line breaks. Built from arrays rather than <<HTML
	# heredocs, whose terminator must sit at column 0 to be recognised.
	header = [
	  '<html>',
	  '<body>',
	  '<pre>',
	  'CS 132A Lab3',
	  'Inaugural Speech Analysis'
	].join("\n")

	# Closing tags, in the reverse of the order in which they were opened.
	footer = [
	  '</pre>',
	  '</body>',
	  '</html>'
	].join("\n")

	# Join the reports into a single String: interpolating the Array itself
	# prints its inspect form (brackets, quotes and literal "\n" sequences).
	# The text is HTML-escaped so '<', '>' and '&' from the input files are
	# rendered literally instead of being parsed as markup.
	body = CGI.escapeHTML(reports.join)

	output = "#{header}\n#{body}\n#{footer}\n"

	cgi = CGI.new
	cgi.out do
	  output
	end