rtanglao · March 14, 2010 09:49
diff --git a/getwords.rb b/getwords.rb
 #!/usr/bin/env ruby
 require 'json'
 require 'net/http'
 require 'pp'
 require 'Time'

 # Words that carry no signal for the word cloud; they are deleted outright.
 # The list is matched literally (see STOP_WORD_PATTERN); duplicates are harmless.
 STOP_WORDS = [
   "thunderbird", "email", "e-mail", "mail", "thunderbird3", "tbird", "tbird3", "tb",
   "emails", "mails", "e-mails", "tb3", "tb2", "support", "help", "error", "support",
   "please", "new", "ok", "message", "messages", "thanks", "got", "page", "two", "etc",
   "etc", "e.g.", "i.e", "fix", "computer", "seems", "right", "like", "fine", "also",
   "first", "fix", "worked", "something", "trying", "even", "much", "every", 'client',
   "different", "may", "since", "default", "problem", "many", "hi", "mozilla", "bug",
   "feature", "already", "unable", "using", "use", "one", "anyone", "however",
   "anything", "wrong", "now", "think", "found", "see", "still", "want", "might",
   "answer", "going", "question", "else", "used", "user", "appears", "line", "problems",
   "questions", "works", "thank", "works", "really", "great", "good", "well",
   "everything", "mac", "lot", "nothing", "nothing", "correct", "firefox", "people",
   "just", "get", "set"
 ].freeze
 BREAK_STOP = "<br />"
 S3_STOP = "http://s3.amazonaws.com/satisfaction-production/s3_images"
 BUGZILLA_STOP = "https://bugzilla.org/show"
 HREF_STOP="href"
 NOFOLLOW_STOP='rel="nofollow"'
 IMG_STOP='<img src='
 ALT_STOP="alt="

 # Markup fragments that are replaced by a single space, in this order.
 MARKUP_STOPS = [
   BREAK_STOP, S3_STOP, BUGZILLA_STOP, HREF_STOP, NOFOLLOW_STOP, IMG_STOP, ALT_STOP
 ].freeze

 # Matches any stop word as a whole token. Built once instead of on every call.
 # Each word is escaped so that "e.g." and "i.e" match literally (unescaped,
 # their dots act as wildcards and delete words such as "edge" or "ice").
 # Look-arounds are used instead of \b because \b can never match after the
 # trailing "." of "e.g." when whitespace follows.
 STOP_WORD_PATTERN = Regexp.new(
   '(?<!\w)(?:' + STOP_WORDS.uniq.map { |word| Regexp.escape(word) }.join('|') + ')(?!\w)'
 )

 # Lower-cases +str+, deletes every stop word and replaces each markup
 # fragment in MARKUP_STOPS with a space.
 #
 # str - the text to clean; nil is treated as an empty string.
 #
 # Returns the cleaned, lower-cased String.
 def removeIrrelevantWords(str)
   text = str.to_s.downcase.gsub(STOP_WORD_PATTERN, '')
   # String patterns are matched literally by gsub, so no escaping is needed.
   MARKUP_STOPS.reduce(text) { |acc, marker| acc.gsub(marker, ' ') }
 end

 # Requests "/" + +url+ from api.getsatisfaction.com on port 80 and parses
 # the response body as JSON.
 #
 # url - request path without the leading slash (query string included).
 #
 # Returns the parsed JSON value, or "" when the status code is not "200"
 # (the code is reported on STDERR in that case).
 # Raises JSON::ParserError when the body is not valid JSON.
 def getResponse(url)
   http = Net::HTTP.new("api.getsatisfaction.com", 80)

   # On current Ruby Net::HTTP#get returns a single response object; the old
   # `resp, data = http.get(...)` form leaves data nil there, so the body is
   # read from the response instead.
   resp = http.get("/" + url)

   if resp.code != "200"
     printf(STDERR, "Error:%d\n", resp.code)
     return ""
   end

   JSON.parse(resp.body)
 end

 # Script body: walks the Thunderbird topics (most recently active first),
 # prints the cleaned subject/content of each topic and the cleaned content of
 # every reply created inside the requested date window to STDOUT, and writes
 # progress/debug output to STDERR. Stops at the first topic whose last
 # activity is before the start date.
 #
 # Arguments: start year, month, day followed by stop year, month, day.
 if ARGV.length < 6
   puts "usage: #{$0} yyyy mm dd yyyy mm dd"
   exit
 end

 # Both bounds are exclusive: one second before the first day starts and one
 # second after the last day ends (23:59:59, so the final minute is included).
 metrics_start = Time.utc(ARGV[0], ARGV[1], ARGV[2], 0, 0) - 1
 metrics_stop = Time.utc(ARGV[3], ARGV[4], ARGV[5], 23, 59, 59) + 1
 topic_page = 0
 end_program = false

 until end_program
   topic_page += 1
   topic_url = "products/mozilla_thunderbird/topics.json?sort=recently_active&page=#{topic_page}&limit=30"
   printf(STDERR, "%s\n", topic_url)

   begin
     topics = getResponse(topic_url)
   rescue JSON::ParserError
     # An unparsable page is skipped; the next page is tried.
     printf(STDERR, "Parser error in topic:%s\n", topic_url)
     next
   end

   # getResponse returns "" on a non-200 status, and a page past the last one
   # has no topics; in both cases there is nothing left to process.
   page_topics = topics.is_a?(Hash) ? topics["data"] : nil
   break if page_topics.nil? || page_topics.empty?

   page_topics.each do |topic|
     last_active_at = Time.parse(topic["last_active_at"]).utc
     printf(STDERR, "TOPIC last_active_at:%s\n", last_active_at)

     # Checked before anything is printed so that the first out-of-range
     # topic does not leak its words into the output.
     if last_active_at < metrics_start + 1
       printf(STDERR, "ending program\n")
       end_program = true
       break
     end

     printf(STDOUT, "%s ", removeIrrelevantWords(topic["subject"].to_s))
     printf(STDOUT, "%s ", removeIrrelevantWords(topic["content"].to_s))

     printf(STDERR, "START*** of topic\n")
     PP::pp(topic, $stderr)
     printf(STDERR, "\nEND*** of topic\n")

     reply_count = topic["reply_count"].to_i
     printf(STDERR, "reply_count:%d\n", reply_count)
     reply_page = 1
     slug = topic["slug"]

     # Replies are fetched 30 per page until the reported count is used up.
     while reply_count > 0
       get_reply_str = "topics/#{slug}/replies.json?sort=recently_created&page=#{reply_page}&limit=30"
       PP::pp(get_reply_str, $stderr)

       begin
         replies = getResponse(get_reply_str)
       rescue JSON::ParserError
         printf(STDERR, "Parser error in reply:%s\n", get_reply_str)
         replies = nil
       end

       # A failed or unparsable page contributes no replies but still counts
       # as one page of 30.
       reply_data = replies.is_a?(Hash) ? replies["data"] : nil
       (reply_data || []).each do |reply|
         printf(STDERR, "START*** of reply\n")
         PP::pp(reply, $stderr)
         printf(STDERR, "\nEND*** of reply\n")

         reply_created_time = Time.parse(reply["created_at"]).utc
         printf(STDERR, "RRR: reply created time:%s\n", reply_created_time)

         if reply_created_time > metrics_start && reply_created_time < metrics_stop
           printf(STDOUT, "%s ", removeIrrelevantWords(reply["content"].to_s))
         end
       end

       reply_count -= 30
       reply_page += 1
     end
   end
 end
	#!/usr/bin/env ruby
	require 'json'
	require 'net/http'
	require 'pp'
	require 'Time'

	# Words that carry no signal for the word cloud; they are deleted outright.
	# The list is matched literally (see STOP_WORD_PATTERN); duplicates are harmless.
	STOP_WORDS = [
	  "thunderbird", "email", "e-mail", "mail", "thunderbird3", "tbird", "tbird3", "tb",
	  "emails", "mails", "e-mails", "tb3", "tb2", "support", "help", "error", "support",
	  "please", "new", "ok", "message", "messages", "thanks", "got", "page", "two", "etc",
	  "etc", "e.g.", "i.e", "fix", "computer", "seems", "right", "like", "fine", "also",
	  "first", "fix", "worked", "something", "trying", "even", "much", "every", 'client',
	  "different", "may", "since", "default", "problem", "many", "hi", "mozilla", "bug",
	  "feature", "already", "unable", "using", "use", "one", "anyone", "however",
	  "anything", "wrong", "now", "think", "found", "see", "still", "want", "might",
	  "answer", "going", "question", "else", "used", "user", "appears", "line", "problems",
	  "questions", "works", "thank", "works", "really", "great", "good", "well",
	  "everything", "mac", "lot", "nothing", "nothing", "correct", "firefox", "people",
	  "just", "get", "set"
	].freeze
	BREAK_STOP = "<br />"
	S3_STOP = "http://s3.amazonaws.com/satisfaction-production/s3_images"
	BUGZILLA_STOP = "https://bugzilla.org/show"
	HREF_STOP="href"
	NOFOLLOW_STOP='rel="nofollow"'
	IMG_STOP='<img src='
	ALT_STOP="alt="

	# Markup fragments that are replaced by a single space, in this order.
	MARKUP_STOPS = [
	  BREAK_STOP, S3_STOP, BUGZILLA_STOP, HREF_STOP, NOFOLLOW_STOP, IMG_STOP, ALT_STOP
	].freeze

	# Matches any stop word as a whole token. Built once instead of on every call.
	# The words are joined with a plain "|" (a backslash-escaped pipe would be a
	# literal pipe character and no stop word would ever match). Each word is
	# escaped so that "e.g." and "i.e" match literally, and look-arounds replace
	# \b because \b can never match after the trailing "." of "e.g." when
	# whitespace follows.
	STOP_WORD_PATTERN = Regexp.new(
	  '(?<!\w)(?:' + STOP_WORDS.uniq.map { |word| Regexp.escape(word) }.join('|') + ')(?!\w)'
	)

	# Lower-cases +str+, deletes every stop word and replaces each markup
	# fragment in MARKUP_STOPS with a space.
	#
	# str - the text to clean; nil is treated as an empty string.
	#
	# Returns the cleaned, lower-cased String.
	def removeIrrelevantWords(str)
	  text = str.to_s.downcase.gsub(STOP_WORD_PATTERN, '')
	  # String patterns are matched literally by gsub, so no escaping is needed.
	  MARKUP_STOPS.reduce(text) { |acc, marker| acc.gsub(marker, ' ') }
	end

	# Requests "/" + +url+ from api.getsatisfaction.com on port 80 and parses
	# the response body as JSON.
	#
	# url - request path without the leading slash (query string included).
	#
	# Returns the parsed JSON value, or "" when the status code is not "200"
	# (the code is reported on STDERR in that case).
	# Raises JSON::ParserError when the body is not valid JSON.
	def getResponse(url)
	  http = Net::HTTP.new("api.getsatisfaction.com", 80)

	  # On current Ruby Net::HTTP#get returns a single response object; the old
	  # `resp, data = http.get(...)` form leaves data nil there, so the body is
	  # read from the response instead.
	  resp = http.get("/" + url)

	  if resp.code != "200"
	    printf(STDERR, "Error:%d\n", resp.code)
	    return ""
	  end

	  JSON.parse(resp.body)
	end

	# Script body: walks the Thunderbird topics (most recently active first),
	# prints the cleaned subject/content of each topic and the cleaned content of
	# every reply created inside the requested date window to STDOUT, and writes
	# progress/debug output to STDERR. Stops at the first topic whose last
	# activity is before the start date.
	#
	# Arguments: start year, month, day followed by stop year, month, day.
	if ARGV.length < 6
	  puts "usage: #{$0} yyyy mm dd yyyy mm dd"
	  exit
	end

	# Both bounds are exclusive: one second before the first day starts and one
	# second after the last day ends (23:59:59, so the final minute is included).
	metrics_start = Time.utc(ARGV[0], ARGV[1], ARGV[2], 0, 0) - 1
	metrics_stop = Time.utc(ARGV[3], ARGV[4], ARGV[5], 23, 59, 59) + 1
	topic_page = 0
	end_program = false

	until end_program
	  topic_page += 1
	  topic_url = "products/mozilla_thunderbird/topics.json?sort=recently_active&page=#{topic_page}&limit=30"
	  printf(STDERR, "%s\n", topic_url)

	  begin
	    topics = getResponse(topic_url)
	  rescue JSON::ParserError
	    # An unparsable page is skipped; the next page is tried.
	    printf(STDERR, "Parser error in topic:%s\n", topic_url)
	    next
	  end

	  # getResponse returns "" on a non-200 status, and a page past the last one
	  # has no topics; in both cases there is nothing left to process.
	  page_topics = topics.is_a?(Hash) ? topics["data"] : nil
	  break if page_topics.nil? || page_topics.empty?

	  page_topics.each do |topic|
	    last_active_at = Time.parse(topic["last_active_at"]).utc
	    printf(STDERR, "TOPIC last_active_at:%s\n", last_active_at)

	    # Checked before anything is printed so that the first out-of-range
	    # topic does not leak its words into the output.
	    if last_active_at < metrics_start + 1
	      printf(STDERR, "ending program\n")
	      end_program = true
	      break
	    end

	    printf(STDOUT, "%s ", removeIrrelevantWords(topic["subject"].to_s))
	    printf(STDOUT, "%s ", removeIrrelevantWords(topic["content"].to_s))

	    printf(STDERR, "START*** of topic\n")
	    PP::pp(topic, $stderr)
	    printf(STDERR, "\nEND*** of topic\n")

	    reply_count = topic["reply_count"].to_i
	    printf(STDERR, "reply_count:%d\n", reply_count)
	    reply_page = 1
	    slug = topic["slug"]

	    # Replies are fetched 30 per page until the reported count is used up.
	    while reply_count > 0
	      get_reply_str = "topics/#{slug}/replies.json?sort=recently_created&page=#{reply_page}&limit=30"
	      PP::pp(get_reply_str, $stderr)

	      begin
	        replies = getResponse(get_reply_str)
	      rescue JSON::ParserError
	        printf(STDERR, "Parser error in reply:%s\n", get_reply_str)
	        replies = nil
	      end

	      # A failed or unparsable page contributes no replies but still counts
	      # as one page of 30.
	      reply_data = replies.is_a?(Hash) ? replies["data"] : nil
	      (reply_data || []).each do |reply|
	        printf(STDERR, "START*** of reply\n")
	        PP::pp(reply, $stderr)
	        printf(STDERR, "\nEND*** of reply\n")

	        reply_created_time = Time.parse(reply["created_at"]).utc
	        printf(STDERR, "RRR: reply created time:%s\n", reply_created_time)

	        if reply_created_time > metrics_start && reply_created_time < metrics_stop
	          printf(STDOUT, "%s ", removeIrrelevantWords(reply["content"].to_s))
	        end
	      end

	      reply_count -= 30
	      reply_page += 1
	    end
	  end
	end
No results found