Skip to content

Instantly share code, notes, and snippets.

@rtanglao
Created March 14, 2010 09:49
Show Gist options
  • Save rtanglao/331893 to your computer and use it in GitHub Desktop.
Save rtanglao/331893 to your computer and use it in GitHub Desktop.
getwords.rb
#!/usr/bin/env ruby
require 'json'
require 'net/http'
require 'pp'
require 'time'
# Words that carry no signal in a Thunderbird support forum and are stripped
# from the text before it is emitted. Order is significant only in that it is
# the order of alternatives in STOP_WORDS_PATTERN.
STOP_WORDS = %w[
  thunderbird email e-mail mail thunderbird3 tbird tbird3 tb emails mails
  e-mails tb3 tb2 support help error please new ok message messages thanks
  got page two etc e.g. i.e fix computer seems right like fine also first
  worked something trying even much every client different may since default
  problem many hi mozilla bug feature already unable using use one anyone
  however anything wrong now think found see still want might answer going
  question else used user appears line problems questions works thank really
  great good well everything mac lot nothing correct firefox people just get
  set
].freeze

# Markup fragments that are replaced by a single space.
BREAK_STOP = "<br />".freeze
S3_STOP = "http://s3.amazonaws.com/satisfaction-production/s3_images".freeze
BUGZILLA_STOP = "https://bugzilla.org/show".freeze
HREF_STOP = "href".freeze
NOFOLLOW_STOP = 'rel="nofollow"'.freeze
IMG_STOP = '<img src='.freeze
ALT_STOP = "alt=".freeze

# The markup fragments in the order they are blanked out.
MARKUP_STOPS = [
  BREAK_STOP, S3_STOP, BUGZILLA_STOP, HREF_STOP,
  NOFOLLOW_STOP, IMG_STOP, ALT_STOP
].freeze

# Matches any stop word as a whole word. Regexp.union escapes each entry, so
# "i.e" no longer matches "ice" and "e.g." no longer matches "edge". The
# trailing boundary is a negative lookahead rather than \b because \b can never
# follow an entry that ends in "." (such as "e.g.") when whitespace comes next.
STOP_WORDS_PATTERN = /\b#{Regexp.union(STOP_WORDS)}(?!\w)/

# Lower-cases +str+, deletes every stop word, and replaces each markup
# fragment with a space.
#
# str - the text to clean; nil is treated as an empty string.
#
# Returns a new String; the argument is not modified.
def removeIrrelevantWords(str)
  text = str.to_s.downcase.gsub(STOP_WORDS_PATTERN, '')
  # A String pattern makes gsub match literally, so the "." and "/" characters
  # inside the fragments are not interpreted as regex syntax.
  MARKUP_STOPS.inject(text) { |cleaned, fragment| cleaned.gsub(fragment, ' ') }
end
# Performs a GET against api.getsatisfaction.com and decodes the JSON body.
#
# url - request path relative to the site root, without the leading "/".
#
# Returns the parsed JSON when the status is 200. For any other status the
# code is reported on STDERR and an empty String is returned.
# Raises JSON::ParserError when a 200 response body is not valid JSON.
def getResponse(url)
  http = Net::HTTP.new("api.getsatisfaction.com", 80)
  # Net::HTTP#get returns only the response object on Ruby 1.9 and later; the
  # old `resp, data = http.get(...)` form leaves data nil there, so the body is
  # read from the response itself.
  resp = http.get("/" + url)
  if resp.code != "200"
    # resp.code is a String, hence %s.
    $stderr.printf("Error:%s\n", resp.code)
    return ""
  end
  JSON.parse(resp.body)
end
# Driver: pages through the Thunderbird topics (most recently active first),
# writes the cleaned subject and content of each topic to STDOUT, and writes
# the cleaned content of every reply created inside the requested date window.
# Diagnostics go to STDERR. Paging stops at the first topic whose last activity
# precedes the start of the window, or when a page yields no topics.
if ARGV.length < 6
  puts "usage: #{$0} yyyy mm dd yyyy mm dd"
  exit
end

# Both window edges are compared with strict inequalities below, so each edge
# is pushed outwards by one second: the window runs from 00:00:00 on the start
# day to 23:59:59 on the end day, inclusive.
metrics_start = Time.utc(ARGV[0], ARGV[1], ARGV[2], 0, 0) - 1
metrics_stop = Time.utc(ARGV[3], ARGV[4], ARGV[5], 23, 59, 59) + 1

topic_page = 0
end_program = false

until end_program
  topic_page += 1
  topic_url = "products/mozilla_thunderbird/topics.json?sort=recently_active&page=#{topic_page}&limit=30"
  printf(STDERR, "topic_url:%s\n", topic_url)

  begin
    topics = getResponse(topic_url)
  rescue JSON::ParserError
    # An undecodable page is skipped; carry on with the next one.
    printf(STDERR, "Parser error in topic:%s\n", topic_url)
    next
  end

  # getResponse returns "" on a non-200 status, and a page past the end of the
  # listing has no topics. Either way there is nothing more to read, so stop
  # instead of crashing on nil or paging forever.
  topic_list = topics.is_a?(Hash) ? topics["data"] : nil
  if topic_list.nil? || topic_list.empty?
    printf(STDERR, "no topics on page %d, ending program\n", topic_page)
    break
  end

  topic_list.each do |topic|
    printf(STDOUT, "%s ", removeIrrelevantWords(topic["subject"]))
    printf(STDOUT, "%s ", removeIrrelevantWords(topic["content"]))

    last_active_at = Time.parse(topic["last_active_at"]).utc
    printf(STDERR, "TOPIC last_active_at:%s\n", last_active_at)
    # Topics arrive most recently active first, so the first one that was last
    # active before the window opened means every later one is older still.
    if last_active_at < metrics_start + 1
      printf(STDERR, "ending program\n")
      end_program = true
      break
    end

    printf(STDERR, "START*** of topic\n")
    PP.pp(topic, $stderr)
    printf(STDERR, "\nEND*** of topic\n")

    reply_count = topic["reply_count"].to_i
    printf(STDERR, "reply_count:%d\n", reply_count)
    reply_page = 1

    # Replies are fetched 30 per page; reply_count is used as the number of
    # replies still to be requested.
    while reply_count > 0
      get_reply_str = "topics/#{topic["slug"]}/replies.json?sort=recently_created&page=#{reply_page}&limit=30"
      PP.pp(get_reply_str, $stderr)
      # Advance the paging state up front so that a skipped page still moves on.
      reply_count -= 30
      reply_page += 1

      begin
        replies = getResponse(get_reply_str)
      rescue JSON::ParserError
        printf(STDERR, "Parser error in reply:%s\n", get_reply_str)
        next
      end

      # A non-200 status yields "" from getResponse; skip that page.
      reply_list = replies.is_a?(Hash) ? replies["data"] : nil
      next if reply_list.nil?

      reply_list.each do |reply|
        printf(STDERR, "START*** of reply\n")
        PP.pp(reply, $stderr)
        printf(STDERR, "\nEND*** of reply\n")

        reply_created_time = Time.parse(reply["created_at"]).utc
        printf(STDERR, "RRR: reply created time:%s\n", reply_created_time)
        if reply_created_time > metrics_start && reply_created_time < metrics_stop
          printf(STDOUT, "%s ", removeIrrelevantWords(reply["content"]))
        end
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment