Created
February 21, 2010 00:18
-
-
Save lukas/310002 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'json' | |
require 'ruby-crowdflower' | |
require 'builder' | |
require 'sinatra' | |
# Authenticate every later CrowdFlower API call made by this script.
# NOTE(review): the API key is committed in source; consider loading it from
# the environment and rotating this value.
CrowdFlower.connect!('6d96e6206e025396d62814c905eebc0687349b42')
#JobId = 4859
# Primary job id, taken from the first line of the "job_id" file that sits next
# to this script. The block form of File.open closes the handle as soon as the
# line has been read (the previous one-liner left it open).
# NOTE(review): Ruby 3.0+ rejects class-variable access from the top level; the
# name is kept because the routes below read @@jobId.
@@jobId = File.open(File.dirname(__FILE__) + "/job_id") { |f| f.readline.chomp.to_i }
# Translation table used when converting a CrowdFlower judgment into a feed
# entry: each key is a field name looked up in the CrowdFlower judgment, each
# value is the name the field is stored under in the converted hash (and later
# emitted as an XML element name by generate_feed).
# Frozen so the shared table cannot be modified by accident at runtime.
Mapping = {
  "address" => "address",
  "city" => "city",
  "first_name" => "firstname",
  "last_name" => "lastname",
  "department" => "department",
  "author" => "author",
  "_updated_at" => "updated",
  "categoryaid_request" => "categorization",
  "notes" => "notes",
  "sms_translation" => "summary",
  "gender" => "gender",
  "carrier_id" => "carrierid",
  "status" => "status",
  "title" => "sms"
}.freeze
# Reduces one CrowdFlower field value to a single raw value.
#
# cf_j - the field value as found in a judgment. Handled shapes:
#        * String          -> returned unchanged
#        * Time            -> its to_s representation
#        * Array           -> its first element
#        * anything indexable whose "res" entry is set -> first item of "res"
#
# Returns the extracted value.
# Raises RuntimeError with a message naming the offending value when none of
# the shapes above applies (including nil).
def get_raw_judgment(cf_j)
  case cf_j
  when String then return cf_j
  when Time   then return cf_j.to_s
  when Array  then return cf_j.first
  end

  # Only index into objects that support it, so nil ends up in the descriptive
  # error below instead of a NoMethodError.
  res = cf_j["res"] if cf_j.respond_to?(:[])
  return res[0] if res

  raise "Cannot extract a raw judgment from #{cf_j.inspect} (#{cf_j.class})"
end
# Converts one CrowdFlower judgment into the flat hash used to build a feed entry.
#
# id          - identifier stored under "id" in the result.
# cf_judgment - hash-like judgment; fields are looked up by the keys of Mapping
#               plus "latitude" and "longitude".
#
# Every Mapping target is always present in the result; a field that is missing
# from the judgment becomes "". "georss:point" is "<latitude> <longitude>", or
# "" when either coordinate is missing (previously a missing coordinate raised
# and aborted the whole conversion).
#
# Returns the converted Hash. Prints the id and each mapped field to stdout as
# a debugging trace.
def crowdflower_judgment_to_u_judgment(id, cf_judgment)
  puts "ID: #{id}"
  #puts cf_judgment.inspect
  u_judgment = {}
  Mapping.each_pair do |cf_term, u_term|
    raw = cf_judgment[cf_term]
    if raw
      puts cf_term
      puts raw
      puts raw.class
      u_judgment[u_term] = get_raw_judgment(raw)
    else
      u_judgment[u_term] = ""
    end
  end
  u_judgment["id"] = id

  latitude = cf_judgment["latitude"]
  longitude = cf_judgment["longitude"]
  u_judgment["georss:point"] =
    if latitude && longitude
      "#{get_raw_judgment(latitude)} #{get_raw_judgment(longitude)}"
    else
      "" # same convention as the other missing fields
    end
  u_judgment
end
# Formats a stored timestamp for the feed.
#
# value - timestamp text (anything Time.parse accepts).
#
# Returns the xmlschema form of the parsed time, or the value unchanged when it
# is blank or unparseable, so one record without a timestamp cannot abort the
# whole feed.
def feed_timestamp(value)
  Time.parse(value.to_s).xmlschema
rescue ArgumentError
  value
end

# Renders converted judgments as an Atom document with GeoRSS points.
#
# u_judgments - Array of hashes as produced by crowdflower_judgment_to_u_judgment.
#               Each hash becomes one <entry>; every key/value pair is emitted
#               as a child element named after the key. "updated" values are
#               reformatted through feed_timestamp.
#
# The feed-level <updated> element is taken from the first judgment and is
# omitted when the list is empty.
#
# Returns the XML document as a String.
def generate_feed(u_judgments)
  # Builder appends to this string in place, so it must be mutable.
  buffer = String.new(
    %(<?xml version="1.0" encoding="utf-8"?>\n) +
    %(<feed xmlns="http://www.w3.org/2005/Atom" xmlns:georss="http://www.georss.org/georss">\n)
  )
  xml = Builder::XmlMarkup.new(:indent => 2, :target => buffer)
  xml.title "4636.crowdflower.com"
  xml.link :href => "http://4636.crowdflower.com"
  xml.id "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a"
  unless u_judgments.empty?
    xml.updated feed_timestamp(u_judgments.first["updated"])
  end
  xml.author { xml.name "CrowdFlower" }
  u_judgments.each do |u_judgment|
    xml.entry do
      xml.title "#{u_judgment["firstname"]} #{u_judgment["lastname"]} at #{u_judgment["georss:point"]}"
      xml.link :href => "http://4636.crowdflower.com"
      u_judgment.each_pair do |key, value|
        xml.tag!(key, key == "updated" ? feed_timestamp(value) : value)
      end
    end
  end
  #puts buffer
  # The opening <feed> tag was written by hand above, so close it by hand too.
  buffer << "</feed>"
  buffer
end
# Root route: answers with a fixed greeting.
get('/') { 'Hello world!' }
# [16:10:13] Robert Munro: Are they requesting all N records at once each time? | |
# [16:53:39] Brian Herbert: no | |
# [16:53:44] Brian Herbert: gotta paginate | |
# [16:53:51] Brian Herbert: &limit=0,10 | |
# [16:53:55] Robert Munro: clear | |
# [16:53:56] Brian Herbert: &limit=10,10 | |
# [16:53:58] Brian Herbert: etc | |
# [16:53:59] Brian Herbert: also | |
# [16:54:05] Brian Herbert: we have timestamp filtering | |
# [16:54:44] Brian Herbert: &uptots=[UNIXTIMESTAMP] and &sincets=[UNIXTIMESTAMP] | |
# [16:54:56] Brian Herbert: and &category=4,5,6 | |
# [16:55:08] Brian Herbert: or &category=4a,4b | |
# [16:55:08] Brian Herbert: and &carrierid=1 or 2 | |
# Decides whether a converted judgment passes the request's filters.
#
# u_j    - converted judgment hash; reads "updated", "categorization" and
#          "carrierid".
# params - request parameters; the recognised keys are all optional:
#          :uptots    - unix timestamp; keep judgments updated strictly before it
#          :sincets   - unix timestamp; keep judgments updated strictly after it
#          :category  - comma-separated list; keep judgments whose categorization
#                       contains at least one entry as a substring
#          :carrierid - keep judgments whose "carrierid" equals this value
#
# A judgment whose "updated" value is blank or unparseable cannot satisfy a
# time filter and is rejected (previously it raised from Time.parse).
#
# Returns true when every supplied filter matches, false otherwise.
def judgment_satisfy_params?(u_j, params)
  if params[:uptots] || params[:sincets]
    updated_at =
      begin
        Time.parse(u_j["updated"].to_s)
      rescue ArgumentError
        nil
      end
    return false unless updated_at
    return false if params[:uptots] && updated_at >= Time.at(params[:uptots].to_i)
    return false if params[:sincets] && updated_at <= Time.at(params[:sincets].to_i)
  end

  if params[:category]
    categorization = u_j["categorization"]
    return false unless categorization
    # String#[] with a string argument is a substring test.
    return false unless params[:category].split(",").any? { |cat| categorization[cat] }
  end

  return false if params[:carrierid] && u_j["carrierid"] != params[:carrierid]

  true
end
# Pages through a job's judgments, converts them and keeps those that pass the
# request filters.
#
# jobId         - CrowdFlower job id to read from.
# limit         - stop fetching once at least this many matching judgments have
#                 been collected; the result may hold more than limit entries
#                 because whole pages are appended.
# filter_params - filters handed to judgment_satisfy_params?. Defaults to the
#                 ambient `params` (the current request's parameters when this
#                 is called from a route), which is what the method always used
#                 implicitly.
#
# Fetches pages of 30 judgments, starting at page 1 and giving up after page
# 1000, and stops early on the first empty page.
#
# Returns an Array of converted judgment hashes.
# Raises RuntimeError when a non-empty page is not a Hash.
def get_crowdflower_results(jobId, limit, filter_params = params)
  page_size = 30
  all_u_judgments = []
  # The job and its judgment accessor do not change between pages, so build
  # them once instead of on every iteration.
  job = CrowdFlower::Job.new(jobId)
  judgment_api = CrowdFlower::Judgment.new(job)

  (1..1000).each do |page|
    puts "-----"
    puts page_size
    puts page
    judgments = judgment_api.all(page, page_size)
    break if judgments.size == 0
    raise("Couldn't load #{page} #{page_size}") if judgments.class != Hash # some kind of error response - we need to handle better

    u_judgments = judgments.to_a.map { |id, j| crowdflower_judgment_to_u_judgment(id, j) }
    all_u_judgments += u_judgments.select { |u_j| judgment_satisfy_params?(u_j, filter_params) }
    break if all_u_judgments.size >= limit
  end
  all_u_judgments
end
# Atom feed of classified messages.
#
# Query parameters:
#   key   - shared secret; any other value yields the body "invalid key".
#   limit - "offset,count" (e.g. 0,10 then 10,10), or a single "count" with
#           offset 0. Defaults to offset 0, count 10. Negative numbers are
#           treated as 0.
#   uptots, sincets, category, carrierid - filters applied while collecting
#           results.
#
# Results come from the primary job first; if that does not yield enough
# entries to fill the requested window, job 4901 is read as well.
get '/feed' do
  content_type 'application/xml', :charset => 'utf-8'
  # NOTE(review): the feed key is committed in source; consider moving it to
  # configuration.
  return "invalid key\n\n" unless params[:key] == "yqNm7FHSwfdRb8nC2653"

  offset = 0
  limit = 10
  if params[:limit]
    first, second = params[:limit].split(",")
    if second
      offset = [first.to_i, 0].max
      limit = [second.to_i, 0].max
    elsif first
      limit = [first.to_i, 0].max
    end
  end

  # The requested window ends at offset + limit, so that many matching entries
  # must be collected before slicing; fetching only `limit` entries left every
  # page after the first empty.
  needed = offset + limit
  all_u_judgments = get_crowdflower_results(@@jobId, needed)
  if all_u_judgments.size < needed
    all_u_judgments += get_crowdflower_results(4901, needed - all_u_judgments.size)
  end

  # slice returns nil when offset lies beyond the end of the list.
  all_u_judgments = all_u_judgments.slice(offset, limit) || []
  puts all_u_judgments.length
  generate_feed(all_u_judgments)
end
# Sends the visitor on to the CrowdFlower judgment page for job 4980.
get('/label') { redirect "http://crowdflower.com/judgments/mob/4980" }
# Small HTML status page showing how many judgments the primary job still
# needs and how many messages have been classified.
get '/status' do
  status = CrowdFlower::Job.new(@@jobId).status
  aj = status['all_judgments']
  nj = status['needed_judgments']
  # Fixed amount added to the live count.
  # NOTE(review): presumably totals carried over from earlier jobs - confirm.
  classified_elsewhere = 9856 + 1714
  <<END
<html>
<body>
<p>Number of messages in queue: <b>#{nj}</b> Number of messages classified: <b>#{aj + classified_elsewhere}</b>
</body>
</html>
END
end
# Older, unfiltered variant of the feed: reads a single page from the primary
# job and returns the requested window of it.
#
# Query parameters:
#   limit - "offset,count", or a single "count" with offset 0. Defaults to
#           offset 0, count 10. Negative numbers are treated as 0.
get '/feedold' do
  offset = 0
  limit = 10
  if params[:limit]
    first, second = params[:limit].split(",")
    # The split parts are strings; convert them before doing arithmetic.
    if second
      offset = [first.to_i, 0].max
      limit = [second.to_i, 0].max
    elsif first
      limit = [first.to_i, 0].max
    end
  end

  job = CrowdFlower::Job.new(@@jobId)
  # Ask for enough rows to cover the whole window in one page.
  judgments = CrowdFlower::Judgment.new(job).all(1, offset + limit)
  # Convert to [id, judgment] pairs first so slicing is positional;
  # slice returns nil when offset lies beyond the end.
  window = judgments.to_a.slice(offset, limit) || []
  u_judgments = window.map { |id, j| crowdflower_judgment_to_u_judgment(id, j) }
  generate_feed(u_judgments)
end
#puts u_judgments[0].inspect |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment