Created
March 11, 2011 12:39
-
-
Save newtonapple/865841 to your computer and use it in GitHub Desktop.
Generates an HTML page with the latest 1000 Twitpic images from Japan. Useful for getting a glimpse of the tsunami disaster in Japan. You'll need Ruby 1.9 and yajl-ruby to run this.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# example: ruby twitpic_extractor.rb > japan.html
require 'uri'
require 'yajl/http_stream'
require 'zlib'
# Polls a JSON search endpoint and extracts Twitpic image URLs from the text
# of the returned tweets, skipping URLs that were already seen.
class Twitpic
  # Matches a Twitpic link inside a tweet and captures the picture id.
  TWITPIC_LINK = %r{http://twitpic\.com/(?<twitpic_id>[a-zA-Z0-9]+)}

  # Small Bloom filter over strings. The bit array is stored in a String
  # (+bit_fields+), with +b+ usable bits per character.
  class SimpleBloomFilter
    CRC_TABLE = Zlib.crc_table

    attr_reader :n, :p, :m, :k, :b, :bit_fields

    # n: number of total elements (must be > 0)
    # p: false positive probability (must be strictly between 0 and 1)
    # b: number of bits per bit-field, 1..8 (use 7 bit to be ASCII compatible, i.e. JavaScript)
    # m: number of bits (estimate)
    # k: optimal number of hash (estimate), never less than 1
    #
    # Raises ArgumentError for out-of-range arguments, which would otherwise
    # surface later as ZeroDivisionError / RangeError while hashing or
    # storing bits.
    def initialize( n, p=0.01, b=8 )
      raise ArgumentError, "n must be greater than 0" unless n > 0
      raise ArgumentError, "p must be between 0 and 1 (exclusive)" unless p > 0 && p < 1
      raise ArgumentError, "b must be an integer between 1 and 8" unless b.is_a?(Integer) && b >= 1 && b <= 8

      @n, @p, @b = n, p, b
      @m = ( -(@n * Math.log(@p) / (Math.log(2)**2)) ).ceil
      # The estimate rounds down to 0 when m < n; with zero hash functions
      # include? would report every string as present, so use at least one.
      @k = [(0.7 * (@m/@n)).round, 1].max
      @bit_fields = "\0" * (@m/@b + 1)
    end

    # Records +string+ in the filter.
    def add( string )
      each_hashed_index(string) { |index| set_field(index) }
    end

    # Returns false when +string+ was definitely never added, true when it
    # probably was (false positives are possible, false negatives are not).
    def include?( string )
      each_hashed_index(string) { |index| return false unless get_field(index) }
      true
    end

    private

    # Yields the k bit positions (each in 0...m) derived from +string+.
    def each_hashed_index( string )
      @k.times do |i|
        # index = Zlib.crc32("#{string}-#{i}", i) % @m
        yield crc32("#{string}-#{i}", i) % @m
      end
    end

    # Turns on the bit at +bit_position+.
    def set_field( bit_position )
      slot = bit_position / @b
      mask = 1 << (bit_position % @b)
      @bit_fields[slot] = (@bit_fields[slot].ord | mask).chr
    end

    # Returns true when the bit at +bit_position+ is on.
    def get_field( bit_position )
      mask = 1 << (bit_position % @b)
      (@bit_fields[bit_position / @b].ord & mask) > 0
    end

    # 32-bit crc
    # use this instead of Zlib.crc32 to keep compatibility with client side (JavaScript) crc32
    # NOTE(review): the arithmetic below is intentionally left untouched; the
    # exact values must match whatever the client side computes.
    def crc32( string="", crc=0 )
      raise RangeError if crc > 2**128 - 1
      crc = crc ^ -1 # use -1 instead of 0xffffffff
      string.each_byte do |byte|
        index = (crc ^ byte) & 0xff
        crc = (crc >> 8) ^ CRC_TABLE[index]
      end
      crc ^ -1 # 0xffffffff use -1 instead of 0xffffffff
    end
  end

  attr_accessor :uri

  # url:      search endpoint that returns JSON with a 'results' list of tweets
  # pic_size: size segment used when building picture URLs (default :thumb)
  def initialize(url, pic_size=:thumb)
    @uri = URI.parse(url)
    @pic_size = pic_size
  end

  # Yields distinct picture URLs until +size+ of them have been yielded,
  # re-querying the endpoint every +wait+ seconds while more are needed.
  # Returns the number of URLs yielded. Without a block an Enumerator is
  # returned.
  #
  # Duplicates are detected with a Bloom filter, so a new URL may
  # occasionally be skipped as a false positive.
  def stream(size=1000, wait=5)
    return enum_for(:stream, size, wait) unless block_given?
    return 0 if size <= 0 # nothing requested: no filter, no network call

    bloom = SimpleBloomFilter.new(size*2, 0.005)
    got = 0
    loop do
      urls.each do |pic_url|
        next if bloom.include?(pic_url)
        bloom.add(pic_url)
        yield pic_url
        # Count only after yielding so the size-th picture is delivered too.
        got += 1
        return got if got >= size
      end
      sleep(wait)
    end
  end

  # Returns the picture URLs (at +size+) found in the current batch of tweets.
  def urls(size=@pic_size)
    tweets.map { |tweet| url(tweet, size) }.compact
  end

  # Fetches the current tweets from the endpoint; returns an empty list when
  # the response carries no 'results' entry.
  def tweets
    Yajl::HttpStream.get(@uri)['results'] || []
  end

  # Returns the picture URL (at +size+) for the first Twitpic link in the
  # tweet's 'text', or nil when the text is missing or has no such link.
  def url(tweet, size=@pic_size)
    matched = TWITPIC_LINK.match(tweet['text'].to_s)
    "http://twitpic.com/show/#{size}/#{matched[:twitpic_id]}" if matched
  end
end
# Script entry point: collect 1000 picture URLs (polling every 30 seconds)
# and print a bare HTML page with one <img> tag per picture.
if $0 == __FILE__
  search_url = "http://search.twitter.com/search.json?geocode=36.204824%2C138.252924%2C500.0mi&q=twitpic+near%3Ajapan+within%3A500mi&result_type=recent"
  image_tags = []
  Twitpic.new(search_url).stream(1000, 30) do |pic_url|
    image_tags << "<img src='#{pic_url}' />"
  end

  page = "<html>\n"
  page << '<title>' << 'Japan Twitpic' << "</title>\n"
  page << '<body>' << image_tags.join << '</body>'
  page << '</html>'
  puts page
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment