Skip to content

Instantly share code, notes, and snippets.

@newtonapple
Created March 11, 2011 12:39
Show Gist options
  • Save newtonapple/865841 to your computer and use it in GitHub Desktop.
Save newtonapple/865841 to your computer and use it in GitHub Desktop.
Generates an HTML page with the latest 1000 Twitpic images from Japan. Useful for getting a glimpse of the tsunami disaster in Japan. You'll need Ruby 1.9 and yajl-ruby to run this.
#!/usr/bin/env ruby
# example: ruby twitpic_extractor.rb > japan.html
require 'uri'
require 'yajl/http_stream'
require 'zlib'
# Polls a JSON search endpoint and extracts Twitpic image URLs from the
# text of the returned tweets.
#
# The endpoint response is expected to be a hash with a 'results' array,
# each element being a tweet hash with a 'text' entry.
class Twitpic
  # A small string-backed Bloom filter used to skip picture URLs that have
  # already been emitted.
  class SimpleBloomFilter
    # Lookup table used by the private #crc32 below.
    CRC_TABLE = Zlib.crc_table.freeze

    attr_reader :n, :p, :m, :k, :b, :bit_fields

    # n: number of total elements
    # p: false positive probability
    # b: number of bits per bit-field (use 7 bit to be ASCII compatible, i.e. JavaScript)
    # m: number of bits (estimate)
    # k: optimal number of hash (estimate)
    def initialize(n, p = 0.01, b = 8)
      @n = n
      @p = p
      @b = b
      @m = (-(@n * Math.log(@p) / (Math.log(2)**2))).ceil
      # NOTE(review): @m / @n is integer division; left unchanged so the
      # derived k (and therefore which bits get set) stays the same.
      @k = (0.7 * (@m / @n)).round
      # One character per bit-field, all bits initially cleared.
      @bit_fields = "\0" * (@m / @b + 1)
    end

    # Marks +string+ as present by setting each of its k hashed bits.
    def add(string)
      each_hashed_index(string) { |index| set_field(index) }
    end

    # Returns false if +string+ was definitely never added; true if it was
    # added or if all of its bits happen to be set (false positive).
    def include?(string)
      each_hashed_index(string) { |index| return false unless get_field(index) }
      true
    end

    private

    # Yields the k bit positions (each in 0...m) derived from +string+.
    # Every round hashes "string-i" with i also used as the initial crc.
    def each_hashed_index(string)
      @k.times do |i|
        yield crc32("#{string}-#{i}", i) % @m
      end
    end

    # Sets the bit at +bit_position+ inside its bit-field character.
    def set_field(bit_position)
      field = bit_position / @b
      mask = 1 << (bit_position % @b)
      @bit_fields[field] = (@bit_fields[field].ord | mask).chr
    end

    # Returns true when the bit at +bit_position+ is set.
    def get_field(bit_position)
      mask = 1 << (bit_position % @b)
      (@bit_fields[bit_position / @b].ord & mask) != 0
    end

    # 32-bit crc
    # use this instead of Zlib.crc32 to keep compatibility with client side (JavaScript) crc32
    #
    # The arithmetic (xor with -1, sign-extending >>) must stay exactly as
    # is: it intentionally produces values different from Zlib.crc32.
    # NOTE(review): the 2**128 bound looks generous for a 32-bit crc; kept
    # unchanged because callers here only pass small seeds.
    def crc32(string = "", crc = 0)
      raise RangeError if crc > 2**128 - 1

      crc ^= -1 # use -1 instead of 0xffffffff
      string.each_byte do |byte|
        crc = (crc >> 8) ^ CRC_TABLE[(crc ^ byte) & 0xff]
      end
      crc ^ -1 # use -1 instead of 0xffffffff
    end
  end

  # Matches a Twitpic link and captures the picture id.
  TWITPIC_LINK = %r{http://twitpic\.com/(?<twitpic_id>[a-zA-Z0-9]+)}

  attr_accessor :uri

  # url:      address of the JSON search endpoint to poll
  # pic_size: size segment placed in generated picture URLs (e.g. :thumb)
  def initialize(url, pic_size = :thumb)
    @uri = URI.parse(url)
    @pic_size = pic_size
  end

  # Yields previously unseen picture URLs until +size+ of them have been
  # yielded, re-fetching the endpoint every +wait+ seconds in between.
  #
  # Returns the number of URLs yielded. A non-positive +size+ yields
  # nothing and returns 0. Duplicates are detected with a Bloom filter
  # (false positive probability 0.005), so a new URL may occasionally be
  # skipped as if it had been seen.
  def stream(size = 1000, wait = 5)
    return 0 if size <= 0

    bloom = SimpleBloomFilter.new(size * 2, 0.005)
    got = 0
    while got < size
      urls.each do |pic_url|
        next if bloom.include?(pic_url)

        bloom.add(pic_url)
        yield pic_url
        # Count only after yielding so the final picture is not dropped.
        got += 1
        return got if got >= size
      end
      sleep(wait)
    end
    got
  end

  # Returns the picture URLs (at the given +size+) found in the current
  # batch of tweets, in tweet order; tweets without a link are skipped.
  def urls(size = @pic_size)
    tweets.map { |tweet| url(tweet, size) }.compact
  end

  # Fetches the endpoint and returns its 'results' entry, or an empty
  # array when the response has none.
  def tweets
    Yajl::HttpStream.get(@uri)['results'] || []
  end

  # Returns the picture URL for the first Twitpic link in the tweet's
  # text, or nil when the text is missing or contains no such link.
  def url(tweet, size = @pic_size)
    matched = TWITPIC_LINK.match(tweet['text'].to_s)
    "http://twitpic.com/show/#{size}/#{matched[:twitpic_id]}" if matched
  end
end
# Script entry point: collect picture URLs from tweets near Japan and print
# a minimal HTML page with one <img> tag per picture to stdout.
if $0 == __FILE__
  search_url = "http://search.twitter.com/search.json?geocode=36.204824%2C138.252924%2C500.0mi&q=twitpic+near%3Ajapan+within%3A500mi&result_type=recent"
  extractor = Twitpic.new(search_url)

  # Ask for 1000 pictures, polling every 30 seconds.
  image_tags = []
  extractor.stream(1000, 30) do |pic_url|
    image_tags.push("<img src='#{pic_url}' />")
  end

  page = ["<html>\n", '<title>', 'Japan Twitpic', "</title>\n", '<body>']
  page.concat(image_tags)
  page << '</body>' << '</html>'
  puts page.join
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment