Created
February 14, 2012 06:05
-
-
Save jharjono/1824149 to your computer and use it in GitHub Desktop.
tumblr-scraper.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Quick hack - really hacky, untested, and probably breaks a lot | |
require 'rubygems' | |
require 'mechanize' | |
module TumblrScraper | |
class TumblrPost
  attr_accessor :url, :post_id, :tumblr_id

  # Wraps a single scraped Tumblr post and extracts its identifiers.
  #
  # @param [Nokogiri::XML::Element] post - a DOM element representing a Tumblr div.post;
  #   its first <a> child is expected to carry the permalink, shaped like
  #   http://<tumblr_id>.tumblr.com/post/<post_id>/<slug>
  def initialize(post)
    permalink = post.search("./a").first.attr("href")
    segments = permalink.split("/")
    @url = permalink
    # segments: ["http:", "", "<host>", "post", "<id>", ...]
    @post_id = segments[4].to_i
    @tumblr_id = segments[2].split(".").first
    puts "processing post_id #{@post_id} #{@url}"
  end
end
# A post that is expected to contain an image. Parsing failures are tolerated:
# @img_url stays nil and #download becomes a no-op.
class TumblrImagePost < TumblrPost
  # @param [Nokogiri::XML::Element] post - a DOM element representing a Tumblr div.post
  def initialize(post)
    @img_url = nil
    begin
      super(post)
      # The first <img> in the post is taken to be the photo itself.
      @img_url = post.search("img").first.attr('src')
    rescue NoMethodError
      # Posts lacking the expected <a>/<img> structure surface as a
      # NoMethodError on nil (e.g. `.first.attr` when search found nothing).
      # Rescue only that, so genuine bugs are no longer silently swallowed
      # (the original bare `rescue => e` masked every StandardError).
      puts "Not an image file!"
    end
  end

  # Saves the image as <tumblr_id>_<post_id>.jpg under destination_dir.
  # Does nothing when the post had no parseable image URL.
  #
  # @param [String] destination_dir - directory to write the file into
  def download(destination_dir = Dir.pwd)
    return if @img_url.nil?

    out_fname = File.join(destination_dir, "#{@tumblr_id}_#{@post_id}.jpg")
    # NOTE(review): if this is ever re-enabled, shell-escape both values first
    # to avoid command injection from a hostile URL, e.g.:
    # %x[wget #{Shellwords.escape(@img_url)} -O #{Shellwords.escape(out_fname)}]
    puts "Downloaded #{@img_url} as #{out_fname}."
  end
end
# Scraper for Tumblr images
class ImageScraper
  # @param [String] tumblr_id - the blog's subdomain (<tumblr_id>.tumblr.com)
  def initialize(tumblr_id)
    @url = "http://#{tumblr_id}.tumblr.com"
    @agent = Mechanize.new
  end

  # Walks the blog's archive pages from newest to oldest, downloading every
  # image post newer than +limit+.
  #
  # @param [Integer] limit - post ID at which scraping stops; post IDs for a
  #   tumblr user monotonically increase with time (the original comment
  #   called this parameter `until`, but it is `limit`)
  # @param [String] download_dir - directory images are saved into
  def scrape(limit, download_dir = Dir.pwd)
    limit_reached = false
    page_num = 1
    until limit_reached
      page = @agent.get(@url + "/page/#{page_num}/")
      posts = page.search(".post").map { |p| TumblrImagePost.new(p) }
      posts.each do |post|
        # BUGFIX: posts that failed to parse have post_id == nil, and
        # `nil <= limit` raises NoMethodError, killing the whole scrape.
        next if post.post_id.nil?

        if post.post_id <= limit
          limit_reached = true
          puts "Limit reached at post #{post.post_id} <= limit #{limit}. Aborting scraper..."
          break
        else
          post.download(download_dir)
        end
      end
      if posts.empty?
        # no more pages left
        limit_reached = true
        puts "Reached end of archive. Aborting scraper..."
        break
      end
      # all image posts in this page downloaded, going backwards in history
      page_num += 1
    end
  end
end
end | |
# Run the scraper when this file is executed directly as a script.
if $PROGRAM_NAME == __FILE__
  TumblrScraper::ImageScraper.new("tumblr-id-here").scrape(1)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment