Created
March 18, 2011 08:29
-
-
Save boctor/875774 to your computer and use it in GitHub Desktop.
This Ruby script uses Hpricot to scrape the Freshly Pressed pages on WordPress.com. It then stores the results as JSON on S3. Here is the related blog post to this Gist: http://idevrecipes.com/?p=260
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby -rubygems | |
require 'open-uri' | |
require 'hpricot' | |
require 'aws/s3' | |
require 'yaml' | |
require 'json' | |
# Stores +data+ on Amazon S3 under +key+.
#
# The settings are read from "amazon_s3.yml" in the same directory as this
# script on every call. The file is expected to provide :access_key_id and
# :secret_access_key, plus an entry under +bucket+ holding the bucket name.
#
# key     - object key to store the data under
# data    - the content to upload
# bucket  - key in the settings file whose value is the bucket name
# options - passed through unchanged to AWS::S3::S3Object.store
#
# Returns whatever AWS::S3::S3Object.store returns.
def save_in_s3(key, data, bucket, options)
  settings_path = File.join(File.dirname(__FILE__), "amazon_s3.yml")
  amazon_s3_settings = YAML.load(File.read(settings_path))

  AWS::S3::Base.establish_connection!(
    :access_key_id     => amazon_s3_settings[:access_key_id],
    :secret_access_key => amazon_s3_settings[:secret_access_key]
  )

  bucket_name = amazon_s3_settings[bucket]
  AWS::S3::S3Object.store(key, data, bucket_name, options)
end
# Parses an inline CSS +style+ attribute value into a Hash mapping each
# property name to its value (both stripped of surrounding whitespace).
# Blank or malformed declarations (no ':' separator, e.g. whitespace after a
# trailing ';') are ignored. When a property appears more than once, the last
# occurrence wins. A nil +style+ yields an empty Hash.
def parse_inline_style(style)
  properties = {}
  style.to_s.split(';').each do |declaration|
    name, value = declaration.split(':', 2).collect { |part| part.strip }
    next if name.nil? || name.empty? || value.nil?
    properties[name] = value
  end
  properties
end

# Fills in pick[:image] (and pick[:y_offset], when a vertical
# background-position is present) from the inline style of a ".picture"
# element. The 223px-wide image URL is rewritten to request a 320px-wide
# version, and the vertical offset is scaled by +scale_increase+ to match.
#
# Returns true when a background image URL was found, false otherwise
# (in which case +pick+ is left untouched).
def add_background_image(pick, picture, scale_increase)
  style_hash = parse_inline_style(picture['style'])
  url_match = style_hash["background-image"].to_s.match(/url\('([^']+)'\)/)
  return false unless url_match

  pick[:image] = url_match[1].gsub('w=223', 'w=320')
  # background-position is "<x> <y>"; only the vertical component is used.
  vertical = style_hash["background-position"].to_s.split[1]
  pick[:y_offset] = vertical.sub('px', '').to_f * scale_increase if vertical
  true
end

# Pages are numbered 1..num_pages; one JSON document is stored per page.
num_pages = 10
scale_increase = 1.43497757847534 # 320.0/223.0

(1..num_pages).each do |page|
  picks = []
  # URI#read (from open-uri) is used instead of Kernel#open, which no longer
  # opens URLs on Ruby 3.0 and later.
  page_html = URI.parse("http://wordpress.com/?load=editorpicks&fp=#{page}").read
  doc = Hpricot(page_html)

  doc.search('.pick').each do |pick_element|
    # Sponsored posts are being skipped until I can figure out a way to get a 320px wide version
    # of the images used for Sponsored posts
    next if pick_element.inner_html.include? 'Sponsored Post'

    link = pick_element.at('a')
    title = pick_element.at('.posttitle')
    unless link && title
      # A pick without a link or title is unusable; skip it rather than
      # aborting the whole run with a NoMethodError.
      warn "Skipping pick without link or title on page #{page}"
      next
    end

    pick = {}
    pick[:url] = link['href']
    pick[:title] = title.inner_text
    subtitle = pick_element.at('small')
    pick[:subtitle] = subtitle.inner_text if subtitle

    picture = pick_element.at('.picture')
    unless picture && add_background_image(pick, picture, scale_increase)
      # No usable background image: fall back to a plain <img> if there is one.
      img = pick_element.at('img')
      pick[:image] = img['src'] if img
    end

    picks << pick
  end

  # Every page except the last one links to its successor.
  picks << {:next_page => page + 1} unless page == num_pages
  save_in_s3("freshlypressed/#{page}.json", picks.to_json, :wordpress_bucket, {:access => 'public-read'})
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment