Skip to content

Instantly share code, notes, and snippets.

@tgittos
Last active August 29, 2015 14:17
Show Gist options
  • Save tgittos/4ab30250bfc682664c2f to your computer and use it in GitHub Desktop.
Use Ruby and Watir-Webdriver to scrape pins from a Pinterest page.
#! /usr/bin/env ruby
require 'watir-webdriver'
require 'openssl'
require 'open-uri'
require 'nokogiri'
# Pinterest's certificate was failing verification, so the VERIFY_PEER
# constant is redefined to carry the VERIFY_NONE value.
# NOTE(review): this is process-wide -- every piece of code that reads
# OpenSSL::SSL::VERIFY_PEER after this line gets VERIFY_NONE, and Ruby prints
# an "already initialized constant" warning. Prefer a per-request
# :ssl_verify_mode option if the certificate problem still exists.
OpenSSL::SSL.const_set(:VERIFY_PEER, OpenSSL::SSL::VERIFY_NONE)
# Validate the command line: the script needs a page URL followed by a
# download directory. `abort` writes the message to stderr and exits with a
# non-zero status, so shells and calling scripts can tell that the run failed
# (a bare `exit` reported success).
if ARGV.empty?
  abort "expected url"
elsif ARGV.length < 2
  abort "expected download dir"
end
# Login credentials. They can be supplied through the PINTEREST_USERNAME and
# PINTEREST_PASSWORD environment variables so real credentials do not have to
# be written into this file; the original placeholder values remain the
# defaults when the variables are not set.
username = ENV.fetch('PINTEREST_USERNAME', '[email protected]')
password = ENV.fetch('PINTEREST_PASSWORD', 'password')
# First argument: URL of the page to scrape. Last argument: directory that
# receives the downloaded images and the pindb file.
url = ARGV.first
dir = ARGV.last
# Load the pindb file from the download directory: a Marshal dump of the pin
# URLs that were already downloaded. A missing file means nothing has been
# downloaded yet, so we start with an empty list.
# NOTE(review): Marshal.load trusts the file contents completely; only use it
# on pindb files written by this script.
puts "reading pindb"
pindb_path = dir + '/pindb'
visited_pins = File.exist?(pindb_path) ? Marshal.load(File.read(pindb_path)) : []
# Open the target page in a Watir-driven browser and sign in with the
# configured credentials.
browser = Watir::Browser.new
browser.goto url
# login
# The original called `exists?` on each element and threw the result away,
# which guarded nothing. Waiting for each element before using it matches the
# waits already used for the form and the profile image.
login_link = browser.a :class => "loginButton"
login_link.wait_until_present
login_link.click
browser.form(:class => "loginForm").wait_until_present
username_field = browser.text_field(:name => "username_or_email")
username_field.wait_until_present
username_field.set username
password_field = browser.text_field(:name => "password")
password_field.wait_until_present
password_field.set password
browser.button(:class => "primary").click
# wait for page to load
browser.div(:class => "profileImage").wait_until_present
# begin scraping
#
# Each pass downloads the image of every not-yet-visited pin currently on the
# page, scrolls to the bottom so the page can load more pins, and repeats
# until scrolling no longer makes the page taller. Whatever happens, the list
# of downloaded pins is written back to the pindb file and the browser is
# closed.
begin
  puts "starting to scrape pins"
  # The cursor below skips over already-downloaded pins in steps of this size.
  page_size = 25
  loop do
    # The script needs an explicit `return` for execute_script to hand a value
    # back, and the height lives on document.body (same property the scroll
    # call below uses).
    current_height = browser.execute_script("return document.body.scrollHeight;")
    # One link collection per pin holder; the first link of each one is used
    # as the pin's URL.
    pin_links = browser.divs(:class => 'pinHolder').collect(&:links).flatten
    # If the last pin of a 25-pin page was already downloaded, treat the whole
    # page as done and move the cursor past it.
    pin_cursor = page_size
    while !pin_links[pin_cursor - 1].nil? &&
          visited_pins.include?(pin_links[pin_cursor - 1].first.attribute_value("href"))
      puts "last available pin already downloaded, advancing pin cursor"
      pin_cursor += page_size
    end
    # correct the pin_cursor: step back to the start of the first page whose
    # last pin is missing or not yet downloaded
    pin_cursor -= page_size
    pin_links[pin_cursor..-1].each do |links|
      href = links.first.attribute_value "href"
      if visited_pins.include?(href)
        puts "skipping #{href}, already downloaded"
        next
      end
      begin
        puts "fetching pin: #{href.inspect}"
        # URI#open (from open-uri) only handles real URLs, unlike Kernel#open,
        # which would also open local files or pipes named by scraped text.
        # The block form closes the response handle when parsing is done.
        doc = URI.parse(href.to_s).open { |page| Nokogiri::HTML(page) }
        puts "searching for pin img tag"
        image = doc.css('.PaddedPin .Image img').first
        raise "no pin image found on page" if image.nil?
        image_url = image["src"]
        puts "downloading image #{image_url}"
        filename = image_url.split('/').last
        # Download before creating the file so a failed request does not leave
        # an empty image behind.
        image_data = URI.parse(image_url).open { |remote| remote.read }
        File.open(dir + '/' + filename, 'wb') do |file|
          file << image_data
        end
        visited_pins << href
      rescue => e
        # A single broken pin must not stop the run; report it and move on.
        puts "error downloading pin #{href}: #{e.message}"
      end
    end
    puts "scrolling to bottom"
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);\n")
    # Give the page time to load the next batch of pins.
    sleep 5
    new_height = browser.execute_script("return document.body.scrollHeight;")
    puts "new_height: #{new_height.inspect}, current_height: #{current_height.inspect}"
    # Stop once scrolling no longer grows the page (or the height is unknown).
    break if current_height.nil? || new_height.nil? || new_height <= current_height
  end
rescue => e
  puts "oops! #{e.message} \n #{e.backtrace.join("\n")}"
ensure
  puts "saving pindb"
  serialized_array = Marshal.dump(visited_pins)
  # NOTE(review): Marshal output is binary. The file is still written in text
  # mode so it stays readable by the text-mode File.read that loads pindb;
  # switch both sides to binary mode together if Windows support matters.
  File.open(dir + "/pindb", 'w') { |f| f.write(serialized_array) }
  # Release the browser window and its driver session.
  browser.close
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment