Last active
August 29, 2015 14:17
-
-
Save tgittos/4ab30250bfc682664c2f to your computer and use it in GitHub Desktop.
Use Ruby and Watir-Webdriver to scrape pins from a Pinterest page.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# frozen_string_literal: true

# Scrapes pin images from a Pinterest page using a Watir-driven browser.
#
# Usage: <script> URL DOWNLOAD_DIR
#
# The script logs in, walks every pin currently rendered on the page,
# downloads each pin's image into DOWNLOAD_DIR, then scrolls to the bottom to
# make the page load more pins. It stops once scrolling no longer increases
# the page height. The hrefs of downloaded pins are persisted in
# DOWNLOAD_DIR/pindb so later runs can skip them.

require 'watir-webdriver'
require 'openssl'
require 'open-uri'
require 'nokogiri'

# Pins are examined in pages of this many when looking for new work.
PIN_PAGE_SIZE = 25
# Seconds to wait after scrolling before measuring the page height again.
SCROLL_WAIT_SECONDS = 5
# File (inside the download dir) holding the marshalled list of visited hrefs.
PINDB_FILENAME = 'pindb'

# Fetches the body of an http(s) URL.
#
# Certificate verification is disabled for these requests only ("bad
# pinterest, having a bad cert") instead of redefining
# OpenSSL::SSL::VERIFY_PEER for the whole process.
# NOTE(review): disabling verification is insecure; drop the option once the
# certificate problem no longer applies.
#
# Going through URI#read (rather than Kernel#open) means a scraped value can
# never be interpreted as a local path or a "|command" pipe.
#
# @param url [String] absolute URL to fetch
# @return [String] the response body
def fetch(url)
  URI.parse(url).read(ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE)
end

# Loads the list of already-downloaded pin hrefs.
#
# NOTE(review): Marshal.load must only be used on trusted data; pindb is
# expected to be a file previously written by this script.
#
# @param dir [String] download directory
# @return [Array] visited hrefs, empty when no pindb exists yet
def load_visited_pins(dir)
  puts "reading pindb"
  path = File.join(dir, PINDB_FILENAME)
  File.exist?(path) ? Marshal.load(File.binread(path)) : []
end

# Persists the list of downloaded pin hrefs (binary mode, as Marshal output
# is not text).
#
# @param dir [String] download directory
# @param visited_pins [Array] hrefs to store
def save_visited_pins(dir, visited_pins)
  puts "saving pindb"
  File.binwrite(File.join(dir, PINDB_FILENAME), Marshal.dump(visited_pins))
end

# Opens the login form, submits the credentials and waits for the
# logged-in page (identified by the profile image) to appear.
#
# @param browser [Watir::Browser]
# @param username [String]
# @param password [String]
def log_in(browser, username, password)
  browser.a(class: "loginButton").click
  browser.form(class: "loginForm").wait_until_present
  browser.text_field(name: "username_or_email").set username
  browser.text_field(name: "password").set password
  browser.button(class: "primary").click
  browser.div(class: "profileImage").wait_until_present
end

# Returns the current scroll height of the page body. The script must
# `return` the value, otherwise execute_script yields nil.
#
# @param browser [Watir::Browser]
def page_height(browser)
  browser.execute_script("return document.body.scrollHeight;")
end

# Href of a pin, taken from the first link inside its holder.
#
# @param links [#first] the links found inside one pinHolder div
def pin_href(links)
  links.first.attribute_value("href")
end

# Finds where to resume in pin_links. Walks forward one page at a time for as
# long as the last pin of the page has already been downloaded, and returns
# the index of the first pin of the first page that still needs work.
#
# @param pin_links [Array] per-pin link collections
# @param visited_pins [Array] hrefs already downloaded
# @return [Integer] start index, never greater than pin_links.length
def first_unfinished_index(pin_links, visited_pins)
  cursor = PIN_PAGE_SIZE
  while (page_tail = pin_links[cursor - 1]) && visited_pins.include?(pin_href(page_tail))
    puts "last available pin already downloaded, advancing pin cursor"
    cursor += PIN_PAGE_SIZE
  end
  cursor - PIN_PAGE_SIZE
end

# Downloads the image of a single pin into dir.
#
# @param href [String] URL of the pin page
# @param dir [String] download directory
# @raise [StandardError] when the page or image cannot be fetched or the
#   expected img tag is missing
def download_pin(href, dir)
  puts "fetching pin: #{href.inspect}"
  doc = Nokogiri::HTML(fetch(href.to_s))
  puts "searching for pin img tag"
  image = doc.css('.PaddedPin .Image img').first
  raise "no pin image found" if image.nil?

  image_url = image["src"]
  puts "downloading image #{image_url}"
  filename = image_url.split('/').last
  # Fetch before writing so a failed download does not leave an empty file.
  File.binwrite(File.join(dir, filename), fetch(image_url))
end

# Downloads every not-yet-visited pin currently rendered in the browser.
# A failure on one pin is reported and does not stop the others.
#
# @param browser [Watir::Browser]
# @param dir [String] download directory
# @param visited_pins [Array] hrefs already downloaded; appended to in place
def scrape_visible_pins(browser, dir, visited_pins)
  # NOTE(review): flatten is kept from the original; the rest of the code
  # treats each element as a collection of links (it calls #first on it).
  pin_links = browser.divs(class: 'pinHolder').map(&:links).flatten
  start = first_unfinished_index(pin_links, visited_pins)
  pin_links.drop(start).each do |links|
    href = pin_href(links)
    if visited_pins.include?(href)
      puts "skipping #{href}, already downloaded"
      next
    end
    begin
      download_pin(href, dir)
      visited_pins << href
    rescue => e
      puts "error downloading pin #{href}: #{e.message}"
    end
  end
end

# Scrapes pins, scrolling for more until the page height stops growing.
# Any error is reported, and the pindb is saved no matter how scraping ends.
#
# @param browser [Watir::Browser]
# @param dir [String] download directory
# @param visited_pins [Array] hrefs already downloaded; appended to in place
def scrape_pins(browser, dir, visited_pins)
  puts "starting to scrape pins"
  loop do
    current_height = page_height(browser)
    scrape_visible_pins(browser, dir, visited_pins)
    puts "scrolling to bottom"
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);\n")
    sleep SCROLL_WAIT_SECONDS
    new_height = page_height(browser)
    puts "new_height: #{new_height.inspect}, current_height: #{current_height.inspect}"
    # No growth after scrolling means there is nothing more to load.
    break unless new_height > current_height
  end
rescue => e
  puts "oops! #{e.message} \n #{e.backtrace.join("\n")}"
ensure
  save_visited_pins(dir, visited_pins)
end

# Usage errors go to stderr and exit with a non-zero status.
abort "expected url" if ARGV.length < 1
abort "expected download dir" if ARGV.length < 2

# Credentials may be supplied through the environment; the literals are the
# script's original placeholder values.
username = ENV.fetch('PINTEREST_USERNAME', '[email protected]')
password = ENV.fetch('PINTEREST_PASSWORD', 'password')
url = ARGV.first
dir = ARGV.last

visited_pins = load_visited_pins(dir)

browser = Watir::Browser.new
begin
  browser.goto url
  log_in(browser, username, password)
  scrape_pins(browser, dir, visited_pins)
ensure
  # Always release the browser, even when login or scraping fails.
  browser.close
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment