Created
May 12, 2011 00:42
-
-
Save o-sam-o/967704 to your computer and use it in GitHub Desktop.
Forrst Scraper (Old) : http://forrst.com/posts/Who_to_follow_on_Forrst-gIb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems'
require 'fileutils'
require 'nokogiri'
require 'sqlite3'
require 'highline/import'
# Log in to Forrst, unless a cookie jar (the f-cookie file) from an earlier run
# is already present.
unless File.exist?('f-cookie')
  email = ask("Email Address or Username?")
  password = ask("Password?") { |q| q.echo = 'x' }

  # Logging in needs two things: a session, which curl keeps in the f-cookie
  # cookie jar, and the hidden form_key field of the login form, a per-form
  # token of the kind used for CSRF protection.
  login_form = IO.popen(['curl', '--cookie-jar', 'f-cookie', 'http://forrst.com/login'], &:read)
  login_doc = Nokogiri::HTML(login_form)
  form_key_input = login_doc.at_css('input[name=form_key]')
  raise 'Could not find the form_key field on the login page' unless form_key_input
  form_key = form_key_input['value']

  p "Logging in with:"
  p "Email: #{email} Password: #{'*' * password.length} Form Key: #{form_key}"

  # curl is started from an argv array, so no shell ever parses the
  # credentials, and --data-urlencode escapes each value. A password containing
  # &, =, ", $ or a backtick therefore cannot corrupt the form body or be
  # interpreted as shell syntax. _frosty is already percent-encoded, so it is
  # passed through --data unchanged.
  auth_command = [
    'curl',
    '--data-urlencode', "email=#{email}",
    '--data-urlencode', "password=#{password}",
    '--data-urlencode', "form_key=#{form_key}",
    '--data', '_frosty=%E2%98%83',
    '--cookie', 'f-cookie',
    '--cookie-jar', 'f-cookie',
    '-i',
    'http://forrst.com/auth'
  ]
  print IO.popen(auth_command, &:read)
end
# Set up the SQLite DB used to store forrsters. Every run starts from an empty
# database file: rm_f deletes any previous one in-process (no shell, no separate
# existence check) and does nothing when the file is absent.
FileUtils.rm_f('forrsters.db')
db = SQLite3::Database.new "forrsters.db"
db.execute <<-SQL
  create table forrsters (
    username varchar(256),
    type varchar(256),
    avatar_url varchar(1024),
    likes int,
    comments int,
    posts int
  );
SQL
# Walk the "recent people" listing one page at a time, inserting a row for every
# .person entry found, and stop after the first page that yields no entries.
page_number = 1
found_a_forrster = true
while found_a_forrster
  found_a_forrster = false
  p "Getting page #{page_number}"
  # curl is started from an argv array, so the ? in the URL is never seen (or
  # glob-expanded) by a shell. -i keeps the response headers in the output,
  # which is then handed to Nokogiri together with the HTML body.
  page_url = "http://forrst.com/people/list/recent?page=#{page_number}"
  forrsters_page = IO.popen(['curl', '--cookie', 'f-cookie', '-i', page_url], &:read)
  forrsters_doc = Nokogiri::HTML(forrsters_page)

  # Time to do some screen scraping
  forrsters_doc.css('.person').each do |person_div|
    avatar_img = person_div.at_css('a img')
    name_link = person_div.css('a')[1]
    type_node = person_div.at_css('p')
    links = person_div.css('.details a')

    # Check the nodes themselves before dereferencing them, so that a change in
    # the page markup fails with this message instead of a NoMethodError on nil.
    raise 'Missing stuff' unless avatar_img && avatar_img['src'] && name_link && type_node && links.length >= 3

    avatar = avatar_img['src']
    username = name_link.text
    type = type_node.text
    # The first three .details links carry the like, comment and post counts, in
    # that order; strip everything but the digits from each link's text.
    likes, comments, posts = links.first(3).map { |link| link.text.gsub(/\D/, '').to_i }

    p "Found user #{username} who is a #{type} with #{likes} likes, #{comments} comments and #{posts} posts"
    db.execute "insert into forrsters (username, type, avatar_url, likes, comments, posts) values ( ?, ?, ?, ?, ?, ? )", [username, type, avatar, likes, comments, posts]
    found_a_forrster = true
  end

  # Make sure we don't get banned for DOSing
  p "Sleeping ..."
  sleep 3
  page_number += 1
end
p "Finished"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment