Created
September 12, 2015 18:35
-
-
Save boxmein/48e8091f29868337f6cd to your computer and use it in GitHub Desktop.
Quick script to scrape all the users of http://powdertoy.co.uk. Note: the admin of that site is handing out IP bans left and right!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'net/http'
require 'thread'

# IDs whose request failed are queued here so they can be retried later.
# This is a global because `getjson` is a top-level method and cannot see
# the top-level local variables defined below.
$fail_queue = Queue.new

# Total number of accounts to scrape.
accounts = 144_903

# Number of worker threads, which is also the number of piles of work.
N = 4

# Size of one pile. This is an integer division, so N * accounts_split can
# fall short of `accounts` by up to N - 1 IDs.
accounts_split = accounts / N

# List of all the worker threads.
asdf = []
# Optional resume point, read as one line from standard input:
#
#   "id id id ..."   one absolute user ID per thread (N IDs in total); each
#                    thread resumes from its ID instead of the start of its pile
#   blank line / EOF no skipping, every thread starts at the beginning of its pile
#
# `qs[i]` ends up holding the offset of thread i's resume point relative to the
# start of its pile (i * accounts_split), so 0 means "start of the pile".
a = gets
if a.nil? || a.strip.empty?
  puts "...not skipping"
  qs = Array.new(N, 0)
else
  tokens = a.split
  # Validate before doing arithmetic: String#to_i silently turns garbage into 0,
  # which would otherwise be accepted as a resume point.
  well_formed = tokens.length == N && tokens.all? { |tok| tok =~ /\A\d+\z/ }
  abort "did not match #{N} ints!" unless well_formed
  qs = tokens.each_with_index.map { |tok, i| tok.to_i - accounts_split * i }
  puts "skipping #{qs}"
end
# Fetch the User.json document for one user ID and append it to the data file.
#
# On success, prints the requested URL and writes one line "<id>:<json>" to +fh+.
# On any StandardError (network failure, write failure, ...), pushes +id+ onto
# $fail_queue so it can be retried later, and prints the error.
#
# fh - object responding to #puts (the open "users" file)
# id - user ID to request
#
# Returns nil.
def getjson(fh, id)
  url = "http://powdertoy.co.uk/User.json?ID=#{id}"
  json = Net::HTTP.get(URI(url))
  puts url
  fh.puts "#{id}:#{json}"
rescue StandardError => e
  # Deliberately not `rescue Exception`: Interrupt (Ctrl-C) and SystemExit must
  # propagate instead of being recorded as a failed ID.
  $fail_queue.push id
  puts e.inspect
end
# Open our "users" data file in append mode.
# Each line looks like <id>:<json>
# where <id> is an int and <json> is the User.json response for that user ID.
File.open("users", "a") do |fh|
  # NOTE(review): every thread writes through the same file handle; confirm that
  # whole-line writes cannot interleave on the Ruby version you run this with.
  N.times do |i|
    asdf << Thread.new do
      start = i * accounts_split + qs[i]
      # The last thread also takes the accounts % N IDs that the integer
      # division in accounts_split leaves over.
      stop = i == N - 1 ? accounts : (i + 1) * accounts_split
      count = [stop - start, 0].max
      puts "started #{i}th thread, skipping #{qs[i]}, counting from #{start} #{count} times"
      (start...stop).each { |id| getjson fh, id }
      puts "stopped #{i}th thread"
    end
  end

  # Join the threads first: the fail queue is only complete once every worker
  # has finished, and the file must stay open until then.
  asdf.each(&:join)

  # Retry each failed ID exactly once. The queue is drained into an array first,
  # because getjson pushes an ID back onto $fail_queue when it fails again;
  # draining while retrying could loop forever on a permanently failing ID.
  retries = []
  retries << $fail_queue.pop(true) until $fail_queue.empty?
  retries.each { |id| getjson fh, id }

  warn "#{$fail_queue.size} ids still failing after one retry" unless $fail_queue.empty?
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment