Skip to content

Instantly share code, notes, and snippets.

@boxmein
Last active August 29, 2015 14:03
Show Gist options
  • Select an option

  • Save boxmein/edf2e0f6ce1b13c9d80d to your computer and use it in GitHub Desktop.

Select an option

Save boxmein/edf2e0f6ce1b13c9d80d to your computer and use it in GitHub Desktop.
Finds out the Adolf number of Wikipedia pages by reverse-crawling through the "what links here?" pages.
# Work in Progress
# AdolfNumberer.rb
# ================
#
# I've got too much time on my hands.
# This is a script that visits the listing of "What Links Here?" of the Adolf Hitler Wikipedia page,
# assigns everything there an Adolf number of 1, then goes through each of those and their backlink
# pages, assigns every one of those an Adolf number of 2, on an epic and eternal search for anything
# past Adolf number 6.
# Inspired by Variation #5 of the Wiki game:
# http://en.wikipedia.org/wiki/Wikipedia:Wiki_Game#Variations
require 'net/http'
require 'json'
# Minimum number of seconds to wait between consecutive API requests
# (basic rate-limiting courtesy toward the Wikipedia API).
DELAY = 2 # seconds
# Toggles progress/debug output throughout the script.
VERBOSE = true
BOT_NAME = "AdolfNumberer"
VERSION = "2.1.4"
# Intended cap on the recursion depth of the crawl.
MAX_DEPTH = 8
# API url
BASE_URI = URI("http://en.wikipedia.org/w/api.php")
# base query, to clone and add stuff to later
BASE_QUERY = {
"format" => "json",
"action" => "query",
"list" => "backlinks"
}
# Identifies the bot to the API, per Wikipedia's User-Agent etiquette.
# (The contact address was redacted in this copy of the source.)
USER_AGENT = BOT_NAME+"/"+VERSION+" ([email protected])"
# Epoch-seconds timestamp of the last web request; primed one DELAY in
# the past so the very first request goes out without waiting.
$lastrequest = Time.now.to_i - DELAY
# Throttle helper: ensures at least DELAY seconds elapse between web
# requests. Sleeps only for the remaining portion of the window, then
# records the post-sleep timestamp in $lastrequest.
#
# Fixes two defects in the original: it always slept the full DELAY even
# when only a fraction of the window remained, and it stored the
# pre-sleep timestamp, so the throttle window drifted and back-to-back
# calls were only delayed every other time.
def request_delay
  timenow = Time.now.to_i
  remaining = $lastrequest + DELAY - timenow
  if remaining > 0
    puts "applying delay to network requests." if VERBOSE
    sleep remaining
  end
  # Record when this request actually proceeds (after any sleep).
  $lastrequest = Time.now.to_i
end
# Perform a rate-limited HTTP GET against +uri+ using the bot User-Agent.
# On a 2xx response, yields the body to a block (when given) and returns
# it; on any other response, yields nothing and returns nil.
def makeget uri
  request_delay
  uri = URI(uri) unless uri.is_a? URI
  request = Net::HTTP::Get.new uri
  request['User-Agent'] = USER_AGENT
  response = Net::HTTP.start(uri.hostname, uri.port) { |http| http.request(request) }
  # Anything other than a success response yields nothing and returns nil.
  return unless response.is_a? Net::HTTPSuccess
  yield response.body if block_given?
  response.body
end
# Fetch one continuation page of backlinks from the MediaWiki API.
#
# blcontinue - continuation token from the previous API response
# bltitle    - optional page title; the MediaWiki API expects the original
#              query parameters to be repeated on continuation requests,
#              so pass the title when available (defaults to nil so
#              existing one-argument callers keep working)
#
# Returns one of:
#   -2, error_hash            on an API error
#   -1, warnings_hash         on an API warning
#    1, backlinks, blcontinue when more pages remain
#    0, backlinks             when this was the last page
#   nil                       on network failure or unparsable JSON
def next_page_of_backlinks blcontinue, bltitle = nil
  myquery = BASE_QUERY.clone
  myquery['blcontinue'] = blcontinue
  myquery['bltitle'] = bltitle if bltitle
  myuri = BASE_URI.clone
  myuri.query = URI.encode_www_form myquery
  puts myuri.to_s if VERBOSE
  # NOTE: makeget already applies the request throttle, so the extra
  # request_delay call the original made here doubled the wait time.
  makeget(myuri) do |response|
    begin
      res = JSON.parse response
      # error! something went very wrong!
      return -2, res['error'] if res['error']
      # warning! something went almost wrong!
      return -1, res['warnings'] if res['warnings']
      # ok, but we have more pages to go, call me more.
      # Guard the nested lookups: a missing sub-key would otherwise raise
      # NoMethodError, which the JSON::ParserError rescue does not catch.
      if res['query-continue'] and res['query-continue']['backlinks']
        return 1, res['query']['backlinks'], res['query-continue']['backlinks']['blcontinue']
      end
      # ok, that's it, pack it up
      return 0, res['query']['backlinks'] if res['query'] and res['query']['backlinks']
    rescue JSON::ParserError => ex
      puts "error: received JSON was broken!"
      puts ex
      p response if VERBOSE
    end
  end
end
# get all backlinks of a page
# Collects every backlink of +bltitle+ by requesting the first page of the
# MediaWiki "backlinks" list, then following continuation tokens via
# next_page_of_backlinks. Returns an array of backlink hashes (keys seen
# in this code: 'ns', 'pageid', 'title'); returns [] when the initial
# request fails or its JSON cannot be parsed.
def get_all_backlinks_of_page bltitle
puts "getting all backlinks of the page #{bltitle}" if VERBOSE
# Build the first-page query: shared base parameters plus the page title.
myquery = BASE_QUERY.clone
myquery['bltitle'] = bltitle
myuri = BASE_URI.clone
myuri.query = URI.encode_www_form myquery
puts myuri.to_s if VERBOSE
# keep our backlinks here, return them later
backlinks = []
makeget (myuri) do |response|
puts "received a response!" if VERBOSE
begin
res = JSON.parse response
# we'll get some backlinks, and a blcontinue if applicable
blcontinue = nil
if res['query-continue'] and
res['query-continue']['backlinks'] and
res['query-continue']['backlinks']['blcontinue']
blcontinue = res['query-continue']['backlinks']['blcontinue']
end
# add found backlinks to the long list
# NOTE(review): this first page is NOT filtered by namespace, while the
# continuation pages below reject ns != 0 — looks inconsistent; confirm.
if res['query']['backlinks']
backlinks += res['query']['backlinks']
end
# Post-condition (do/while) pagination loop: runs at least once, even
# when blcontinue is nil (i.e. there was no continuation token at all).
# NOTE(review): only blcontinue is passed on — bltitle is dropped from
# continuation requests; the MediaWiki API expects the original query
# parameters to be repeated, so verify continuation actually works.
begin
response_code, *params = next_page_of_backlinks blcontinue
puts "got more responses with code #{response_code}" if VERBOSE
# p params if VERBOSE
# Dispatch on the status code returned by next_page_of_backlinks.
case response_code
when -2
# Hard API error: report it and stop paginating.
puts "error! something kind of went very wrong. check this out!"
p params[0]
break
when -1
# API warnings: report them and stop paginating.
puts "warning! something went wrong-ish. check this out!"
p params[0]
break
when 0
# Final page: take its backlinks and finish.
puts "reached the end of the list. we're done here." if VERBOSE
backlinks += params[0]
break
when 1
# Intermediate page: keep main-namespace (ns == 0) entries and loop
# again with the fresh continuation token.
puts "more to come, but some received" if VERBOSE
# filter out various special namespaces
backlinks += params[0].reject { |each| each['ns'] != 0 }
params[0].each { |each|
puts " #{each['ns']}-#{each['pageid']} #{each['title']}" if VERBOSE
}
blcontinue = params[1]
end
end while true
rescue JSON::ParserError => ex
puts "error: received JSON was broken!"
puts ex
puts response.inspect if VERBOSE
end
end
return backlinks
end
# Page Title => number
# Maps each page title to the Adolf number it was first assigned; also
# serves as the visited set that keeps the crawl from revisiting pages.
$adolf_numbers = {}
# CSV output stream: one "ns,pageid,\"title\",adolf_number" row per page,
# flushed periodically so partial results survive a crash.
$outfile = open "adolfdata.csv", "w"
# Recursively crawl backlinks starting at +article+, assigning every newly
# seen page the Adolf number +adolf_level+ and appending a CSV row
# ("ns,pageid,\"title\",level") to $outfile.
#
# article     - page title to expand
# adolf_level - Adolf number to assign to this page's new backlinks
#
# Fixes two defects: MAX_DEPTH was defined but never enforced (unbounded
# recursion), and titles containing '"' produced malformed CSV rows.
def process article, adolf_level=1
  # Enforce the depth cap; without this the recursion never bottoms out.
  return if adolf_level > MAX_DEPTH
  # get all backlinks, and then reject ones already in the adolf level list
  # (no loops, sucka)
  backlinks = get_all_backlinks_of_page(article).reject { |each|
    $adolf_numbers.include? each['title']
  }
  # add the non-rejected ones into the adolf level list, and present pretty CSV
  backlinks.each { |each|
    $adolf_numbers[each['title']] ||= adolf_level
    # Double any embedded quotes so titles containing '"' stay valid CSV.
    title = each['title'].gsub('"', '""')
    $outfile.puts "#{each['ns']},#{each['pageid']},\"#{title}\",#{adolf_level}"
  }
  $outfile.flush
  # and now recurse
  backlinks.each { |each|
    process each['title'], adolf_level + 1
  }
end
# Entry point: seed the crawl at the Adolf Hitler article (depth 1).
# The ensure clause guarantees the CSV file is flushed and closed even if
# the crawl raises or is interrupted mid-run.
begin
process "Adolf_Hitler"
ensure
$outfile.flush
$outfile.close
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment