Last active
August 29, 2015 14:03
-
-
Save boxmein/edf2e0f6ce1b13c9d80d to your computer and use it in GitHub Desktop.
Finds out the Adolf number of Wikipedia pages by reverse-crawling through the "what links here?" pages.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Work in Progress | |
| # AdolfNumberer.rb | |
| # ================ | |
| # | |
| # I've got too much time on my hands. | |
| # This is a script that visits the listing of "What Links Here?" of the Adolf Hitler Wikipedia page, | |
| # assigns everything there an Adolf number of 1, then goes through each of those and their backlink | |
| # pages, assigns every one of those an Adolf number of 2, on an epic and eternal search for anything | |
| # past Adolf number 6. | |
| # Inspired by Variation #5 of the Wiki game: | |
| # http://en.wikipedia.org/wiki/Wikipedia:Wiki_Game#Variations | |
require 'net/http'
require 'json'

# Crawl tunables.
DELAY     = 2 # seconds between HTTP requests
VERBOSE   = true
BOT_NAME  = "AdolfNumberer"
VERSION   = "2.1.4"
MAX_DEPTH = 8

# MediaWiki API endpoint.
BASE_URI = URI("http://en.wikipedia.org/w/api.php")

# Base query parameters; cloned and extended for each request.
BASE_QUERY = {
  "format" => "json",
  "action" => "query",
  "list"   => "backlinks"
}

# Identify the bot to Wikipedia, per its User-Agent policy.
USER_AGENT = "#{BOT_NAME}/#{VERSION} ([email protected])"

# Epoch second of the last HTTP request; primed so the very first
# request goes through without a delay.
$lastrequest = Time.now.to_i - DELAY
| # sleep if web request can't be processed right now | |
# Throttle network requests so at least DELAY seconds pass between two
# consecutive requests.
#
# Fixes in this revision:
# * sleeps only for the REMAINING part of the interval instead of a full
#   extra DELAY;
# * records the timestamp after the sleep, i.e. when the request is
#   actually allowed through (the original stored the pre-sleep time,
#   which could cause a superfluous delay on the next call).
def request_delay
  now = Time.now.to_i
  remaining = $lastrequest + DELAY - now
  if remaining > 0
    puts "applying delay to network requests." if VERBOSE
    sleep remaining
  end
  $lastrequest = Time.now.to_i
end
# Perform a throttled HTTP GET against +uri+ (String or URI), sending the
# bot's User-Agent header.
#
# On a 2xx response: yields the body to a given block, then returns the body.
# On any other response: returns nil (no exception raised).
def makeget uri
  request_delay
  target = uri.is_a?(URI) ? uri : URI(uri)
  request = Net::HTTP::Get.new(target)
  request['User-Agent'] = USER_AGENT
  response = Net::HTTP.start(target.hostname, target.port) { |http| http.request(request) }
  return nil unless response.is_a? Net::HTTPSuccess
  yield response.body if block_given?
  response.body
end
| # next page of backlinks | |
# Fetch one continuation page of backlinks using the API's +blcontinue+
# token from the previous response.
#
# Returns a status tuple:
#   -2, error_hash               - the API reported an error
#   -1, warnings_hash            - the API reported warnings
#    1, backlinks, next_token    - a page of backlinks with more to come
#    0, backlinks                - the final page of backlinks
# Falls through (returning makeget's value, or nil) on broken JSON or a
# response with no usable 'query' section.
#
# Fixes in this revision:
# * removed the extra request_delay call — makeget already applies the
#   throttle, so every continuation request slept twice;
# * guarded the 'query' lookup so a response without it no longer raises
#   NoMethodError on nil.
def next_page_of_backlinks blcontinue
  myquery = BASE_QUERY.clone
  myquery['blcontinue'] = blcontinue
  myuri = BASE_URI.clone
  myuri.query = URI.encode_www_form myquery
  puts myuri.to_s if VERBOSE
  makeget(myuri) do |response|
    begin
      res = JSON.parse response
      # error! something went very wrong!
      return -2, res['error'] if res['error']
      # warning! something went almost wrong!
      return -1, res['warnings'] if res['warnings']
      # ok, but we have more pages to go, call me more
      if res['query-continue']
        return 1, res['query']['backlinks'],
               res['query-continue']['backlinks']['blcontinue']
      end
      # ok, that's it, pack it up
      return 0, res['query']['backlinks'] if res['query'] and res['query']['backlinks']
    rescue JSON::ParserError => ex
      puts "error: received JSON was broken!"
      puts ex
      p response if VERBOSE
    end
  end
end
| # get all backlinks of a page | |
# Collect every backlink of the page titled +bltitle+, following the API's
# continuation tokens until the listing is exhausted.
# Only main-namespace links (ns == 0) are kept.
# Returns an array of backlink hashes ('ns', 'pageid', 'title').
#
# Fixes in this revision:
# * when the first page has no continuation token, the loop is skipped
#   entirely — the original still called next_page_of_backlinks(nil),
#   issuing a spurious request with an empty blcontinue;
# * the ns == 0 filter is applied consistently (the original filtered
#   continuation pages but not the first or the final page);
# * a nil status from next_page_of_backlinks (broken JSON) breaks out of
#   the loop — the original matched no `when` clause and looped forever.
def get_all_backlinks_of_page bltitle
  puts "getting all backlinks of the page #{bltitle}" if VERBOSE
  myquery = BASE_QUERY.clone
  myquery['bltitle'] = bltitle
  myuri = BASE_URI.clone
  myuri.query = URI.encode_www_form myquery
  puts myuri.to_s if VERBOSE
  # keep our backlinks here, return them later
  backlinks = []
  makeget(myuri) do |response|
    puts "received a response!" if VERBOSE
    begin
      res = JSON.parse response
      # we'll get some backlinks, and a blcontinue if applicable
      blcontinue = nil
      if res['query-continue'] and
         res['query-continue']['backlinks'] and
         res['query-continue']['backlinks']['blcontinue']
        blcontinue = res['query-continue']['backlinks']['blcontinue']
      end
      # add found backlinks (main namespace only) to the long list
      if res['query'] and res['query']['backlinks']
        backlinks += res['query']['backlinks'].reject { |each| each['ns'] != 0 }
      end
      # follow continuation tokens until the API says we're done
      while blcontinue
        response_code, *params = next_page_of_backlinks blcontinue
        puts "got more responses with code #{response_code}" if VERBOSE
        case response_code
        when -2
          puts "error! something kind of went very wrong. check this out!"
          p params[0]
          break
        when -1
          puts "warning! something went wrong-ish. check this out!"
          p params[0]
          break
        when 0
          puts "reached the end of the list. we're done here." if VERBOSE
          backlinks += params[0].reject { |each| each['ns'] != 0 }
          break
        when 1
          puts "more to come, but some received" if VERBOSE
          # filter out various special namespaces
          backlinks += params[0].reject { |each| each['ns'] != 0 }
          params[0].each { |each|
            puts "  #{each['ns']}-#{each['pageid']} #{each['title']}" if VERBOSE
          }
          blcontinue = params[1]
        else
          # next_page_of_backlinks gave us nothing usable — stop here
          break
        end
      end
    rescue JSON::ParserError => ex
      puts "error: received JSON was broken!"
      puts ex
      puts response.inspect if VERBOSE
    end
  end
  return backlinks
end
| # Page Title => number | |
| $adolf_numbers = {} | |
| $outfile = open "adolfdata.csv", "w" | |
| # get all backlinks, reject the ones already in the hash (none!) | |
# Assign +adolf_level+ to every not-yet-numbered backlink of +article+,
# write each one to the CSV output, then recurse one level deeper
# (depth-first).
#
# Fixes in this revision:
# * recursion is now bounded by MAX_DEPTH — the constant was defined but
#   never used, leaving the search unbounded;
# * double quotes inside page titles are doubled (RFC 4180) so the CSV
#   stays parseable.
def process article, adolf_level=1
  return if adolf_level > MAX_DEPTH
  # get all backlinks, and then reject ones already in the adolf level list
  # (no loops, sucka)
  backlinks = get_all_backlinks_of_page(article).reject { |each|
    $adolf_numbers.include? each['title']
  }
  # add the non-rejected ones into the adolf level list, and present pretty CSV
  backlinks.each { |each|
    $adolf_numbers[each['title']] ||= adolf_level
    title = each['title'].to_s.gsub('"', '""')
    $outfile.puts "#{each['ns']},#{each['pageid']},\"#{title}\",#{adolf_level}"
  }
  $outfile.flush
  # and now recurse
  backlinks.each { |each|
    process each['title'], adolf_level + 1
  }
end
# Entry point: start the crawl from the Adolf Hitler article itself
# (everything one link away receives Adolf number 1).
# The ensure clause guarantees the CSV file is flushed and closed even if
# the crawl raises or is interrupted.
begin
  process "Adolf_Hitler"
ensure
  $outfile.flush
  $outfile.close
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment