Skip to content

Instantly share code, notes, and snippets.

@afgomez
Created July 2, 2011 19:59
Show Gist options
  • Save afgomez/1061589 to your computer and use it in GitHub Desktop.
Save afgomez/1061589 to your computer and use it in GitHub Desktop.
Measures distance of any wikipedia article to Philosophy. Based on http://xkcd.org/903/ alt text
# WikiStance. A Wikipedia distance meter.
# Based on the alt text of http://xkcd.org/903/
# Takes a Wikipedia URL and measures the distance from that page to the Philosophy article by repeatedly clicking
# the first link that is neither inside parentheses nor in italics.
#
# Author:: Alejandro Fernández (mailto:[email protected])
# Copyright:: Copyright (c) 2011 Alejandro Fernández
# License:: GPL
#
# =Usage=
#
# require 'wikistance'
#
# url = 'http://en.wikipedia.org/wiki/Scrubs_%28TV_series%29'
# ws = WikiStance.new(url)
# ws.trace # Go through all the pages until we reach Philosophy
# ws.distance # => 22
# ws.breadcrumbs # => ["List of characters on Scrubs", "NBC", "United States", ..., "Philosophy"]
require 'rubygems'
require 'mechanize'
# Measures how many "first link" clicks separate an English Wikipedia article
# from the Philosophy article.
#
# The starting page is fetched on construction; #trace then follows the first
# eligible link of each page until the Philosophy article is reached.
class WikiStance
  # http or https URL of an en.wikipedia.org article with a non-empty name.
  # \A and \z anchor the whole string, so multi-line input cannot pass.
  ARTICLE_URL = %r{\Ahttps?://en\.wikipedia\.org/wiki/.+\z}

  # div#bodyContent is where Wikipedia shows the article's content. The
  # starting text is a direct child of it, which avoids <p> inside TOCs and
  # the disambiguation notes (those live in <div> instead of <p>).
  # NOTE(review): newer Wikipedia markup may nest paragraphs deeper than this
  # selector reaches - confirm against a live page.
  LEAD_PARAGRAPHS = '#bodyContent > p'

  # Title of the article where the trace stops.
  TARGET_TITLE = 'Philosophy'

  attr_reader :title, :breadcrumbs

  # url:: String with the address of an English Wikipedia article.
  #
  # Raises ArgumentError when +url+ is not an en.wikipedia.org article URL.
  # Fetches the starting page right away (see #reset).
  def initialize(url)
    unless url.to_s =~ ARTICLE_URL
      raise ArgumentError, "You should use a valid wikipedia link"
    end

    @url = url
    # Wikipedia returns 403 with the default user agent
    @agent = Mechanize.new
    @agent.user_agent_alias = 'Mac Safari'
    reset
  end

  # Fetches the starting page again and restarts the breadcrumbs from its
  # title. Returns true.
  def reset
    @page = @agent.get(@url)
    @title = page_title
    @breadcrumbs = [@title]
    true
  end

  # Returns the title of the page currently loaded in @page.
  def page_title
    @page.at('#firstHeading').text
  end

  # Follows first links until the Philosophy article is reached, recording
  # every visited title in #breadcrumbs. Returns true.
  #
  # Raises RuntimeError when a page is visited twice or a page has no
  # eligible link.
  def trace
    until page_title == TARGET_TITLE
      click_first_link
      title = page_title
      # Avoid entering in an infinite loop
      if @breadcrumbs.include?(title)
        raise "We are repeating ourselves! We already visited \"#{title}\""
      end
      @breadcrumbs << title
    end
    true
  end

  # Number of clicks performed so far. Breadcrumbs hold the initial page, so
  # starting on Philosophy gives a distance of 0.
  def distance
    @breadcrumbs.length - 1
  end

  private

  # Replaces @page with the target of the first eligible link found in the
  # lead paragraphs. Raises RuntimeError when there is none.
  def click_first_link
    href = nil
    @page.search(LEAD_PARAGRAPHS).each do |paragraph|
      href = first_link_href(paragraph)
      break if href
    end

    # Look the link up by its exact href: building a Regexp from the href
    # breaks on characters such as "(" or "+" and may match another link.
    link = href && @page.link_with(:href => href)
    raise "Oops! seems that \"#{page_title}\" has no links" if link.nil?

    @page = link.click
  end

  # Walks the paragraph's nodes in document order and returns the href of the
  # first link that is outside parentheses and outside <i>, or nil.
  # Parentheses are counted in text nodes only, so attribute values such as
  # href="/wiki/Foo_(bar)" are left alone and nested parentheses are handled.
  # The depth starts at zero for every paragraph.
  def first_link_href(paragraph)
    depth = 0
    pending = paragraph.children.to_a
    until pending.empty?
      node = pending.shift
      if node.text?
        depth = paren_depth_after(node.text, depth)
      elsif node.element? && node.name != 'i'
        return node['href'] if depth.zero? && followable_link?(node)
        # Visit the children before the remaining siblings (depth-first).
        pending = node.children.to_a + pending
      end
    end
    nil
  end

  # Returns the parenthesis nesting depth after reading +text+, starting from
  # +depth+. A stray ")" never pushes the depth below zero.
  def paren_depth_after(text, depth)
    text.each_char do |char|
      if char == '('
        depth += 1
      elsif char == ')' && depth > 0
        depth -= 1
      end
    end
    depth
  end

  # True for <a> elements whose href is present and is not a same-page
  # #anchor.
  def followable_link?(node)
    return false unless node.name == 'a'

    href = node['href'].to_s
    !href.empty? && !href.start_with?('#')
  end
end
# Specs for WikiStance.
# Every example builds a WikiStance from a real en.wikipedia.org URL, so the
# expected titles and distances depend on the content of those articles.
require 'wikistance'

valid_url = 'http://en.wikipedia.org/wiki/List_of_characters_on_Scrubs'
philosophy_url = 'http://en.wikipedia.org/wiki/Philosophy'
modern_philosophy_url = 'http://en.wikipedia.org/wiki/Modern_philosophy'

describe WikiStance do
  it "should raise error with invalid URL" do
    lambda { WikiStance.new('wadus') }.should raise_error(ArgumentError)
  end

  it "should create a valid object with a valid URL" do
    wiki = WikiStance.new(valid_url)
    wiki.should be_instance_of WikiStance
  end

  it "should fetch the title of the page" do
    wiki = WikiStance.new(valid_url)
    wiki.title.should == "List of characters on Scrubs"
  end

  it "should have no distance from Philosophy" do
    wiki = WikiStance.new(philosophy_url)
    wiki.trace
    wiki.distance.should == 0
  end

  it "should have one of distance from Modern Philosophy" do
    wiki = WikiStance.new(modern_philosophy_url)
    wiki.trace
    wiki.distance.should == 1
  end

  it "should have trails for Modern Philosophy" do
    wiki = WikiStance.new(modern_philosophy_url)
    wiki.trace
    wiki.breadcrumbs.should == ['Modern philosophy', 'Philosophy']
  end

  # Greek_language comes back to itself because of links between parens
  it "should not repeat pages" do
    wiki = WikiStance.new('http://en.wikipedia.org/wiki/Greek_language')
    # The negative matcher takes no error class: newer RSpec versions reject
    # `should_not raise_error(SomeClass)`.
    lambda { wiki.trace }.should_not raise_error
  end

  # Psychologist first link is an anchor to the same page
  it "should avoid #anchor links" do
    wiki = WikiStance.new('http://en.wikipedia.org/wiki/Psychologist')
    # trace returns literal true; comparing against it avoids the be_true
    # matcher, which newer RSpec versions no longer provide.
    wiki.trace.should == true
    wiki.distance.should == 12
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment