Created
July 2, 2011 19:59
-
-
Save afgomez/1061589 to your computer and use it in GitHub Desktop.
Measures distance of any wikipedia article to Philosophy. Based on http://xkcd.org/903/ alt text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# WikiStance. A Wikipedia distance meter. | |
# Based on the alt text of http://xkcd.org/903/ | |
# Takes a Wikipedia URL and measures the distance from that page to the Philosophy article by repeatedly
# clicking the first link that is neither inside parentheses nor in italics.
# | |
# Author:: Alejandro Fernández (mailto:[email protected]) | |
# Copyright:: Copyright (c) 2011 Alejandro Fernández | |
# License:: GPL | |
# | |
# =Usage= | |
# | |
# require 'wikistance' | |
# | |
# url = 'http://en.wikipedia.org/wiki/Scrubs_%28TV_series%29' | |
# ws = WikiStance.new(url) | |
# ws.trace # Go through all the pages until we reach Philosophy | |
# ws.distance # => 22 | |
# ws.breadcrumbs # => ["List of characters on Scrubs", "NBC", "United States", ..., "Philosophy"] | |
require 'rubygems' | |
require 'mechanize' | |
# Measures how many "first link" clicks separate an English Wikipedia article
# from the Philosophy article (based on the alt text of http://xkcd.org/903/).
#
# The start page is fetched on construction; #trace then follows the first
# link of each article that is neither inside parentheses nor in italics,
# recording every visited title in #breadcrumbs.
class WikiStance
  # Start URLs must be English Wikipedia article pages (http or https).
  # \A is used instead of ^ so a URL hidden on a later line is not accepted.
  ARTICLE_URL = %r{\Ahttps?://en\.wikipedia\.org/wiki/}

  # Title of the article at which tracing stops.
  TARGET_TITLE = 'Philosophy'

  # title       - title of the start page
  # breadcrumbs - titles visited so far, start page first
  attr_reader :title, :breadcrumbs

  # url - String URL of the article to start from.
  #
  # Raises ArgumentError when url is not an English Wikipedia article URL.
  # Fetches the start page, so it needs network access.
  def initialize(url)
    unless ARTICLE_URL =~ url.to_s
      raise ArgumentError, "You should use a valid wikipedia link"
    end

    @url = url
    # Wikipedia returns 403 with the default user agent
    @agent = Mechanize.new
    @agent.user_agent_alias = 'Mac Safari'
    reset
  end

  # Re-fetches the start page and forgets any trace done so far.
  # Always returns true.
  def reset
    @page = @agent.get(@url)
    @title = page_title
    @breadcrumbs = [@title]
    true
  end

  # Title of the page currently loaded in @page.
  def page_title
    @page.at('#firstHeading').text
  end

  # Follows first links until the Philosophy article is reached, appending
  # each visited title to #breadcrumbs. Returns true.
  #
  # Raises RuntimeError when a page is visited twice (the walk would loop
  # forever) or when a page offers no link to follow.
  def trace
    until page_title == TARGET_TITLE
      click_first_link
      current = page_title
      # Avoid entering in an infinite loop
      if @breadcrumbs.include?(current)
        raise "We are repeating ourselves! We already visited \"#{current}\""
      end
      @breadcrumbs << current
    end
    true
  end

  # Number of clicks from the start page to the last page reached.
  def distance
    # Breadcrumbs hold the initial page. If we start in philosophy the distance should be 0
    @breadcrumbs.length - 1
  end

  private

  # Loads into @page the target of the first eligible link of the article.
  def click_first_link
    href = nil
    # div#bodyContent is where wikipedia shows article's content.
    # Only <p> elements that are direct children are inspected: this skips
    # TOCs and the disambiguation notices, which live in <div>s.
    @page.search('#bodyContent > p').each do |paragraph|
      href = first_link_href(paragraph.to_html)
      break if href
    end
    raise "Oops! seems that \"#{page_title}\" has no links" if href.nil?

    # Exact match on the whole href. Interpolating it into a Regexp would
    # treat "(", ")", "." etc. as metacharacters and could also hit an
    # earlier link whose href merely contains this one.
    link = @page.links_with(:href => href).first
    raise "Oops! could not follow \"#{href}\" on \"#{page_title}\"" if link.nil?
    @page = link.click
  end

  # Returns the href of the first link in the HTML fragment that is not
  # inside parentheses, not inside <i>...</i> and not a same-page #anchor,
  # or nil when there is none.
  def first_link_href(html)
    candidates = strip_parenthesized(html).gsub(%r{<i>.*?</i>}m, '')
    found = candidates.match(/<a\b[^>]*?\shref="([^#"][^"]*)"/)
    # Serialized HTML escapes "&" inside attributes; undo that so the value
    # compares equal to the parsed link's href.
    found && found[1].gsub('&amp;', '&')
  end

  # Removes parenthesized text, and any tags inside it, from an HTML fragment.
  # Parentheses that appear inside a tag (e.g. in an href such as
  # "/wiki/Scrubs_(TV_series)") are left alone. Nested parentheses are
  # handled; a stray ")" is kept, while an unclosed "(" drops the remainder
  # of the fragment.
  def strip_parenthesized(html)
    depth = 0
    kept = []
    html.scan(/<[^>]*>|[^<]+/) do |token|
      if token[0, 1] == '<'
        kept << token if depth.zero?
        next
      end
      token.each_char do |char|
        if char == '('
          depth += 1
        elsif char == ')' && depth > 0
          depth -= 1
        elsif depth.zero?
          kept << char
        end
      end
    end
    kept.join
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'wikistance' | |
# Specs for WikiStance.
# NOTE(review): WikiStance.new fetches its URL, so these examples appear to
# need network access, and the expected titles/distances presumably depend on
# the current content of Wikipedia -- confirm before relying on exact numbers.
valid_url = 'http://en.wikipedia.org/wiki/List_of_characters_on_Scrubs'
philosophy_url = 'http://en.wikipedia.org/wiki/Philosophy'
modern_philosophy_url = 'http://en.wikipedia.org/wiki/Modern_philosophy'

describe WikiStance do
  it "should raise error with invalid URL" do
    lambda { WikiStance.new('wadus') }.should raise_error(ArgumentError)
  end

  it "should create a valid object with a valid URL" do
    wiki = WikiStance.new(valid_url)
    wiki.should be_instance_of WikiStance
  end

  it "should fetch the title of the page" do
    wiki = WikiStance.new(valid_url)
    wiki.title.should == "List of characters on Scrubs"
  end

  it "should have no distance from Philosophy" do
    wiki = WikiStance.new(philosophy_url)
    wiki.trace
    wiki.distance.should == 0
  end

  it "should have one of distance from Modern Philosophy" do
    wiki = WikiStance.new(modern_philosophy_url)
    wiki.trace
    wiki.distance.should == 1
  end

  it "should have trails for Modern Philosophy" do
    wiki = WikiStance.new(modern_philosophy_url)
    wiki.trace
    wiki.breadcrumbs.should == ['Modern philosophy', 'Philosophy']
  end

  # Greek_language comes back to itself because of links between parens
  it "should not repeat pages" do
    wiki = WikiStance.new('http://en.wikipedia.org/wiki/Greek_language')
    # No error class is given on purpose: a negative expectation naming a
    # class would also pass when some other kind of error is raised.
    lambda { wiki.trace }.should_not raise_error
  end

  # Psychologist first link is an anchor to the same page
  it "should avoid #anchor links" do
    wiki = WikiStance.new('http://en.wikipedia.org/wiki/Psychologist')
    wiki.trace.should be_true
    wiki.distance.should == 12
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment