Created
May 27, 2011 21:36
-
-
Save luelista/996234 to your computer and use it in GitHub Desktop.
Wikipedia trivia: if you take any article, click on the first link in the article text not in parentheses or italics, and then repeat, you will eventually end up at "Philosophy".
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Switch; | |
use LWP::Simple qw($ua); | |
# changing User Agent string because Wikipedia.org blocks default LWP::Simple User Agent | |
$ua->agent("WikiBot/0.1");

# The article to start from is the first command-line argument. Without one
# there is nothing to follow, so report the usage instead of exiting silently.
my $startWord = $ARGV[0];
die "Usage: $0 <article name>\n" unless defined $startWord && length $startWord;
# argument: name of a wikipedia article, result: name of another wikipedia article | |
# Fetch the wikitext of a Wikipedia article through Special:Export and return
# the target of the first link in its body text.
#
# Argument: name of a Wikipedia article.
# Returns:  name of the linked article, or 0 (after printing an error
#           message) when the page could not be fetched, has no <text>
#           element, or contains no usable link.
sub getFirstLink {
    my $pageName = shift;

    # '&', '+' and '#' would otherwise end or alter the "pages" query value.
    (my $title = $pageName) =~ s/([&+#])/sprintf('%%%02X', ord($1))/ge;
    my $url = "http://en.wikipedia.org/w/index.php?title=Special:Export&pages=$title&offset=1&limit=1&action=submit";

    # get() hands back undef on any failure, so check before matching.
    my $str = LWP::Simple::get($url);
    my $link;
    if (defined $str && $str =~ m/<text[^>]*>(.*)<\/text>/s) {
        $link = _findFirstLink($1);
    }
    return $link if defined $link;

    print "ERROR on '$pageName': unable to find any links on this page, or the keyword does not exist!\n\n";
    return 0;
}

# Scan wikitext for the first [[link]] that is not nested inside parentheses,
# curly braces (templates, tables) or another link (image captions), and is
# not a File:/Image: link. Returns the link target, or undef if there is none.
sub _findFirstLink {
    my $content = shift;
    my %entity = (lt => '<', gt => '>', quot => '"', apos => "'", amp => '&');
    my ($braces, $brackets, $parens) = (0, 0, 0);

    while ($content =~ m/(\[\[|\]\]|\(|\)|\{|\})/g) {
        my $token = $1;

        # Closers never push a counter below zero, so a single stray ')' or
        # '}' cannot hide every link that follows it.
        if    ($token eq '(')  { $parens++; }
        elsif ($token eq ')')  { $parens-- if $parens > 0; }
        elsif ($token eq '{')  { $braces++; }
        elsif ($token eq '}')  { $braces-- if $braces > 0; }
        elsif ($token eq ']]') { $brackets-- if $brackets > 0; }
        else {
            # $token is '[[' and pos() sits right behind it.
            my $pos = pos($content);
            if ($parens == 0 && $braces == 0 && $brackets == 0
                && substr($content, $pos, 5) ne "File:"
                && substr($content, $pos, 6) ne "Image:")
            {
                # \G anchors the match to this very link; /c keeps pos() on
                # failure so the outer scan is never rewound to the start.
                my $target;
                if ($content =~ m/\G([^\]\|]+)(?:\|[^\]\|]+)?\]\]/gc) {
                    $target = $1;
                }
                pos($content) = $pos;    # outer scan resumes right after '[['

                if (defined $target) {
                    # The export is XML, so the wikitext is entity-escaped.
                    $target =~ s/&(lt|gt|quot|apos|amp);/$entity{$1}/g;
                    # A section anchor is not part of the article title.
                    $target =~ s/#.*//s;
                    return $target if length $target;
                }
            }
            $brackets++;
        }
    }
    return;
}
# Follow the chain of first links starting at $startWord, printing every
# article visited, until no further link is found or an article repeats.
my $myWord = $startWord;
my @words = ();    # visited articles, in visiting order
my %seen;          # the same titles as keys, for direct loop detection
while ($myWord) {
    # remember the word for loop detection, then show it
    push @words, $myWord;
    $seen{$myWord} = 1;
    print "--> $myWord\n";

    $myWord = getFirstLink($myWord);

    # A false or undefined result means the chain ended; stop before it is
    # compared against the visited titles.
    last unless $myWord;

    if ($seen{$myWord}) {
        # print the repeated word once more and stop
        print "--> $myWord\n";
        print "##### loop detected!";
        push @words, $myWord;
        last;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment