Created
March 19, 2011 05:59
-
-
Save cincodenada/877266 to your computer and use it in GitHub Desktop.
This is the script I used when writing my blog post about the XKCD effect (http://commacommacrash.com/2009/12/what-i-learned-from-xkcd-effect.html). A base for all my WWW::Mechanize scripts.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use LWP::Simple; | |
use WWW::Mechanize; | |
use HTTP::Cookies; | |
use Term::ReadKey; | |
use POSIX qw(strftime mktime); | |
# ---- Configuration ----

# Site probed for the connectivity check, and the name printed if it fails.
my $interneturl = 'http://www.google.com';
my $internet    = 'Google';

# xkcd locations. $comicurl is a sprintf template taking the comic number;
# $archiveurl is the path appended to $baseurl to reach the archive page.
my $baseurl    = 'http://www.xkcd.com';
my $comicurl   = "$baseurl/%d/";
my $archiveurl = 'archive';

# OhNoRobot transcription page: sprintf template taking the comic number.
# Each "%%2F" comes out of sprintf as a literal "%2F".
my $transcriptionurl = 'http://www.ohnorobot.com/transcribe.pl?comicid=apKHvCCc66NMg&url=http:%%2F%%2Fxkcd.com%%2F%d%%2F';

# External programs run (via system) on the downloaded image / the comic URL.
my $imageviewer = 'gnome-open';
my $urlviewer   = 'xdg-open';

# Base name for the downloaded image; the image's own extension is appended.
# When empty, the image keeps the file name from its URL.
my $tempfile = 'temp';

# Google Trends sprintf templates: (phrase, date) for the first two,
# (phrase, date, scale) for the CSV export.
my $trendsurl    = 'http://www.google.com/trends?q=%s&date=%s&geo=all';
my $hottrendsurl = 'http://www.google.com/trends/hottrends?q=%s&date=%s';
my $csvurl       = 'http://www.google.com/trends/viz?q=%s&date=%s&geo=all&graph=all_csv&scale=%d';

# Values substituted for "scale" in $csvurl; one CSV is fetched per value.
my @scalenums = (0 .. 1);
# Verbosity level: one less than the length of the second command-line
# argument (presumably a flag such as "-v" / "-vv"; its content is not checked).
$numvs = length($ARGV[1]) - 1;

print "Intializing...\n" if $numvs > 0;

#Make sure our urls are correct

# General connectivity: a failure here only produces a warning.
print "Checking internet connection...\n" if $numvs > 0;
head($interneturl)
    or print "WARNING: Either you're not connected to the internet, or $internet is down. Probably the former. Proceeding...\n";

# The two URL templates are probed with comic #1; a failure is fatal.
print "Checking OhNoRobot...\n" if $numvs > 0;
head(sprintf($transcriptionurl, 1))
    or die "OhNoRobot config is incorrect! Correct \$transcriptionurl template.";

print "Checking xkcd...\n" if $numvs > 0;
head(sprintf($comicurl, 1))
    or die "xkcd config is incorrect! Correct \$comicurl or \$baseurl template.";

# The archive page is kept in $archivepage for the date/title lookups later on.
print "Downloading archive page...\n" if $numvs > 0;
$archivepage = get("$baseurl/$archiveurl");
if (!$archivepage) {
    print "WARNING: Couldn't get archive page! Dates and titles will be unavailable.\n";
}
# Agent used for the Google Trends requests, with a cookie jar attached so a
# login can be reused between runs.
$trendsbot = WWW::Mechanize->new();
$cookiejar = HTTP::Cookies->new();
$trendsbot->cookie_jar($cookiejar);

if (-e 'cookies.dat') {
    #Reuse the cookies saved by an earlier run
    $cookiejar->load('cookies.dat');
}
else {
    #No saved cookies: ask for credentials and log in
    print "Google login?";
    chomp($login = <STDIN>);

    print "Google password?";
    ReadMode(2);    # Term::ReadKey mode 2 -- presumably "noecho", so the password isn't shown
    chomp($pass = <STDIN>);
    ReadMode(0);    # back to the original terminal mode
    print "\n";

    #Log in to Google Trends
    $trendsbot->get('http://www.google.com/accounts/ServiceLogin?service=trends');
    $trendsbot->form_id('gaia_loginform');
    $trendsbot->field('Email',  $login);
    $trendsbot->field('Passwd', $pass);
    $trendsbot->click();

    #Save cookies for next time
    $cookiejar->save('cookies.dat');
}
#Grab the latest comic number from the "Permanent link" line of the front page.
#$latestnum is the default for the start/end prompts, so say so when it can't
#be determined instead of silently leaving it undefined.
$latestcomic = get($baseurl);
if (defined $latestcomic
    && $latestcomic =~ m{Permanent link to this comic: http://xkcd\.com/(\d+)/}) {
    $latestnum = $1;
}
else {
    print "WARNING: Couldn't determine the latest comic number! Enter comic numbers explicitly.\n";
}
# Ask for the range of comics to walk through.
# With "-r" as the first argument the walk runs backwards (latest comic down
# to #1 by default); otherwise forwards (#1 up to the latest comic).
# An empty answer (or "0") selects the default shown in brackets.
my ($start, $end);    # parentheses needed: "my $start, $end" only declares $start

my $reverse = (defined $ARGV[0] && $ARGV[0] eq '-r');
my ($defstart, $defend) = $reverse ? ($latestnum, 1) : (1, $latestnum);

print "Starting comic [$defstart]?";
$start = <STDIN>;
chomp($start);
unless ($start) { $start = $defstart; }

print "Ending comic [$defend]?";
$end = <STDIN>;
chomp($end);
unless ($end) { $end = $defend; }

# Step applied to the comic number on each pass of the main loop.
$inc = $reverse ? -1 : 1;
#Initialize the log file if it doesn't exist
unless (-e 'xkcdeffect.csv') {
    # One "Pubdate" column per (day offset, scale) pair, for the five days
    # centred on the publication date, in the order the main loop records them.
    $pubstr = '';
    for my $offset (-2 .. 2) {
        for my $scale (@scalenums) {
            $pubstr .= sprintf("Pubdate%+d (scale=%d),", $offset, $scale);
        }
    }

    # Three-arg open with a lexical handle; the close is checked because
    # buffered write errors only show up there.
    open(my $log, '>', 'xkcdeffect.csv')
        or die "Couldn't open the log file! Something is very wrong.\n";
    print $log "Comic Number,Comic Date,Comic Title,Trends URL,Keyphrase," . $pubstr . "Hotness Rating\n";
    close($log)
        or die "Couldn't write the log file header: $!\n";
}
print "$start to $end\n" if $numvs > 0;

# Wrap a value in double quotes for the CSV log, doubling any embedded quotes,
# so commas or quotes inside a title or phrase can't shift the columns.
sub csv_quote {
    my ($field) = @_;
    $field = '' unless defined $field;
    $field =~ s/"/""/g;
    return '"' . $field . '"';
}

# Download the Trends CSV export for one month at every configured scale.
# Arguments: the URL-ready phrase and the date string for the $csvurl template.
# Returns a list of page contents indexed by scale number.
sub fetch_month_csvs {
    my ($urlphrase, $date) = @_;
    my @csv;
    foreach my $scale (@scalenums) {
        my $url = sprintf($csvurl, $urlphrase, $date, $scale);
        print "Getting data from " . $url . "...\n" if $numvs > 0;
        $trendsbot->get($url);
        $csv[$scale] = $trendsbot->content();
    }
    return @csv;
}

#Everything looks fine, plow ahead...
# The ordered comparison (rather than "!=") makes the loop end immediately if
# the range runs against the chosen direction, instead of never terminating.
for (my $comicnum = $start;
     ($inc > 0 ? $comicnum <= $end : $comicnum >= $end);
     $comicnum += $inc) {

    $transpage     = sprintf($transcriptionurl, $comicnum);
    $transcription = get($transpage);

    #Look up this comic's date and title in the archive page
    if (defined $archivepage
        && $archivepage =~ m{<a href="/$comicnum/" title="([^"]+)">([^<]+)</a>}) {
        ($comicdate, $comictitle) = ($1, $2);
    }
    else {
        $comicdate  = "<No Date Found>";
        $comictitle = "<No Title Found>";
    }

    # Split the date; $havedate guards against reusing the previous comic's
    # $year/$month/$day when this comic has no parsable date.
    my $havedate = 0;
    if ($comicdate =~ /(\d+)-(\d+)-(\d+)/) {
        ($year, $month, $day) = ($1, $2, $3);
        $gtyear = $year - 1900;             #strftime takes years since 1900
        $month  = sprintf("%02d", $month);  #Zero-pad the month
        $havedate = 1;
    }

    print(("-" x 20) . "\n");
    if ($havedate) {
        print "Comic #$comicnum: $comictitle (" . strftime('%a',0,0,0,$day,$month-1,$gtyear) . " $comicdate)\n";
    }
    else {
        print "Comic #$comicnum: $comictitle ($comicdate)\n";
    }

    # Extract the transcript: either the rendered HTML paragraph or, failing
    # that, the raw contents of the edit <textarea>. Which one matched is
    # remembered in a variable, because the substitutions below reset the
    # capture variables.
    my ($text, $fromtextarea);
    if (defined $transcription) {
        if ($transcription =~ m{Here's the transcription for this comic!<p>(.*?)</p>}s) {
            ($text, $fromtextarea) = ($1, 0);
        }
        elsif ($transcription =~ m{<textarea[^>]*>([^<]+)</textarea>}) {
            ($text, $fromtextarea) = ($1, 1);
        }
    }

    if (defined $text) {
        $text =~ s/\[\[.*?\]\]//gs;         #Eliminate descriptions
        unless ($fromtextarea) {
            $text =~ s/[\r\n\t]//g;         #Eliminate HTML Whitespace
            $text =~ s/(?:<br>)+/\n/g;      #Insert newlines for <br>s
        }
        print "$text\n";
        print "Do you want to view the comic image [y/N/q]?";
        $doimage = <STDIN>;
        chomp($doimage);
        unless ($doimage) { $doimage = 'N'; }
    }
    else {
        print "Could not download text" . ($numvs > 0 ? " (tried $transpage)" : '') . "!\nDo you want to view the comic image [Y/n/q]?";
        $doimage = <STDIN>;
        chomp($doimage);
        unless ($doimage) { $doimage = 'Y'; }
    }
    print "\n";

    if (uc($doimage) eq 'Q') { last; }

    #Download the image and hand it to the image viewer
    $viewerr = 0;
    if (uc(substr($doimage,0,1)) eq 'Y') {
        $comicpage = get(sprintf($comicurl, $comicnum));
        if (defined $comicpage
            && $comicpage =~ m{<br/>[\r\n]+<br/>[\r\n]+<img src="([^"]+)" title="([^"]+)" alt="([^"]+)" />}) {
            ($imgurl, $imgtitle, $imgalt) = ($1, $2, $3);
            if ($imgurl =~ m{/([^/]*)\.(\w+)$}) {
                # Keep the image's own name only when no temp name is configured
                $filename = ($tempfile eq '') ? "$1.$2" : "$tempfile.$2";
                print "Fetching $imgurl to $filename...\n" if $numvs > 1;
                print "Alt text: $imgtitle\n";
                getstore($imgurl, $filename);
                system($imageviewer, $filename);
            }
            else {
                print "Gack! Couldn't parse the image url!\n";
                $viewerr = 1;
            }
        }
        else {
            print "Gack! Couldn't parse the xkcd page!\n";
            $viewerr = 1;
        }
    }

    #Fall back to the browser if the image couldn't be shown
    if ($viewerr) {
        print "Open xckd page in browser? [Y/n]?";
        $browse = <STDIN>;
        chomp($browse);
        unless ($browse) { $browse = 'Y'; }
        if (uc(substr($browse,0,1)) eq 'Y') {
            system($urlviewer, sprintf($comicurl, $comicnum));
            print "Opening " . sprintf($comicurl, $comicnum) . "\n";
        }
    }

    # Every Trends lookup below is keyed on the publication date.
    unless ($havedate) {
        print "No publication date found for this comic, so trends can't be looked up; skipping...\n";
        next;
    }

    #Ask for search phrases until an empty one is entered
    while (1) {
        print "Search phrase?";
        $phrase = <STDIN>;
        chomp($phrase);
        last unless $phrase;

        $urlphrase = $phrase;
        $urlphrase =~ s/ /\+/g;

        # Month strings for the Trends URLs. Day 0 of a month is normalized by
        # strftime to the last day of the month before it, hence the offsets.
        $urldate  = "$year-$month";
        $prevdate = strftime("%Y-%m",0,0,0,0,$month - 1, $gtyear);
        $nextdate = strftime("%Y-%m",0,0,0,0,$month + 1, $gtyear);
        s/-0/-/ for ($urldate, $prevdate, $nextdate);

        $trendspage = sprintf($trendsurl, $urlphrase, $urldate);

        print "\n" . ("#" x 20) . "\n" if $numvs > 0;

        # The five-day window can reach into a neighbouring month; fetch that
        # month's data too when it does.
        if (strftime("%m",0,0,0,$day-2,$month-1,$gtyear) != $month) {
            print "NOTE: Downloading previous month\n" if $numvs > 0;
            @prevcsv = fetch_month_csvs($urlphrase, $prevdate);
        }
        @curcsv = fetch_month_csvs($urlphrase, $urldate);
        if (strftime("%m",0,0,0,$day+2,$month-1,$gtyear) != $month) {
            print "NOTE: Downloading next month\n" if $numvs > 0;
            @nextcsv = fetch_month_csvs($urlphrase, $nextdate);
        }

        #Grab the numbers! One entry per (day, scale), days -2..+2 in order.
        @indexlist = ();
        for my $offset (-2 .. 2) {
            #Determine the needed month's CSV
            $monthval = strftime("%Y.%m",0,0,0,$day+$offset,$month-1,$gtyear);
            print "$monthval/$year.$month" if $numvs > 1;
            if ($monthval == "$year.$month") {
                @csvlist = @curcsv;
            }
            elsif ($monthval < "$year.$month") {
                @csvlist = @prevcsv;
            }
            else {
                @csvlist = @nextcsv;
            }

            $gtdate = strftime("%b %e %Y",0,0,0,$day+$offset,$month-1,$gtyear);
            $gtdate =~ s/\s+/ /g;    #Get rid of the extra space padding from strftime's %e

            foreach my $csv (@csvlist) {
                if (defined $csv && $csv =~ /\Q$gtdate\E, ([\d.]+)/) {
                    my $index = $1;
                    push(@indexlist, $index);
                    print $csv if $numvs > 2;
                    print "Index on $gtdate: $index\n" if $numvs > 1;
                }
                else {
                    push(@indexlist, 0);
                    print "Error grabbing index for $gtdate.\n" if $numvs > 1;
                }
            }
        }
        print(("#" x 20) . "\n\n") if $numvs > 0;

        #Check whether the phrase was a "hot trend" on the publication date
        $hoturl    = sprintf($hottrendsurl, $urlphrase, $comicdate);
        $hottrends = get($hoturl);
        if (defined $hottrends
            && $hottrends =~ m{Hotness: <font[^>]+>([^<]+)</font>}) {
            $washot = $1;
            print(("!" x 20) . "\n");
            print "This term was $washot! $hoturl\n";
            print(("!" x 20) . "\n");
        }
        else {
            $washot = 'not';
        }

        # Any non-zero index means Trends returned usable data
        $defval = (grep { $_ > 0 } @indexlist) ? 1 : 0;

        print "Index for surrounding 5 days: " . join(' ',@indexlist[0..3]) . ' >' . join('/', @indexlist[4,5]) . '< ' . join(' ',@indexlist[6..9]) . " ($trendspage)\n";

        if ($defval) {
            #Take a guess as to whether these are good results or not:
            #the index rose from the day before to publication day at both scales
            $probgood = ($indexlist[2] < $indexlist[4] && $indexlist[3] < $indexlist[5]) ? 1 : 0;

            print "Save " . ($probgood ? '[Y/n]' : '[y/N]') . "?";
            my $dosave = <STDIN>;
            chomp($dosave);
            unless ($dosave) { $dosave = ($probgood ? 'Y' : 'N'); }

            if (uc(substr($dosave,0,1)) eq 'Y') {
                $writeline = sprintf("%d,%s,%s,%s,%s,%s,%s\n",
                    $comicnum, $comicdate, csv_quote($comictitle), csv_quote($trendspage),
                    csv_quote($phrase), join(',',@indexlist), $washot
                );
                # Only touch the log when there is something to write, and
                # report failures instead of dropping the row silently.
                if (open(my $log, '>>', 'xkcdeffect.csv')) {
                    print $log $writeline;
                    if (close($log)) {
                        print "Added to log file.\n" if $numvs > 1;
                    }
                    else {
                        print "WARNING: Couldn't finish writing the log file ($!).\n";
                    }
                }
                else {
                    print "WARNING: Couldn't open the log file ($!); entry not saved.\n";
                }
            }
        }
        else {
            print "No data avaliable, skipping...\n";
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment