Skip to content

Instantly share code, notes, and snippets.

@cincodenada
Created March 19, 2011 05:59
Show Gist options
  • Save cincodenada/877266 to your computer and use it in GitHub Desktop.
Save cincodenada/877266 to your computer and use it in GitHub Desktop.
This is the script I used when writing my blog post about the XKCD effect (http://commacommacrash.com/2009/12/what-i-learned-from-xkcd-effect.html). A base for all my WWW::Mechanize scripts.
#!/usr/bin/perl
use LWP::Simple;
use WWW::Mechanize;
use HTTP::Cookies;
use Term::ReadKey;
use POSIX qw(strftime mktime);
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Site probed at start-up to decide whether we are online, and the name
# printed for it in the warning message.
my $interneturl = 'http://www.google.com';
my $internet    = 'Google';

# sprintf templates; each %d is filled with a comic number.  The doubled
# percent signs in the OhNoRobot template come out of sprintf as literal
# "%2F" sequences.
my $transcriptionurl = 'http://www.ohnorobot.com/transcribe.pl?comicid=apKHvCCc66NMg&url=http:%%2F%%2Fxkcd.com%%2F%d%%2F';
my $baseurl          = 'http://www.xkcd.com';
my $comicurl         = "$baseurl/%d/";

# Path of the archive page, appended to $baseurl.
my $archiveurl = 'archive';

# External programs launched to show a downloaded image / open a web page.
my $imageviewer = 'gnome-open';
my $urlviewer   = 'xdg-open';

# Base name (extension is added later) for the downloaded comic image.
# When this is the empty string the image keeps its own file name.
my $tempfile = 'temp';

# Google Trends sprintf templates: %s = search phrase, %s = date,
# and for the CSV export a trailing %d = scale.
my $trendsurl    = 'http://www.google.com/trends?q=%s&date=%s&geo=all';
my $hottrendsurl = 'http://www.google.com/trends/hottrends?q=%s&date=%s';
my $csvurl       = 'http://www.google.com/trends/viz?q=%s&date=%s&geo=all&graph=all_csv&scale=%d';

# Values substituted for the scale parameter of $csvurl; one CSV is fetched
# per entry.
my @scalenums = (0 .. 1);
# Verbosity level: the length of the second command-line argument, minus one.
# Progress messages throughout the script are printed when this is above 0
# (more detail at higher levels).
my $numvs = length($ARGV[1]) - 1;

if ($numvs > 0) {
    print "Intializing...\n";
    print "Checking internet connection...\n";
}

# Being offline only earns a warning; the checks below decide whether we
# can really continue.
if (!head($interneturl)) {
    print "WARNING: Either you're not connected to the internet, or $internet is down. Probably the former. Proceeding...\n";
}

# Probe both URL templates with comic #1; a failure here means the
# configuration is wrong, so give up.
if ($numvs > 0) {
    print "Checking OhNoRobot...\n";
}
head(sprintf($transcriptionurl, 1))
    or die 'OhNoRobot config is incorrect! Correct $transcriptionurl template.';

if ($numvs > 0) {
    print "Checking xkcd...\n";
}
head(sprintf($comicurl, 1))
    or die 'xkcd config is incorrect! Correct $comicurl or $baseurl template.';

# The archive page is the source of comic dates and titles later on.
# Missing it is survivable, so only warn.
if ($numvs > 0) {
    print "Downloading archive page...\n";
}
my $archivepage = get("$baseurl/$archiveurl");
if (!$archivepage) {
    print "WARNING: Couldn't get archive page! Dates and titles will be unavailable.\n";
}
# Browser object used for the Google Trends requests, and the cookie jar
# that carries its session.
my $trendsbot = WWW::Mechanize->new();
my $cookiejar = HTTP::Cookies->new();

# Reuse cookies saved by an earlier run when there are any; either way the
# jar (loaded or still empty) is handed to the browser.
my $have_saved_cookies = -e 'cookies.dat';
if ($have_saved_cookies) {
    $cookiejar->load('cookies.dat');
}
$trendsbot->cookie_jar($cookiejar);

# No saved cookies: ask for credentials and log in.
if (!$have_saved_cookies) {
    print "Google login?";
    my $login = <STDIN>;
    chomp($login);

    # Terminal mode 2 (Term::ReadKey "noecho") while the password is typed,
    # then mode 0 to restore the original settings.
    print "Google password?";
    ReadMode 2;
    my $pass = <STDIN>;
    chomp($pass);
    ReadMode 0;
    print "\n";

    # Fill in and submit the login form for the Trends service.
    $trendsbot->get('http://www.google.com/accounts/ServiceLogin?service=trends');
    $trendsbot->form_id('gaia_loginform');
    $trendsbot->field('Email',  $login);
    $trendsbot->field('Passwd', $pass);
    $trendsbot->click();

    # Keep the cookies so the next run can skip the login.
    $cookiejar->save('cookies.dat');
}
# Ask for a comic number on STDOUT/STDIN.
#   $question - prompt text to print
#   $default  - value used when the answer is blank (or "0")
# Returns a positive whole number.  Invalid answers are asked again; the
# script dies if input ends before a usable number is available.
sub prompt_comic_number {
    my ($question, $default) = @_;
    while (1) {
        print $question;
        my $answer   = <STDIN>;
        my $at_eof   = !defined $answer;
        $answer = '' if $at_eof;
        $answer =~ s/^\s+|\s+$//g;            # drop the newline and stray blanks
        $answer = $default unless $answer;    # blank answer means "use the default"
        return $answer if defined $answer && $answer =~ /^[1-9]\d*$/;
        die "\nNo usable comic number was entered.\n" if $at_eof;
        print "Please enter a positive comic number.\n";
    }
}

# Grab the latest comic number from the xkcd front page.  It is only used
# as a default below, so failing to find it is a warning, not an error.
my $latestnum   = '';
my $latestcomic = get($baseurl);
if (defined $latestcomic
    && $latestcomic =~ m{Permanent link to this comic: http://xkcd\.com/(\d+)/}) {
    $latestnum = $1;
}
else {
    print "WARNING: Couldn't determine the latest comic number! You will have to type it in yourself.\n";
}

# Range of comics to walk and the step between them.  With -r as the first
# argument the walk runs backwards, from the latest comic down to #1.
# (Both variables are declared in one parenthesised list; the original
# "my $start, $end;" only declared $start.)
my ($start, $end);
my $inc;
if (defined $ARGV[0] && $ARGV[0] eq '-r') {
    $start = prompt_comic_number("Starting comic [$latestnum]?", $latestnum);
    $end   = prompt_comic_number("Ending comic [1]?", 1);
    $inc   = -1;
}
else {
    $start = prompt_comic_number("Starting comic [1]?", 1);
    $end   = prompt_comic_number("Ending comic [$latestnum]?", $latestnum);
    $inc   = 1;
}
# Initialize the log file if it doesn't exist: write the CSV header row.
# There is one "Pubdate" column per day offset (-2 .. +2 around the
# publication date) and per entry of @scalenums.
unless (-e 'xkcdeffect.csv') {
    my @pubcols;
    foreach my $offset (-2 .. 2) {
        foreach my $scale (@scalenums) {
            push @pubcols, sprintf('Pubdate%+d (scale=%d)', $offset, $scale);
        }
    }

    # Three-argument open on a lexical handle; report the OS error so a
    # failure can actually be diagnosed.
    open(my $log, '>', 'xkcdeffect.csv')
        or die "Couldn't open the log file! Something is very wrong. ($!)\n";
    print {$log} join(',',
        'Comic Number', 'Comic Date', 'Comic Title', 'Trends URL', 'Keyphrase',
        @pubcols,
        'Hotness Rating',
    ), "\n";
    # Buffered write errors only surface at close time.
    close($log)
        or die "Couldn't finish writing the log file header! ($!)\n";
}
print "$start to $end\n" if $numvs > 0;

# Render one value as a field of the CSV log.
#   $value        - the text to write (undef is written as an empty field)
#   $always_quote - true to wrap the field in double quotes unconditionally
# Fields containing a comma, a double quote or a line break are always
# quoted, and embedded double quotes are doubled.
sub csv_field {
    my ($value, $always_quote) = @_;
    $value = '' unless defined $value;
    if ($always_quote || $value =~ /[",\r\n]/) {
        $value =~ s/"/""/g;
        return '"' . $value . '"';
    }
    return $value;
}

# Download the Google Trends CSV export for one month.
#   $urlphrase - search phrase, already prepared for use in a URL
#   $date      - month to request, in the form used by $csvurl
# Returns one CSV text per entry of @scalenums, in the same order.
sub fetch_trends_csvs {
    my ($urlphrase, $date) = @_;
    my @csvs;
    foreach my $scale (@scalenums) {
        my $url = sprintf($csvurl, $urlphrase, $date, $scale);
        print "Getting data from " . $url . "...\n" if $numvs > 0;
        $trendsbot->get($url);
        push @csvs, $trendsbot->content();
    }
    return @csvs;
}

# Everything looks fine, plow ahead...
# The loop condition depends on the direction of travel, so a start that is
# already past the end simply runs zero times instead of never terminating.
COMIC:
for (my $comicnum = $start;
     ($inc > 0 ? $comicnum <= $end : $comicnum >= $end);
     $comicnum += $inc) {

    my $transpage     = sprintf($transcriptionurl, $comicnum);
    my $transcription = get($transpage);

    # Date and title come from the comic's link on the archive page.
    my ($comicdate, $comictitle) = ("<No Date Found>", "<No Title Found>");
    if (defined $archivepage
        && $archivepage =~ /<a href="\/$comicnum\/" title="([^"]+)">([^<]+)<\/a>/) {
        ($comicdate, $comictitle) = ($1, $2);
    }

    # Split the date.  These are fresh for every comic so that a comic
    # without a date can never inherit the previous comic's values.
    my ($year, $month, $day, $gtyear);
    my $have_date = 0;
    if ($comicdate =~ /(\d+)-(\d+)-(\d+)/) {
        ($year, $month, $day) = ($1, $2, $3);
        $gtyear    = $year - 1900;             # strftime counts years from 1900
        $month     = sprintf("%02d", $month);  # zero-pad the month
        $have_date = 1;
    }

    print "-" x 20 . "\n";
    if ($have_date) {
        print "Comic #$comicnum: $comictitle (" . strftime('%a', 0, 0, 0, $day, $month - 1, $gtyear) . " $comicdate)\n";
    }
    else {
        print "Comic #$comicnum: $comictitle ($comicdate)\n";
    }

    # Pull the transcript out of the OhNoRobot page: either the finished
    # transcription paragraph or the contents of the edit textarea.  Which
    # one matched is remembered in a flag, because the substitution below
    # resets the capture variables.
    my ($text, $from_textarea);
    if (defined $transcription) {
        if ($transcription =~ /Here's the transcription for this comic!<p>(.*?)<\/p>/s) {
            ($text, $from_textarea) = ($1, 0);
        }
        elsif ($transcription =~ /<textarea[^>]*>([^<]+)<\/textarea>/) {
            ($text, $from_textarea) = ($1, 1);
        }
    }

    my $doimage;
    if (defined $text) {
        $text =~ s/\[\[.*?\]\]//gs;          # eliminate descriptions
        unless ($from_textarea) {
            $text =~ s/[\r\n\t]//gs;         # eliminate HTML whitespace
            $text =~ s/(<br>)+/\n/g;         # insert newlines for <br>s
        }
        print "$text\n";
        print "Do you want to view the comic image [y/N/q]?";
        $doimage = <STDIN>;
        chomp($doimage) if defined $doimage;
        $doimage = 'N' unless $doimage;
    }
    else {
        print "Could not download text" . ($numvs > 0 ? " (tried $transpage)" : '') . "!\nDo you want to view the comic image [Y/n/q]?";
        $doimage = <STDIN>;
        chomp($doimage) if defined $doimage;
        $doimage = 'Y' unless $doimage;
    }
    print "\n";
    last COMIC if uc($doimage) eq 'Q';

    # Optionally download the comic image and hand it to the image viewer.
    my $viewerr = 0;
    if (uc(substr($doimage, 0, 1)) eq 'Y') {
        my $comicpage = get(sprintf($comicurl, $comicnum));
        if (defined $comicpage
            && $comicpage =~ /<br\/>[\r\n]+<br\/>[\r\n]+<img src\="([^"]+)" title\="([^"]+)" alt\="([^"]+)" \/>/) {
            my ($imgurl, $imgtitle) = ($1, $2);
            if ($imgurl =~ /\/([^\/]*)\.(\w+)$/) {
                # Keep the image's own name only when no temp name is configured.
                my $filename = $tempfile eq '' ? "$1.$2" : "$tempfile.$2";
                print "Fetching $imgurl to $filename...\n" if $numvs > 1;
                print "Alt text: $imgtitle\n";
                # getstore returns the HTTP status code; only launch the
                # viewer when the file was really downloaded.
                my $status = getstore($imgurl, $filename);
                if (is_success($status)) {
                    system $imageviewer, $filename;
                }
                else {
                    print "Gack! Couldn't download the image (HTTP status $status)!\n";
                    $viewerr = 1;
                }
            }
            else {
                print "Gack! Couldn't parse the image url!\n";
                $viewerr = 1;
            }
        }
        else {
            print "Gack! Couldn't parse the xkcd page!\n";
            $viewerr = 1;
        }
    }

    # Showing the image failed: offer the comic's web page instead.
    if ($viewerr) {
        print "Open xckd page in browser? [Y/n]?";
        my $browse = <STDIN>;
        chomp($browse) if defined $browse;
        $browse = 'Y' unless $browse;
        if (uc(substr($browse, 0, 1)) eq 'Y') {
            my $pageurl = sprintf($comicurl, $comicnum);
            system $urlviewer, $pageurl;
            print "Opening " . $pageurl . "\n";
        }
    }

    # Every Trends request below is built from the publication date.
    unless ($have_date) {
        print "No publication date available for this comic, so Google Trends can't be queried; skipping...\n";
        next COMIC;
    }

    # Keep asking for search phrases until a blank one is entered.
    PHRASE:
    while (1) {
        print "Search phrase?";
        my $phrase = <STDIN>;
        chomp($phrase) if defined $phrase;
        last PHRASE unless $phrase;

        (my $urlphrase = $phrase) =~ s/ /\+/g;

        # Months to request.  Day 0 of a month is normalised by strftime to
        # the last day of the month before it, which is how the previous
        # and next months are reached here.
        my $urldate  = "$year-$month";
        my $prevdate = strftime("%Y-%m", 0, 0, 0, 0, $month - 1, $gtyear);
        my $nextdate = strftime("%Y-%m", 0, 0, 0, 0, $month + 1, $gtyear);
        s/-0/-/ for ($urldate, $prevdate, $nextdate);    # no zero-padded months in the URL

        my $trendspage = sprintf($trendsurl, $urlphrase, $urldate);
        print "\n" . "#" x 20 . "\n" if $numvs > 0;

        # The five-day window may reach into the neighbouring months; fetch
        # those only when it does.
        my (@prevcsv, @curcsv, @nextcsv);
        if (strftime("%m", 0, 0, 0, $day - 2, $month - 1, $gtyear) != $month) {
            print "NOTE: Downloading previous month\n" if $numvs > 0;
            @prevcsv = fetch_trends_csvs($urlphrase, $prevdate);
        }
        @curcsv = fetch_trends_csvs($urlphrase, $urldate);
        if (strftime("%m", 0, 0, 0, $day + 2, $month - 1, $gtyear) != $month) {
            print "NOTE: Downloading next month\n" if $numvs > 0;
            @nextcsv = fetch_trends_csvs($urlphrase, $nextdate);
        }

        # Grab the numbers!  One index per day (-2 .. +2) and per scale;
        # 0 is recorded when a day is missing from the CSV.
        my @indexlist;
        for my $offset (-2 .. 2) {
            # Determine the needed month's CSV.
            my $monthval = strftime("%Y.%m", 0, 0, 0, $day + $offset, $month - 1, $gtyear);
            print "$monthval/$year.$month" if $numvs > 1;
            my @csvlist = $monthval == "$year.$month" ? @curcsv
                        : $monthval <  "$year.$month" ? @prevcsv
                        :                               @nextcsv;

            my $gtdate = strftime("%b %e %Y", 0, 0, 0, $day + $offset, $month - 1, $gtyear);
            $gtdate =~ s/\s+/ /g;    # %e pads single-digit days with an extra space
            foreach my $csv (@csvlist) {
                if ($csv =~ /\Q$gtdate\E, ([\d\.]+)/) {
                    my $index = $1;
                    push(@indexlist, $index);
                    print $csv if $numvs > 2;
                    print "Index on $gtdate: $index\n" if $numvs > 1;
                }
                else {
                    push(@indexlist, 0);
                    print "Error grabbing index for $gtdate.\n" if $numvs > 1;
                }
            }
        }
        print "#" x 20 . "\n\n" if $numvs > 0;

        # Was the phrase a "hot trend" on the publication date?
        my $hoturl    = sprintf($hottrendsurl, $urlphrase, $comicdate);
        my $hottrends = get($hoturl);
        my $washot    = 'not';
        if (defined $hottrends && $hottrends =~ /Hotness: <font[^>]+>([^<]+)<\/font>/) {
            $washot = $1;
            print "!" x 20 . "\n";
            print "This term was $washot! $hoturl\n";
            print "!" x 20 . "\n";
        }

        # NOTE: the slices below assume two scales per day (ten entries):
        # 0-3 are the two days before, 4-5 the publication date, 6-9 the
        # two days after.
        print "Index for surrounding 5 days: " . join(' ', @indexlist[0..3]) . ' >' . join('/', @indexlist[4,5]) . '< ' . join(' ', @indexlist[6..9]) . " ($trendspage)\n";

        my $defval = grep { $_ > 0 } @indexlist;
        if ($defval) {
            # Take a guess as to whether these are good results or not: the
            # index rose from the day before to the publication date at
            # both scales.
            my $probgood = ($indexlist[2] < $indexlist[4] && $indexlist[3] < $indexlist[5]) ? 1 : 0;
            print "Save " . ($probgood ? '[Y/n]' : '[y/N]') . "?";
            my $save = <STDIN>;
            chomp($save) if defined $save;
            $save = ($probgood ? 'Y' : 'N') unless $save;

            if (uc(substr($save, 0, 1)) eq 'Y') {
                my $writeline = join(',',
                    sprintf('%d', $comicnum),
                    $comicdate,
                    csv_field($comictitle, 1),
                    csv_field($trendspage, 1),
                    csv_field($phrase),
                    join(',', @indexlist),
                    csv_field($washot),
                ) . "\n";

                # The log is opened only when there is something to write,
                # and a failure is reported instead of silently losing the row.
                if (open(my $log, '>>', 'xkcdeffect.csv')) {
                    print {$log} $writeline;
                    if (close($log)) {
                        print "Added to log file." if $numvs > 1;
                    }
                    else {
                        warn "WARNING: Couldn't finish writing to the log file ($!); this result may NOT have been saved.\n";
                    }
                }
                else {
                    warn "WARNING: Couldn't open the log file ($!); this result was NOT saved.\n";
                }
            }
        }
        else {
            print "No data avaliable, skipping...\n";
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment