Created
December 6, 2012 18:45
-
-
Save mh-github/4226994 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| use strict; | |
| use LWP::UserAgent; | |
| use HTTP::Response; | |
| use URI::Heuristic; | |
| use HTML::Entities; | |
| use Text::Unidecode; | |
# Fetch a page over HTTP and return its body.
#
# Parameters:
#   $raw_url - URL (or URL-like string) to fetch. It is normalised with
#              URI::Heuristic::uf_urlstr before the request is made.
#
# Returns:
#   The response body as given by HTTP::Response::content().
#
# Does not return on an HTTP error response: the failure is reported on
# STDERR and the program ends via exit (status 0). The exit is kept on
# purpose -- the page-walking loop in this file is a bare while (1) and stops
# only because this sub exits once a page cannot be fetched. The STDERR line
# makes it possible to tell "ran out of pages" from a genuine failure.
sub get_html {
    my $raw_url = shift;

    my $url = URI::Heuristic::uf_urlstr($raw_url);
    my $ua  = LWP::UserAgent->new();

    # bogus referrer to perplex the log analyzers
    my $response = $ua->get($url, Referer => "http://wizard.yellowbrick.oz");

    if ($response->is_error()) {
        # Diagnostics go to STDERR so the HTML written to STDOUT stays clean.
        my $shown_url = defined $url ? $url : '(undefined URL)';
        print STDERR "get_html: stopping, GET $shown_url failed: "
            . $response->status_line() . "\n";
        exit;
    }

    return $response->content();
}
# Print the publish time and the links found in one blog post.
#
# Parameters:
#   $url - address of the post; the page is downloaded with get_html.
#
# Writes to STDOUT:
#   - a "Published Time: ..." line taken from the page's
#     "tracking.publishTime = ...;" assignment, and
#   - a <ul> holding one <li> per <a href="...">text</a> anchor found between
#     the "article start" and "article end" markers. Lines mentioning
#     "Twitter" are ignored. The <ul>/</ul> pair is printed only when at
#     least one link was found.
#
# Dies with "Couldn't find post" when the article markers are absent.
sub get_links_in_post {
    my $url  = shift;
    my $page = get_html($url);

    my ($published) = $page =~ m{tracking\.publishTime = (.*?)\;}s;
    printf("Published Time: %s\n", $published);

    my ($article) = $page =~ m{article\s+start(.*?)article\s+end}s
        or die "Couldn't find post";

    # remove unnecessary links to past / relevant posts
    $article =~ s/insetContent(.*?)div>//s;

    my $anchor_re = qr{
        <a\ href="
        ([^\"]+)    # link to doc or external source
        \">
        ([^<]+)     # link text in the post
        </a>
    }x;

    # <ul> and </ul> need to be printed only once each, and only if there is
    # at least one link to list.
    my $list_opened = 0;

    for my $line (split /\n/, $article) {
        next if $line =~ /Twitter/;

        while ($line =~ /$anchor_re/g) {
            my ($href, $text) = ($1, $2);

            if (!$list_opened) {
                print "<ul>\n";
                $list_opened = 1;
            }
            printf("<li>%s -> <a href=\"%s\">%s</a></li>\n", $text, $href, $href);
        }
    }

    print "</ul>\n" if $list_opened;
}
# Main crawl: walk the "Economics Journal" tag archive page by page
# (.../page/1/, .../page/2/, ...). For every post title found, print the
# title and post URL as an HTML fragment, then list the links inside the post
# via get_links_in_post.
#
# The loop has no stop condition of its own: it ends when get_html exits the
# program because a page could not be fetched.
my $base_url = "http://blogs.wsj.com/indiarealtime/tag/Economics-Journal/page/";
my $counter = 0;
my $html;

# A plain <a href="URL">TITLE</a> anchor on a post-title line.
my $title_link_re = qr{
    <a\ href="
    ([^\"]+)    # link to blog = everything to next quote
    \">
    ([^<]+)     # blog title = everything up to </a>
    </a>
}x;

while (1) {
    $counter++;
    my $raw_url = sprintf "%s%i/", $base_url, $counter;
    $html = get_html($raw_url);

    foreach my $line (split /\n/, $html) {
        next unless $line =~ /(postTitle)(.*?)Economics Journal:/;

        $line =~ s/Economics Journal: //;    # I don't know why every post has this prefix

        # Capture in list context and check the result: after a failed match
        # $1/$2 would still hold values from an earlier match (or be undef),
        # and using them would print garbage and fetch a bogus URL.
        my ($post_url, $post_title) = $line =~ $title_link_re;
        if (!defined $post_url) {
            print STDERR "skipping title line without a plain link on page $counter\n";
            next;
        }

        # The title is entity-decoded and transliterated to ASCII, so it has
        # to be escaped again before it is written back into HTML.
        my $safe_title
            = encode_entities(unidecode(decode_entities($post_title)), '<>&"');

        print "<hr/>\n";
        printf("%s -> <a href=\"%s\">%s</a>\n<br/>\n", $safe_title, $post_url, $post_url);
        get_links_in_post($post_url);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment