mh-github · December 6, 2012 18:45
diff --git a/rupa-blog-links-v3.pl b/rupa-blog-links-v3.pl
 use strict;
 use LWP::UserAgent;
 use HTTP::Response;
 use URI::Heuristic;
 use HTML::Entities;
 use Text::Unidecode;

 # Fetch a page and return its body.
 #
 # Params:  $raw_url - URL (or URL-like string); it is passed through
 #                     URI::Heuristic::uf_urlstr before the request is made.
 # Returns: the response body as given by the response's content() method.
 # Exits:   ends the whole script when the response is an error. The paging
 #          loop in this file relies on that exit to stop once a page request
 #          fails, so the exit is kept; the failing URL and status are
 #          reported on STDERR first instead of ending the run silently.
 sub get_html {
 	my $raw_url = shift;
 	my $url = URI::Heuristic::uf_urlstr($raw_url);

 	my $ua = LWP::UserAgent->new();
 	# bogus referrer to perplex the log analyzers
 	my $response = $ua->get($url, Referer => "http://wizard.yellowbrick.oz");

 	if ($response->is_error()) {
 		# Say which request failed and why before giving up.
 		warn sprintf("Stopping: GET %s failed: %s\n", $url, $response->status_line());
 		exit;
 	}

 	return $response->content();
 }

 # Fetch one blog post and print the links found inside its article body as
 # an HTML list on STDOUT.
 #
 # Params:  $url - address of the post; fetched via get_html().
 # Output:  a "Published Time: ..." line, then one <li> per matched anchor,
 #          wrapped in <ul>...</ul> only when at least one anchor matched.
 # Dies:    with "Couldn't find post" when the article start/end markers are
 #          not found in the fetched page.
 sub get_links_in_post {
 	my ($url) = @_;
 	my $page = get_html($url);

 	my ($publishTime) = $page =~ m{tracking\.publishTime = (.*?)\;}s;
 	printf ("Published Time: %s\n", $publishTime);

 	my ($article) = $page =~ m{article\s+start(.*?)article\s+end}s
 		or die "Couldn't find post";

 	# remove unnecessary links to past / relevant posts
 	$article =~ s/insetContent(.*?)div>//s;

 	# Anchor whose href is immediately followed by the closing ">".
 	my $anchor_re = qr{
 		<a\ href="
 		([^\"]+)   # link to doc or external source
 		\">
 		([^<]+)    # link text in the post
 		</a>
 	}x;

 	my $list_open = 0;  # true once the opening <ul> has been printed
 	for my $line (split /\n/, $article) {
 		next if $line =~ /Twitter/;
 		while ($line =~ /$anchor_re/g) {
 			my ($href, $text) = ($1, $2);
 			unless ($list_open) {
 				print "<ul>\n";
 				$list_open = 1;
 			}
 			printf ("<li>%s -> <a href=\"%s\">%s</a></li>\n", $text, $href, $href);
 		}
 	}

 	# Close the list only if it was ever opened.
 	print "</ul>\n" if $list_open;
 }

 # Walk the tag listing pages in order (page/1/, page/2/, ...). For every post
 # title line found, print the title with its link, then the links inside that
 # post via get_links_in_post().
 #
 # The walk stops at the first page that yields no posts; get_html() also
 # ends the run when a page request fails.
 my $base_url = "http://blogs.wsj.com/indiarealtime/tag/Economics-Journal/page/";
 my $counter = 0;
 my $html;
 while (1) {
 	$counter++;
 	my $raw_url = sprintf "%s%i/", $base_url, $counter;
 	$html = get_html($raw_url);

 	my $posts_on_page = 0;
 	foreach my $line (split /\n/, $html) {
 		next unless $line =~ /(postTitle)(.*?)Economics Journal:/;
 		$line =~ s/Economics Journal: //;    # I don't know why every post has this prefix

 		# Take the captures straight from the match and skip the line when the
 		# anchor is missing; otherwise $1/$2 would still hold values from an
 		# earlier match and a bogus "URL" would be fetched.
 		my ($link, $title) = $line =~ m{
 				<a\ href="
 				([^\"]+)  # link to blog = everything to next quote
 				\">
 				([^<]+)   # blog title = everything up to </a>
 				</a>
 			}x
 			or next;

 		$posts_on_page++;
 		print "<hr/>\n";
 		printf("%s -> <a href=\"%s\">%s</a>\n<br/>\n", unidecode(decode_entities($title)), $link, $link);
 		get_links_in_post($link);
 	}

 	# A listing page without any post means there is nothing left to walk;
 	# without this check a successfully fetched empty page would loop forever.
 	last unless $posts_on_page;
 }
	use strict;
	use LWP::UserAgent;
	use HTTP::Response;
	use URI::Heuristic;
	use HTML::Entities;
	use Text::Unidecode;

	# Fetch a page and return its body.
	#
	# Params:  $raw_url - URL (or URL-like string); it is passed through
	#                     URI::Heuristic::uf_urlstr before the request is made.
	# Returns: the response body as given by the response's content() method.
	# Exits:   ends the whole script when the response is an error. The paging
	#          loop in this file relies on that exit to stop once a page request
	#          fails, so the exit is kept; the failing URL and status are
	#          reported on STDERR first instead of ending the run silently.
	sub get_html {
		my $raw_url = shift;
		my $url = URI::Heuristic::uf_urlstr($raw_url);

		my $ua = LWP::UserAgent->new();
		# bogus referrer to perplex the log analyzers
		my $response = $ua->get($url, Referer => "http://wizard.yellowbrick.oz");

		if ($response->is_error()) {
			# Say which request failed and why before giving up.
			warn sprintf("Stopping: GET %s failed: %s\n", $url, $response->status_line());
			exit;
		}

		return $response->content();
	}

	# Fetch one blog post and print the links found inside its article body as
	# an HTML list on STDOUT.
	#
	# Params:  $url - address of the post; fetched via get_html().
	# Output:  a "Published Time: ..." line, then one <li> per matched anchor,
	#          wrapped in <ul>...</ul> only when at least one anchor matched.
	# Dies:    with "Couldn't find post" when the article start/end markers are
	#          not found in the fetched page.
	sub get_links_in_post {
		my ($url) = @_;
		my $page = get_html($url);

		my ($publishTime) = $page =~ m{tracking\.publishTime = (.*?)\;}s;
		printf ("Published Time: %s\n", $publishTime);

		my ($article) = $page =~ m{article\s+start(.*?)article\s+end}s
			or die "Couldn't find post";

		# remove unnecessary links to past / relevant posts
		$article =~ s/insetContent(.*?)div>//s;

		# Anchor whose href is immediately followed by the closing ">".
		my $anchor_re = qr{
			<a\ href="
			([^\"]+) # link to doc or external source
			\">
			([^<]+) # link text in the post
			</a>
		}x;

		my $list_open = 0; # true once the opening <ul> has been printed
		for my $line (split /\n/, $article) {
			next if $line =~ /Twitter/;
			while ($line =~ /$anchor_re/g) {
				my ($href, $text) = ($1, $2);
				unless ($list_open) {
					print "<ul>\n";
					$list_open = 1;
				}
				printf ("<li>%s -> <a href=\"%s\">%s</a></li>\n", $text, $href, $href);
			}
		}

		# Close the list only if it was ever opened.
		print "</ul>\n" if $list_open;
	}

	# Walk the tag listing pages in order (page/1/, page/2/, ...). For every post
	# title line found, print the title with its link, then the links inside that
	# post via get_links_in_post().
	#
	# The walk stops at the first page that yields no posts; get_html() also
	# ends the run when a page request fails.
	my $base_url = "http://blogs.wsj.com/indiarealtime/tag/Economics-Journal/page/";
	my $counter = 0;
	my $html;
	while (1) {
		$counter++;
		my $raw_url = sprintf "%s%i/", $base_url, $counter;
		$html = get_html($raw_url);

		my $posts_on_page = 0;
		foreach my $line (split /\n/, $html) {
			next unless $line =~ /(postTitle)(.*?)Economics Journal:/;
			$line =~ s/Economics Journal: //; # I don't know why every post has this prefix

			# Take the captures straight from the match and skip the line when the
			# anchor is missing; otherwise $1/$2 would still hold values from an
			# earlier match and a bogus "URL" would be fetched.
			my ($link, $title) = $line =~ m{
					<a\ href="
					([^\"]+) # link to blog = everything to next quote
					\">
					([^<]+) # blog title = everything up to </a>
					</a>
				}x
				or next;

			$posts_on_page++;
			print "<hr/>\n";
			printf("%s -> <a href=\"%s\">%s</a>\n<br/>\n", unidecode(decode_entities($title)), $link, $link);
			get_links_in_post($link);
		}

		# A listing page without any post means there is nothing left to walk;
		# without this check a successfully fetched empty page would loop forever.
		last unless $posts_on_page;
	}
No results found