Skip to content

Instantly share code, notes, and snippets.

@numeroteca
Created June 22, 2012 05:41
Show Gist options
  • Save numeroteca/2970558 to your computer and use it in GitHub Desktop.
Save numeroteca/2970558 to your computer and use it in GitHub Desktop.
Script by rporres (https://metacpan.org/author/RPORRES) for getting Newspaper names and countries from Kiosko.net
#!/usr/bin/env perl
# Usage: ./parse_kiosko.pl <lang>    (<lang> is one of: en, es, fr)
# Prints a CSV listing to STDOUT; see an example at
# http://brownbag.me:9001/p/pageonex-kiosko-newspaper-names
#
# The shebang has to be the very first line of the script, otherwise the
# "./parse_kiosko.pl" invocation above cannot pick the perl interpreter.
use strict;
use warnings;
use utf8;
use LWP::UserAgent;
use HTML::TreeBuilder 5 -weak;
use URI;
use constant LANG => qw(en es fr);

# No stdout buffering
$|++;

# Force UTF8 output
binmode(STDOUT, ":utf8");

# Exactly one argument is expected, and it must be one of the LANG codes;
# it selects which kiosko root page is crawled.
@ARGV == 1 or usage();
my ($lang) = @ARGV;
grep { $_ eq $lang } (LANG) or usage();
my $kiosko_url = "http://$lang.kiosko.net";

# Per-language patterns capturing the country name from a page <title>
my %regexp = (
    es => q|Periódicos de (.+?)\. Toda la prensa de hoy|,
    en => q|Newspapers in (.+?)\. Today's press covers|,
    fr => q|Les Unes des journaux de (.+?)\. Toute la presse d'aujourd'hui|,
);

# Compile the pattern for the selected language once instead of on every
# country page.
my $country_re = qr/$regexp{$lang}/;

# Top page
my $tree = HTML::TreeBuilder->new_from_url($kiosko_url);

# Look in the top page for <a> tags with
# - a defined href
# - a title starting with "Periodicos de"
# NOTE(review): the title filter is unaccented Spanish whatever <lang> is;
# confirm the en/fr top pages really carry such titles.
my @a_tags = $tree->look_down(
    '_tag', 'a',
    sub {
        defined $_[0]->attr('href')
            && defined $_[0]->attr('title')
            && $_[0]->attr('title') =~ /^Periodicos de/;
    },
);

# Get a sorted unique list of country page links.
# Exclude geo links as we will get them afterwards looking into the
# country pages.
my %uniq;
my @country_pages =
    grep { !$uniq{$_}++ }
    sort
    grep { !m:/geo/: }
    map  { $_->attr('href') } @a_tags;

# Header
print "Country, Country code, Newspaper name, Newspaper code\n";

for my $country_page (@country_pages) {
    my $p_uri  = URI->new_abs($country_page, $kiosko_url);
    my $p_tree = HTML::TreeBuilder->new_from_url($p_uri->as_string);

    # The country code is the link with its slashes removed. Work on a copy,
    # once per country: $country_page aliases the element of @country_pages,
    # so editing it in place would rewrite the list being iterated.
    (my $country_code = $country_page) =~ s:/::g;

    # Get title of this page to have the country.
    # Titles of pages have the format, i.e.
    # "Periódicos de Argentina. Toda la prensa de hoy. Kiosko.net"
    # "Periódicos de R. Dominicana. Toda la prensa de hoy. Kiosko.net"
    # Splitting on dots/whitespace is not an option because of names such
    # as "R. Dominicana", hence the per-language regexp.
    my ($p_title_tag) = $p_tree->look_down('_tag', 'title');
    if (!defined $p_title_tag) {
        warn "No <title> found in $p_uri, skipping country page\n";
        next;
    }
    my $p_title = $p_title_tag->as_text;

    # Fall back to the whole title when the pattern does not match
    my $country = $p_title =~ $country_re ? $1 : $p_title;

    my $gen_uri = URI->new_abs('general.html', $p_uri);
    my $g_tree  = HTML::TreeBuilder->new_from_url($gen_uri->as_string);

    # Links of the newspapers are in the <a> tags having
    # class=thcover and the attribute href defined
    my @gen_a_tags = $g_tree->look_down(
        '_tag', 'a',
        sub {
            defined $_[0]->attr('class')
                && $_[0]->attr('class') eq 'thcover'
                && defined $_[0]->attr('href');
        },
    );

    # Not named $a, so sort's package variable is not shadowed
    for my $link (@gen_a_tags) {
        my $href   = $link->attr('href');
        my $n_uri  = URI->new_abs($href, $kiosko_url);
        my $n_tree = HTML::TreeBuilder->new_from_url($n_uri->as_string);

        my ($n_title_tag) = $n_tree->look_down('_tag', 'title');
        if (!defined $n_title_tag) {
            warn "No <title> found in $n_uri, skipping newspaper\n";
            next;
        }

        # Get newspaper name.
        # Newspaper page titles have the format, i.e.
        # "Periódico Diario Libre (R. Dominicana). Periódicos de R. Dominicana. Toda la prensa de hoy. Kiosko.net"
        my $n_name = $n_title_tag->as_text;
        $n_name =~ s/ \(.+//;       # drop everything from the " (country)" part on
        $n_name =~ s/[^\s]+\s//;    # drop the leading word ("Periódico ")

        # Get newspaper code.
        # Newspaper links have the format: /fr/np/presseocean.html
        my ($n_code) = ( split /\//, $href )[-1];
        $n_code =~ s/\.html//;

        # Quote fields where needed so that a comma or a double quote inside
        # a name cannot break the row apart.
        print join(',', map { _csv_field($_) }
                $country, $country_code, $n_name, $n_code), "\n";
    }
}

# Return $value ready to be used as a CSV field: values containing a comma,
# a double quote or a line break are wrapped in double quotes with embedded
# double quotes doubled; any other value is returned unchanged.
sub _csv_field {
    my ($value) = @_;
    return $value unless $value =~ /[",\r\n]/;
    (my $quoted = $value) =~ s/"/""/g;
    return qq{"$quoted"};
}
# Abort the script with a usage message listing the accepted language codes.
# Takes no arguments and never returns: it always dies.
sub usage {
    # $0 is handed to sprintf as an argument rather than interpolated into
    # the format string, so a '%' in the script path is printed literally
    # instead of being read as a conversion.
    die sprintf "Usage: %s <%s>\n", $0, join '|', (LANG);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment