Last active
July 17, 2019 23:18
-
-
Save briandfoy/ba81bab108a0021f1d78e3cbc19c7d63 to your computer and use it in GitHub Desktop.
Turn the list of US National Park sites into JSON
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/Users/brian/bin/perl
# The script relies on the non-destructive substitution flag (s///r), which
# first appeared in Perl 5.14, so require that version instead of 5.10.
use v5.14;
use strict;
use warnings;

# Decode/encode the standard filehandles (and default open layers) as UTF-8.
use open qw(:std :utf8);

=head1 NAME

national_parks.pl - scrape the National Park sites

=head1 SYNOPSIS

    $ perl national_parks.pl

=head1 DESCRIPTION

This program scrapes the US National Park Service site and prints a JSON
array with one entry per section of the page, each listing the National
Park sites in that section.

=head1 AUTHOR & COPYRIGHT

Copyright © 2019, brian d foy <[email protected]>

=head1 LICENSE

You can use this code under the terms of the Artistic License 2.0.

=cut

use Mojo::DOM;
use Mojo::File;
use Mojo::JSON qw(encode_json to_json);
use Mojo::URL;
use Mojo::UserAgent;
use Mojo::Util qw(dumper encode decode trim);

# Load the page, locate every collapsible section, and turn each one into
# a record with process_item.
my $dom      = get_body();
my $sections = $dom->find('div.collapsible-item');
my @items    = @{ $sections->map( \&process_item )->to_array };

# The standard filehandles already have an encoding layer (see "use open"),
# so emit character JSON with to_json rather than byte JSON.
say to_json( \@items );

# process_item
#
# Turn one 'div.collapsible-item' element into a record. It is used as the
# callback for ->map over the elements found in the page, and reads the
# current element from $_.
#
# Returns a hash reference with two keys:
#   section - the heading link text, with a trailing " (...)" part removed
#   sites   - array reference of hashes, one per <br>-separated entry in the
#             section body, each with:
#               link  - absolute URL of the entry's link, or '' if none
#               text  - the link text, or the text before the first comma
#               state - the text after the last comma; an array reference
#                       when that text joins several names with "and"
#
# Dies (method call on undef) if the expected heading link or body <div>
# is not present in the element.
sub process_item {
    my $item = $_;

    # Descendant selector for the heading link, one component per line.
    my $selector = join ' ', qw(
        div.collapsible-item-heading
        h4.collapsible-item-title
        a.collapsible-item-title-link
    );
    my $section = $item->at($selector)->all_text =~ s/\s+\(.+//r;

    # The entries inside the body are separated by <br /> tags, so split
    # the serialized HTML on them and parse each piece on its own.
    my $body  = $item->at('div.collapsible-item-body');
    my @lines = map { Mojo::DOM->new( trim($_) ) } split /<br.*?>/, $body;

    # The first piece still starts with the body's opening <div>; keep
    # only what is inside it.
    $lines[0] = Mojo::DOM->new( $lines[0]->at('div')->content );

    my @results;
    foreach my $line (@lines) {
        my %hash;

        # Some things don't have links. In that case, we have to find
        # other ways to separate the name of the place from the
        # state it's in.
        if ( my $anchor = $line->at('a') ) {
            $hash{link} = Mojo::URL->new( $anchor->attr('href') )
                ->base( base_link() )
                ->to_abs
                ->to_string;
            $hash{text} = $anchor->text;
        }
        else {
            $hash{link} = '';
            $hash{text} = $line->text =~ s/,.*//r;
        }

        # trim returns the trimmed copy, so its result has to be kept.
        # NOTE(review): the greedy .* keeps only what follows the last
        # ", " - confirm that is intended for comma-separated states.
        my $state = trim( $line->text =~ s/.*,\s+//gr );
        $state =~ s/\s*,\s*/ and /g;

        # Split on the whole word "and" only, so names that merely contain
        # those letters (Maryland, Rhode Island) stay intact.
        $hash{state} = $state =~ /\band\b/
            ? [ split /\s*\band\b\s*/, $state ]
            : $state;

        push @results, \%hash;
    }

    return { section => $section, sites => \@results };
}

# base_link
#
# Returns a new Mojo::URL object for the page that lists the National Park
# System sites. A fresh object is built on every call.
sub base_link {
    my $address = 'https://www.nps.gov/aboutus/national-park-system.htm';
    return Mojo::URL->new($address);
}

# get_body
#
# Returns a Mojo::DOM object for the page at base_link().
#
# The page is cached next to the program in "$0.txt". When that file does
# not exist yet, the page is fetched and its raw bytes are saved there.
# The content is then always read back from the cache file as UTF-8, so
# the parser receives decoded characters whether or not a fetch happened.
#
# Dies if the request is not successful or if the cache file cannot be
# written or read.
sub get_body {
    my $path = Mojo::File->new("$0.txt");

    unless ( -e $path ) {
        my $ua = Mojo::UserAgent->new;
        my $tx = $ua->get( base_link() );
        die "Failed" unless $tx->result->is_success;

        # The body is raw bytes, so write it without an encoding layer.
        open my $out, '>:raw', "$path"
            or die "Could not open $path for writing: $!";
        print {$out} $tx->result->body
            or die "Could not write to $path: $!";
        close $out
            or die "Could not close $path: $!";
    }

    # Single read path: decode the cached bytes into characters.
    open my $in, '<:encoding(UTF-8)', "$path"
        or die "Could not open $path for reading: $!";
    my $text = do { local $/; <$in> };
    close $in;

    return Mojo::DOM->new($text);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment