Created
September 11, 2020 10:46
-
-
Save tobyink/92eac64a71095e19917ab3912cde2f59 to your computer and use it in GitHub Desktop.
Web scraper using Zydeco
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env perl | |
| use v5.14; | |
| package Local::App { | |
| use Zydeco; | |
| # Utility function. | |
| # | |
| require HTML::Entities; | |
| sub E ($) { goto \&HTML::Entities::encode_entities_numeric } | |
| use Path::Tiny qw(path); | |
| # A role for creating singleton classes. | |
| # Classes composing the role need to provide a method | |
| # name for the singleton factory. | |
| # | |
| role SingletonFactory ( $methodname ) { | |
| my $instance; | |
| # After the role has been applied to a class... | |
| # | |
| after_apply { | |
| if ( $kind eq 'class' ) { | |
| # Create that factory method. | |
| # | |
| factory {$methodname} () { | |
| $instance ||= $class->new; | |
| } | |
| } | |
| } | |
| } | |
| # Create a subclass of HTTP::Tiny | |
| # which is a singleton. | |
| # | |
| class UA 1.99 { | |
| extends ::HTTP::Tiny; | |
| with SingletonFactory('get_ua'); | |
| # Only difference is that it dies if | |
| # the HTTP request fails. | |
| # | |
| around request ( @args ) { | |
| my $response = $self->$next( @args ); | |
| confess( 'HTTP %d Error: %s', $response->{status}, $response->{reason} ) | |
| unless $response->{success}; | |
| return $response; | |
| } | |
| } | |
| # Create a subclass of HTML::HTML5::Parser | |
| # which is a singleton. | |
| # | |
| class HTML_Parser { | |
| extends ::HTML::HTML5::Parser; | |
| with SingletonFactory('get_html_parser'); | |
| # Could load this anywhere, but convenient | |
| # to do it here. | |
| # | |
| require XML::LibXML::QuerySelector; | |
| } | |
| # Create a subclass of DateTime::Format::Natural | |
| # which is a singleton. | |
| # | |
| class DT_Parser { | |
| extends ::DateTime::Format::Natural; | |
| with SingletonFactory('get_dt_parser'); | |
| } | |
| # This is the web scraping class. | |
| # | |
| class WebPage { | |
| # Required attribute. | |
| # | |
| has url!; | |
| # A bunch of lazy attributes. | |
| # | |
| has ua ( | |
| handles => { 'get' => 'get' } | |
| ) = $self->FACTORY->get_ua; | |
| has html_parser = $self->FACTORY->get_html_parser; | |
| has dt_parser = $self->FACTORY->get_dt_parser; | |
| has content = $self->get( $self->url )->{content}; | |
| has dom ( | |
| handles => { 'q' => 'querySelector' } | |
| ) = $self->html_parser->parse_string( $self->content ); | |
| has title = $self->q('.entry-title')->textContent; | |
| has datetime = do { | |
| my $dt = $self->q('.dateline .date')->textContent; | |
| $self->dt_parser->parse_datetime( $dt ); | |
| }; | |
| has body = do { | |
| my @nodes = $self->q('.entry-body')->childNodes; | |
| join '', map $_->toString, @nodes; | |
| }; | |
| has summary = do { | |
| $self->content =~ m{<dc:description>(.*)</dc:description>}; | |
| $1; | |
| }; | |
| has blog_slug = do { | |
| $self->url =~ m{http://blogs.perl.org/users/([^/]+)/(\d+)/(\d+)/([^/]+).html}; | |
| $1; | |
| }; | |
| has entry_slug = do { | |
| $self->url =~ m{http://blogs.perl.org/users/([^/]+)/(\d+)/(\d+)/([^/]+).html}; | |
| $4; | |
| }; | |
| has blog_id = do { | |
| $self->content =~ m{<input type="hidden" name="IncludeBlogs" value="(\d+)" />}; | |
| $1; | |
| }; | |
| has entry_id = do { | |
| $self->content =~ m{<div class="entry" id="entry-(\d+)">}; | |
| $1; | |
| }; | |
| has id = sprintf( | |
| 'tag:blogs.perl.org,%s:/users/%s//%s.%s', | |
| $self->datetime->year, | |
| $self->blog_slug, | |
| $self->blog_id, | |
| $self->entry_id, | |
| ); | |
| has tags = do { | |
| my @nodes = $self->dom->querySelectorAll('a[rel=tag]'); | |
| [ map $_->textContent, @nodes ]; | |
| }; | |
| has atom_path = path sprintf( | |
| 'sources/bpo/blog/%04d/%02d/%02d/%s/index.atom', | |
| $self->datetime->year, | |
| $self->datetime->month, | |
| $self->datetime->day, | |
| $self->entry_slug, | |
| ); | |
| # Method to output the blog post as Atom. | |
| # | |
| method to_atom () { | |
| my $return = qq{<?xml version="1.0" encoding="UTF-8"?>\n}; | |
| $return .= qq{<entry xml:lang="en" xmlns="http://www.w3.org/2005/Atom">\n}; | |
| $return .= sprintf(qq{<title>%s</title>\n}, E $self->title); | |
| $return .= sprintf(qq{<link rel="alternate" type="text/html" href="%s" />\n}, E $self->url); | |
| $return .= sprintf(qq{<id>%s</id>\n}, E $self->id); | |
| $return .= sprintf(qq{<published>%s</published>\n}, $self->datetime); | |
| $return .= sprintf(qq{<updated>%s</updated>\n}, $self->datetime); | |
| $return .= sprintf(qq{<summary>%s</summary>\n}, E $self->summary); | |
| $return .= sprintf(qq{<category term="%s" scheme="http://www.sixapart.com/ns/types#category" />\n}, E $_) for @{ $self->tags }; | |
| $return .= sprintf(qq{<content type="html" xml:base="http://blogs.perl.org/users/%s/"><![CDATA[%s]]></content>\n}, E $self->blog_slug, $self->body); | |
| $return .= qq{</entry>\n}; | |
| return $return; | |
| } | |
| method process () { | |
| say "Processing ", $self->url; | |
| return if $self->atom_path->is_file; | |
| say "... making ", $self->atom_path; | |
| $self->atom_path->parent->mkpath; | |
| $self->atom_path->spew_utf8( $self->to_atom ); | |
| say "... done"; | |
| return; | |
| } | |
| factory new_page ( Str $url ) { | |
| $class->new( url => $url ); | |
| } | |
| } | |
| } | |
| open my $fh, '<', 'blog-get.html'; | |
| while ( defined ( my $line = <$fh> ) ) { | |
| if ( $line =~ m{(http://blogs.perl.org/[^"]+\.html)} ) { | |
| 'Local::App'->new_page( $1 )->process; | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment