Created
August 30, 2015 16:53
-
-
Save briandfoy/9caa576adc8e15e48b13 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/Users/brian/bin/perls/perl5.22.0 | |
use open qw(:std :utf8); | |
use v5.10; | |
use Mojo::UserAgent; | |
use Devel::Peek qw(Dump); | |
use Encode qw(decode find_encoding); | |
use HTML::HeadParser; | |
my $ua = Mojo::UserAgent->new; | |
# The problem URL | |
my $url = 'http://blogs.perl.org/users/patch/2015/07/noirin-plunkett.html'; | |
my $tx = $ua->get( $url ); | |
my $res = $tx->res; | |
# blogs.perl.org doesn't return an encoding in the HTTP header | |
my $type_in_header = $res->headers->header('content-type'); | |
say "Got type [$type_in_header] from header"; | |
my $charset_in_header; | |
if( ( $charset_in_header ) = $type_in_header =~ /;\s+charset=(\S+)/ ) { | |
say "Charset from header is $charset_in_header"; | |
} | |
my $ct = $res->content->charset; | |
say "Charset from method before meta inspection is [$ct]"; | |
# so let's look in the <head> | |
process_meta( $tx ); | |
$ct = $res->content->charset; | |
say "Charset from method after meta inspection is [$ct]"; | |
# create an encoding object from the first defined value | |
my $encoding = find_encoding( $ct ); | |
die "Could not discover encoding\n" unless $encoding; | |
# Now, take the octets from the raw response and decode them into | |
# its Perl string form. ->body is the raw octets. | |
my $decoded_body = $encoding->decode( $res->body ); | |
# Now it should be okay inside Perl | |
# Note that ->text assumes UTF-8 unless there's something else | |
say $res->text; | |
sub process_meta { | |
my( $tx ) = @_; | |
my $headers = $tx->res->headers; | |
my( %meta ) = | |
$tx | |
->res | |
->dom | |
->find( 'head meta[http-equiv]' ) | |
->map( sub { | |
$headers->header( $_->{'http-equiv'}, $_->{'content'} ); | |
} ); | |
$tx; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment