Created
August 29, 2015 22:13
-
-
Save briandfoy/60f2dc751720975843af to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/Users/brian/bin/perls/perl5.22.0
# Fetch a page, work out its character encoding from the HTTP
# Content-Type header or, failing that, from the Content-Type that
# HTML::HeadParser extracts from the document's <head>, then decode
# the raw response octets into a Perl character string and print it.
use open qw(:std :utf8);
use v5.10;
use strict;
use warnings;
use HTTP::Tiny;
use Devel::Peek qw(Dump);
use Encode qw(decode find_encoding);
use HTML::HeadParser;

# Return the charset parameter of a Content-Type value, or undef when
# the value is undefined or has no charset parameter.
#
# The match is case-insensitive, allows optional whitespace around
# ';' and '=', and strips optional quotes, so all of
#   text/html; charset=utf-8
#   text/html;charset=UTF-8
#   text/html; Charset="utf-8"
# yield a bare charset name.
sub charset_from_content_type {
    my( $content_type ) = @_;
    return unless defined $content_type;

    my( $charset ) =
        $content_type =~ m/;\s*charset\s*=\s*["']?([^\s"';]+)/i;

    return $charset;
}

my $ht = HTTP::Tiny->new;

# The problem URL
my $url = 'http://blogs.perl.org/users/patch/2015/07/noirin-plunkett.html';

my $response = $ht->get( $url );

# HTTP::Tiny reports transport failures as a normal-looking response
# with success false, so check before treating the content as a page.
die "Could not fetch $url: $response->{status} $response->{reason}\n"
    unless $response->{success};

my $octets = $response->{content} // '';

# blogs.perl.org doesn't return an encoding in the HTTP header
my $type_in_header = $response->{headers}{'content-type'};

# A repeated header arrives as an array reference; use the first value.
$type_in_header = $type_in_header->[0] if ref $type_in_header eq 'ARRAY';
$type_in_header //= '';
say "Got type [$type_in_header] from header";

my $charset_in_header = charset_from_content_type( $type_in_header );
say "Charset from header is $charset_in_header"
    if defined $charset_in_header;

# so let's look in the <head>
# <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
my $p = HTML::HeadParser->new;
$p->parse( $octets );

my $type = $p->header('Content-Type') // '';
say "Got type [$type] from HTML head";

my $charset_in_html = charset_from_content_type( $type );
say "Charset from HTML is $charset_in_html"
    if defined $charset_in_html;

# create an encoding object from the first defined value
my $charset = $charset_in_header // $charset_in_html;
die "Could not discover encoding\n" unless defined $charset;

my $encoding = find_encoding( $charset );
die "Could not discover encoding\n" unless $encoding;

# Now, take the octets from the raw response and decode them into
# its Perl string form
my $string = $encoding->decode( $octets );

# Now it should be okay inside Perl
say $string;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment