Created
April 8, 2011 22:00
-
-
Save osfameron/910818 to your computer and use it in GitHub Desktop.
workaround for HTML::Strip with utf8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use strict; use warnings; | |
use HTML::Strip; | |
use Devel::Peek; | |
use Test::More tests => 3; | |
use Encode; | |
use utf8; | |
=head1 Workaround for HTML::Strip with utf8

As discussed with Zefram and ilmari on #london.pm, thanks!

L<HTML::Strip> doesn't handle utf8 properly, as it's XS and probably not
written to work on characters, only bytes.

By default the parse method, when given unicode, returns a bytestring with no
unicode markings.

A naive way to handle this would be to simply decode_utf8. This works for
utf8 strings... but not for extended latin1.

A better workaround, suggested by Zefram, is to encode and downgrade first,
then decode after.

NB: this is just a workaround. Better solutions would be to a) fix HTML::Strip
or b) use HTML::Parser instead.

=cut
# Sample inputs for the round-trip test. Each record pairs a label (used as
# the test name) with the text that should survive having markup stripped.
my @strings = (
    {
        type   => 'ascii',
        string => 'test',
    },
    {
        type   => 'unicode',
        string => "\x{2603}",    # snowman
    },
    {
        type   => 'latin1',
        string => "L\x{e9}on",
    },
);

# A single parser instance, shared by all of the parse_* helpers.
my $hs = HTML::Strip->new();

for my $record (@strings) {
    my $string = $record->{string};
    my $html   = $string . "<br>";    # some sample html to strip

    # my $stripped = parse_simple( $html );   # fails the unicode test
    # my $stripped = parse_unicodey( $html ); # fails the latin1 test
    my $stripped = parse_workaround( $html );

    # Test::More's is() takes ( got, expected, name ). Passing the stripped
    # output first keeps the "got"/"expected" labels in a failure report
    # pointing at the right values.
    is( $stripped, $string, $record->{type} );
        # or do { Dump($string); Dump($stripped) };
}
# Strip markup by handing the input straight to the shared HTML::Strip
# parser, with no encoding before the call and no decoding after it.
#
#   $html - text containing markup
#
# Returns whatever $hs->parse produced, unmodified.
sub parse_simple {
    my ($html) = @_;

    my $text = $hs->parse($html);
    $hs->eof;    # always called after parse, before $hs is used again

    return $text;
}
# Strip markup with the shared HTML::Strip parser and then run the parser's
# output through decode_utf8. The input is passed to the parser as-is.
#
#   $html - text containing markup
#
# Returns the decoded result of $hs->parse.
sub parse_unicodey {
    my ($html) = @_;

    my $raw = $hs->parse($html);
    $hs->eof;    # always called after parse, before $hs is used again

    my $decoded = decode_utf8($raw);
    return $decoded;
}
# Strip markup in three steps: encode the input to UTF-8 octets and
# downgrade them, parse those octets with the shared HTML::Strip parser,
# then decode the parser's output from UTF-8.
#
#   $html - text containing markup
#
# Returns the decoded result of parsing the encoded input.
sub parse_workaround {
    my ($html) = @_;

    # Encode first, then downgrade, so the parser is handed plain octets.
    my $bytes = encode_utf8($html);
    utf8::downgrade($bytes);

    my $plain = $hs->parse($bytes);
    $hs->eof;    # always called after parse, before $hs is used again

    # Undo the encoding step on the way out.
    return decode_utf8($plain);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment