kasei · August 5, 2017 23:31 · kasei · Aug 6, 2017
diff --git a/debian-750946-html5-parser-charset-bug.pl b/debian-750946-html5-parser-charset-bug.pl
 #!/usr/bin/env perl

 # Regarding https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=750946
 #
 # There are at least two issues with the code in the bug report.
 #
 # 1. The first looks like a bug in HTML::HTML5::Parser (or its
 #    dependencies), which mis-recognizes the charset of the file being
 #    opened.
 #
 # 2. The code included in the bug report also has a bug of its own:
 #    even with a properly loaded $doc object (as in this case, parsed
 #    from a string literal), calling `print $doc->toString()` won't work
 #    as expected, because it returns a byte string while STDOUT has been
 #    configured to UTF-8 encode all output. If STDOUT remains configured
 #    with the UTF-8 encoding layer, the bytes must be decoded to a
 #    character string before printing to STDOUT.

 use strict;
 use warnings;
 use HTML::HTML5::Parser;
 use Encode qw(encode_utf8 decode_utf8);

 use utf8;                            # for the characters in the script.
 binmode STDOUT, ':encoding(UTF-8)';  # for stdout.

 # The markup is literal text, so a non-interpolating heredoc is used; it is
 # encoded to UTF-8 bytes before being handed to the parser.
 my $parser = HTML::HTML5::Parser->new;
 my $doc    = $parser->parse_string(encode_utf8(<<'END'));
 <?xml version="1.0" encoding="utf-8"?>
 <html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <title>title</title>
  </head>
  <body>
    <p>é↓</p>
  </body>
 </html>
 END

 # Report the charset the parser detected. An undefined result is shown as
 # an empty string, so the output stays the same without tripping an
 # "uninitialized value" warning.
 my $charset = $parser->charset($doc);
 $charset = '' unless defined $charset;
 print "Charset: '", $charset, "'\n";

 # toString() yields bytes; decode them to characters so that the UTF-8
 # layer on STDOUT encodes them exactly once.
 my $bytes = $doc->toString();
 my $str   = decode_utf8($bytes);
 print $str;
	#!/usr/bin/env perl

	# Regarding https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=750946
	#
	# There are at least two issues with the code in the bug report.
	#
	# 1. The first looks like a bug in HTML::HTML5::Parser (or its
	#    dependencies), which mis-recognizes the charset of the file being
	#    opened.
	#
	# 2. The code included in the bug report also has a bug of its own:
	#    even with a properly loaded $doc object (as in this case, parsed
	#    from a string literal), calling `print $doc->toString()` won't work
	#    as expected, because it returns a byte string while STDOUT has been
	#    configured to UTF-8 encode all output. If STDOUT remains configured
	#    with the UTF-8 encoding layer, the bytes must be decoded to a
	#    character string before printing to STDOUT.

	use strict;
	use warnings;
	use HTML::HTML5::Parser;
	use Encode qw(encode_utf8 decode_utf8);

	use utf8; # for the characters in the script.
	binmode STDOUT, ':encoding(UTF-8)'; # for stdout.

	# The markup is literal text, so a non-interpolating heredoc is used; it is
	# encoded to UTF-8 bytes before being handed to the parser.
	my $parser = HTML::HTML5::Parser->new;
	my $doc = $parser->parse_string(encode_utf8(<<'END'));
	<?xml version="1.0" encoding="utf-8"?>
	<html xmlns="http://www.w3.org/1999/xhtml">
	<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	<title>title</title>
	</head>
	<body>
	<p>é↓</p>
	</body>
	</html>
	END

	# Report the charset the parser detected. An undefined result is shown as
	# an empty string, so the output stays the same without tripping an
	# "uninitialized value" warning.
	my $charset = $parser->charset($doc);
	$charset = '' unless defined $charset;
	print "Charset: '", $charset, "'\n";

	# toString() yields bytes; decode them to characters so that the UTF-8
	# layer on STDOUT encodes them exactly once.
	my $bytes = $doc->toString();
	my $str = decode_utf8($bytes);
	print $str;