Created
November 23, 2010 17:52
-
-
Save jsanti/712183 to your computer and use it in GitHub Desktop.
Repairing broken documents that mix UTF-8 and ISO-8859-1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
#
# Repairing broken documents that mix UTF-8 and ISO-8859-1
# http://plasmasturm.org/log/416/
#
# Usage: this script [FILE ...]
# Reads the named files (or STDIN when no file is given, or for "-") as raw
# octets and writes the repaired text to STDOUT as UTF-8.
#
use strict;
use warnings;

use Encode qw( decode FB_QUIET );

# repair_octets($octets)
#
# Takes a string of raw octets in which valid UTF-8 sequences and stray
# ISO-8859-1 bytes are mixed, and returns the corresponding character string.
# The argument is copied, so the caller's string is left untouched.
# Returns the empty string for undefined or empty input.
sub repair_octets {
    my ($octets) = @_;
    return '' unless defined $octets;

    my $text = '';
    while ( length $octets ) {
        # Consume input up to the first UTF-8 decode error. With FB_QUIET,
        # decode() stops quietly at the offending octet and leaves the
        # unprocessed remainder in $octets.
        $text .= decode( 'utf-8', $octets, FB_QUIET );

        # Anything left starts with an octet that is not valid UTF-8 here.
        # Remove exactly that one octet and read it as Latin-1, where every
        # octet value is a valid character, so the loop always advances.
        $text .= decode( 'iso-8859-1', substr( $octets, 0, 1, '' ) )
            if length $octets;
    }
    return $text;
}

# main()
#
# Filters every input line by line through repair_octets(). Each input
# handle is switched to ':bytes' individually, so files named on the command
# line are read as octets too, not just STDIN.
sub main {
    binmode STDOUT, ':encoding(UTF-8)';

    my @inputs = @ARGV ? @ARGV : ('-');

    INPUT:
    for my $path (@inputs) {
        my $in;
        if ( $path eq '-' ) {
            $in = \*STDIN;
        }
        elsif ( !open( $in, '<', $path ) ) {
            # Report the failure and carry on with the remaining inputs.
            warn "Can't open $path: $!\n";
            next INPUT;
        }
        binmode $in, ':bytes';

        while ( my $line = <$in> ) {
            print repair_octets($line);
        }

        close $in unless $path eq '-';
    }
    return;
}

# Run as a program only when executed directly, so the file can also be
# loaded (e.g. by a test) without reading any input.
main() unless caller;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment