Skip to content

Instantly share code, notes, and snippets.

@chansen
Created June 9, 2013 22:58
Show Gist options
  • Save chansen/5745618 to your computer and use it in GitHub Desktop.
Save chansen/5745618 to your computer and use it in GitHub Desktop.
package unicode;
use strict;
use warnings;
use warnings::register;
use Carp qw[croak cluck];
use Encode qw[];
# from perl.h verified in 5.6.2, 5.8.9, 5.10.1 and blead (2010-02-27)
sub HINT_BLOCK_SCOPE () { 0x00000100 }
sub HINT_LOCALIZE_HH () { 0x00020000 }
# PL_hints (U32)
# Perl 5.6.2 and 5.8.9:
# HINT_UTF8 0x00800000
# Perl 5.10.1 and blead (2010-02-27):
# HINT_NO_AMAGIC 0x01000000
# hijack the remaining (hopefully unused) 7 bits for our purposes.
# I rather not use the hint hash for performance reasons since we
# need fast traversal at runtime.
sub HINT_UNICODE_MASK () { 0xFE000000 }
our @CARP_NOT = ( __PACKAGE__ );
my $DefaultEncoding = 0; # default @Encoding index (0 is UTF-8)
my @Encoding; # Encode::Encoding.pm instances
my %EncodingMap; # name/alias to @Encoding index
my $WarningsOffset = $warnings::Offsets{unicode};
BEGIN {
push @Encoding, Encode::find_encoding('UTF-8');
@EncodingMap{qw(UTF-8 utf-8 utf8)} = (0) x 3;
}
sub import {
shift;
warnings->import(FATAL => 'unicode')
if $^H & HINT_BLOCK_SCOPE;
if (@_) {
my ($name) = @_;
unless (exists $EncodingMap{$name}) {
my $resolved = Encode::resolve_alias($name);
(defined $resolved)
|| croak(qq/unicode: unknown encoding '$name'/);
unless (exists $EncodingMap{$resolved}) {
($#Encoding < 0x7F)
|| croak(q/unicode: reached maximum (0x7F) concurrent compile-time encodings/);
push @Encoding, Encode::find_encoding($resolved);
$EncodingMap{$resolved} = $#Encoding;
}
$EncodingMap{$name} = $EncodingMap{$resolved};
}
my $encoding = $EncodingMap{$name};
$^H &= ~HINT_UNICODE_MASK;
$^H |= $encoding << 25;
}
elsif ($^H & HINT_UNICODE_MASK) {
my $encoding = ($^H & HINT_UNICODE_MASK) >> 25;
unless (exists $Encoding[$encoding]) {
cluck(sprintf q/unicode: hints (0x%.2X) set to an unknown encoding (0x%.2X)/, $^H, $encoding);
$^H &= ~HINT_UNICODE_MASK;
}
}
}
sub unimport {
$^H &= ~HINT_UNICODE_MASK;
}
my $get_encoding = sub {
my ($name) = @_;
my $encoding;
if (exists $EncodingMap{$name}) {
$encoding = $Encoding[$EncodingMap{$name}];
}
else {
$encoding = Encode::find_encoding($name);
}
(defined $encoding)
|| croak(qq/unicode: unknown encoding '$name'/);
return $encoding;
};
my $get_scope_encoding = sub {
my $encoding = $DefaultEncoding;
my $frame = 1;
while (my ($hints) = (caller($frame++))[8]) {
if ($hints & HINT_UNICODE_MASK) {
$encoding = ($hints & HINT_UNICODE_MASK) >> 25;
last;
}
}
(exists $Encoding[$encoding])
|| croak(qq/unicode: panic: no encoding at index '$encoding'/);
return $Encoding[$encoding];
};
my $get_scope_encoding_check = sub {
my $wbits = (caller(1))[9];
my $check = Encode::LEAVE_SRC;
if (vec($wbits, $WarningsOffset, 1)) { # unicode.pm warnings ?
my $is_fatal = vec($wbits, $WarningsOffset + 1, 1);
$check |= ($is_fatal ? Encode::DIE_ON_ERR : Encode::WARN_ON_ERR);
}
return $check;
};
sub decoded {
@_ == 1 || croak(q/Usage: unicode::decoded(string)/);
return (defined $_[0] && (&utf8::is_utf8 || $_[0] !~ /[^\x00-\x7F]/));
}
sub string {
@_ == 1 || croak(q/Usage: unicode::string(string)/);
return !!0 unless &decoded;
# prevent runtime warning (exception on $] < 5.011):
# $] >= 5.011: Unicode non-character 0x%x is illegal for interchange
# $] < 5.011: Malformed UTF-8 character (fatal)
no warnings 'utf8';
return $_[0] !~ / [^\x{00}-\x{10FFFF}] # perls UTF-X encoding supports 2**64
| \p{Surrogate}
| \p{Noncharacter_Code_Point}
/x;
}
*valid = \&string;
sub encoding {
@_ == 0 || croak(q/Usage: unicode::encoding()/);
return &$get_scope_encoding->mime_name;
}
sub decode {
@_ == 1 || @_ == 2 || croak(q/Usage: unicode::decode(octets [, encoding])/);
my $encoding = @_ == 1 ? &$get_scope_encoding : $get_encoding->($_[1]);
$encoding->decode($_[0], &$get_scope_encoding_check);
}
sub encode {
@_ == 1 || @_ == 2 || croak(q/Usage: unicode::encode(string [, encoding])/);
my $encoding = @_ == 1 ? &$get_scope_encoding : $get_encoding->($_[1]);
$encoding->encode($_[0], &$get_scope_encoding_check);
}
1;

NAME

unicode - Perl pragma ..

SYNOPSIS

use unicode 'UTF-8';
use warnings FATAL => 'unicode'; # recommended

my $octets = "\xE2\x98\xBA"; # U+263A WHITE SMILING FACE
my $string = unicode::decode($octets);

say unicode::encoding; # UTF-8

# Lexically scoped encoding
{
    use unicode 'UTF-16';
    say unicode::encoding; # UTF-16
    $octets = unicode::encode($string);
}

# Dynamically scoped encoding
{
    package Foo;
    use unicode;
    sub process {
        my ($string) = @_;
        say "encoding string to " . unicode::encoding;
        return unicode::encode($string);
    }

    package Bar;
    use unicode 'UTF-32';
    sub process {
        my ($string) = @_;
        return Foo->process($string);
    }
}

$octets = Foo->process($string); # UTF-8
$octets = Bar->process($string); # UTF-32

# Explicit encoding
$octets = unicode::encode($string, 'UTF-16');
$string = unicode::decode($octets, 'UTF-16');

# Well-formed?
say "String is a well-formed Unicode string"
  if unicode::valid($string);

say unicode::encode($string);

DESCRIPTION

FUNCTIONS

unicode::decode($octets [, $encoding = SCOPE | 'UTF-8'])

Returns an decoded representation of $octets in $encoding as a character string.

unicode::encode($string [, $encoding = SCOPE | 'UTF-8'])

Returns an encoded representation of $string in $encoding as an octet string.

unicode::encoding([ $encoding = SCOPE | 'UTF-8'])

Returns the canonical encoding name.

unicode::normalize($string [, $form = 'NFC'])

Returns a normalized representation of $string in Unicode normalization $form as a character string.

Valid normalization forms are NFC, NFD, NFKD and NFKC.

unicode::valid($string)

Determine whether or not the supplied $string is a well-formed Unicode string.

A well-formed Unicode string consist of the values U+0000..U+D7FF and U+E000..U+10FFFF excluding the noncharacter values U+nFFFE and U+nFFFF (where n is from 0 to 10^16) and the values U+FDD0..U+FDEF.

ENCODINGS

UTF-8
UTF-16
UTF-16LE
UTF-16BE
UTF-32
UTF-32LE
UTF-32BE
UCS-2
UCS-2BE
UCS-2LE

DIAGNOSTICS

Can't decode %s of type %s

(W unicode)

Can't decode a wide character string

(W unicode)

Can't decode ill-formed %s octet sequence <%s>

(W unicode)

Can't decode incomplete %s code unit <%s>

(W unicode)

Can't encode %s of type %s

(W unicode)

Can't interpret noncharacter code point U+%.4X as an abstract character

(W unicode)

Can't map code point U+%.4X to %s encoding

(W unicode) Code point U+%.4X can't be represented in %s encoding codespace.

Can't map surrogate code point U+%.4X to %s encoding

(W unicode) Surrogate code points are designated only for surrogate code units in the UTF-16 character encoding form. Surrogates consist of code points in the range U+D800 to U+DFFF.

Can't map noncharacter code point U+%.4X to %s encoding

(W unicode) Noncharacters is permanently reserved for internal use and that should never be interchanged. Noncharacters consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 10^16) and the values U+FDD0..U+FDEF.

Can't map restricted code point U-%.8X to %s encoding

(W unicode) Code points in the range U-00110000 to U-7FFFFFFF.

JTC1/SC2/WG2 N 2175 Proposal to restrict the range of code positions to the values up to U-0010FFFF JTC1/SC2/WG2 N 2225 RESOLUTION M38.6 (Restriction of encoding space)

Can't map extended code point %.8X to %s encoding

(W unicode) Code points in the range 2^31 to 2^64-1.

Unknown encoding '%s'

(F)

Unknown Unicode normalization form '%s'

(F)

Usage: unicode::%s

(F) Subroutine %s was called with invalid number of arguments.

Use of uninitialized value %s

(W uninitialized) Please see perldiag.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment