-
-
Save andyjones/6681804 to your computer and use it in GitHub Desktop.
Given a string, print out the codepoints that it currently consists of
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Given a string, show the codepoints that it currently consists of. If
# you pass it a byte string, you will get the bytes. If you pass it a character
# string, you will get the characters. This can be helpful when you're not sure
# whether your terminal is playing around with the output.
sub explain {
    # explain($string)
    #
    # Returns a copy of $string in which every element whose ordinal lies
    # outside printable ASCII (32..126) is replaced by a literal \x{HEX}
    # escape. Elements are whatever split // yields: bytes for a byte
    # string, characters for a character string.
    #
    # Always returns a defined string; an empty or undefined argument
    # yields '' so the result can be concatenated without warnings.
    my ($string) = @_;

    return '' unless defined $string;

    # join over an empty list gives '', so no special case is needed for
    # the empty string.
    return join '', map {
        my $ord = ord $_;

        # Outside the printable range: emit the hex ordinal. Otherwise
        # keep the literal character.
        ( $ord < 32 || $ord > 126 ) ? sprintf( '\x{%x}', $ord ) : $_;
    } split //, $string;
}
# Example | |
use Unicode::Normalize; | |
# Walk one string through decode -> decompose -> encode and dump it with
# explain() after each step. The trailing comments show the expected output,
# assuming this file is saved as UTF-8 and no `use utf8` is in effect, so the
# literal starts out as raw bytes.
my $brel = "Qui s'enfuit déjà";       # byte string
printf "%s\n", explain($brel);        # Qui s'enfuit d\x{c3}\x{a9}j\x{c3}\x{a0}

utf8::decode($brel);                  # in place: byte string -> character string
printf "%s\n", explain($brel);        # Qui s'enfuit d\x{e9}j\x{e0}

$brel = NFD($brel);                   # split accented letters into base + combining mark
printf "%s\n", explain($brel);        # Qui s'enfuit de\x{301}ja\x{300}

utf8::encode($brel);                  # in place: character string -> byte string
printf "%s\n", explain($brel);        # Qui s'enfuit de\x{cc}\x{81}ja\x{cc}\x{80}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment