Last active
December 25, 2015 08:59
-
-
Save jonnybarnes/6951138 to your computer and use it in GitHub Desktop.
UTF-8 hex dumper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/***
UTF-8 byte classes, written as binary range -- hex range.

Single byte character range 00000000 - 01111111 -- 00 - 7F
Double byte character range 11000000 - 11011111 -- C0 - DF
Triple byte character range 11100000 - 11101111 -- E0 - EF
Quadru byte character range 11110000 - 11110111 -- F0 - F7
Traili byte character range 10000000 - 10111111 -- 80 - BF

The dump below additionally flags C0, C1 and F5 and above as invalid.
***/

// Nothing below can work without the autoloader, so fail hard if it is missing.
require 'vendor/autoload.php';

\Patchwork\Utf8\Bootup::initAll(); // Enables the portablity layer and configures PHP for UTF-8
\Patchwork\Utf8\Bootup::filterRequestUri(); // Redirects to an UTF-8 encoded URL if it's not already the case
\Patchwork\Utf8\Bootup::filterRequestInputs(); // Normalizes HTTP inputs to UTF-8 NFC

header("Content-Type: text/html; charset=utf-8");

// Upper bound, in bytes, on the submitted text that will be dumped.
define('POST_MAX_LENGTH', 4096);

// Resolve the text to dump. Fall back to the sample string when nothing was
// posted, or when the posted value is not a string (e.g. "txt[]=x" arrives as
// an array and must never reach strlen()/substr()).
if (!isset($_POST['txt']) || !is_string($_POST['txt'])) {
    $txt = "Iñtërnâtiônàlizætiøn";
} elseif (strlen($_POST['txt']) > POST_MAX_LENGTH) {
    // Byte-wise truncation: the cut may land inside a multi-byte sequence,
    // which then shows up in the dump as an incomplete sequence.
    $txt = substr($_POST['txt'], 0, POST_MAX_LENGTH);
} else {
    $txt = $_POST['txt'];
}

// Column labels "0".."F" used for the table header.
$hex = array_map('strtoupper', array_map('dechex', range(0, 15)));
?><!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>Hex Dump Text</title>
    <style>
        .plain {
            font-family: monospace;
        }
        .space {
            color: grey;
        }
        .one-byte {
            color: black;
        }
        .two-byte {
            color: green;
        }
        .three-byte {
            color: blue;
        }
        .four-byte {
            color: purple;
        }
        .trailing {
            color: grey;
        }
        .invalid {
            color: red;
        }
    </style>
</head>
<body>
    <h1>UTF-8 hex inspector</h1>
    <p>This page allows you to look at the actual byte-values of a
    <a href="https://en.wikipedia.org/wiki/UTF-8">UTF-8</a> encoded string. UTF-8 is hard.
    Well, technically speaking, UTF-8 is quite easy. Properly implementing Unicode is hard.
    There are various security concerns when implementing UTF-8. One needs to be sure that it is
    <em>well-formed</em>. A string may not be well-formed UTF-8 for reasons such as containing
    an invalid byte value. I employ a PHP library called
    <a href="https://github.com/nicolas-grekas/Patchwork-UTF8">Patchwork-UTF8</a> to help with this.
    The library checks if the input in the <code>textarea</code> is valid, well-formed UTF-8. If not
    the string is assumed to be CP-1252 and converted to UTF-8.</p>
<?php
// The form posts back to this same page. The text is HTML-escaped before being
// echoed into the textarea so that submitted markup (e.g. "</textarea><script>")
// is displayed rather than interpreted. The leading "\n" is swallowed by the
// HTML parser, which preserves a newline at the very start of the text itself.
?>
    <form method="post">
        <textarea maxlength="<?=POST_MAX_LENGTH?>" rows="10" cols="50" name="txt"><?="\n".htmlspecialchars($txt, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8')?></textarea>
        <input type="submit" value="Hex It!">
    </form>
    <table id="hex">
        <thead>
            <tr>
                <th></th>
<?php foreach($hex as $i) { echo " <th>+{$i}</th>\n"; } ?>
                <th class="plain"><?=implode('', $hex)?></th>
            </tr>
        </thead>
        <tbody>
<?php
/*
 * Render the hex dump of $txt: one table row per 16 bytes.
 *
 * Each row holds the row's starting offset, one cell per byte (two upper-case
 * hex digits, coloured through a CSS class naming the byte's UTF-8 role) and a
 * final "plain" cell showing the printable ASCII rendering of the row.
 *
 * Lead bytes carry a title attribute holding the whole sequence they start.
 * The sequence is sliced straight out of $txt by absolute position, so it may
 * run past the end of the current row and is simply shorter when the text ends
 * mid-sequence.
 */
$strlen = strlen($txt);
for ($offset = 0; $offset < $strlen; $offset += 0x10) {
    // The loop is driven by the offset alone; testing the substr() result for
    // truthiness would stop early on a final row consisting of just "0".
    $line = substr($txt, $offset, 0x10);
    $linelen = strlen($line);

    echo " <tr>\n <th>0x" . sprintf('%04X', $offset) . "</th>\n ";

    $hexpart = '';
    $plainpart = '';
    for ($i = 0; $i < $linelen; $i++) {
        $byte = $line[$i];
        $ascii = ord($byte);
        $class = '';

        if ($ascii < 0x20 || $ascii >= 0x7F) {
            // Control characters (including DEL) and every non-ASCII byte are
            // shown as a dot in the plain column.
            $char = '<span class="non-printable">.</span>';

            if ($ascii >= 0xF5 || $ascii == 0xC0 || $ascii == 0xC1) {
                // Byte values that never occur in well-formed UTF-8.
                $class = ' class="invalid"';
            } elseif ($ascii >= 0xC0) {
                // Lead byte: its value fixes the length of the sequence.
                if ($ascii >= 0xF0) {
                    $seqlen = 4;
                    $kind = 'four-byte';
                } elseif ($ascii >= 0xE0) {
                    $seqlen = 3;
                    $kind = 'three-byte';
                } else {
                    $seqlen = 2;
                    $kind = 'two-byte';
                }
                $title = substr($txt, $offset + $i, $seqlen);
                // ENT_SUBSTITUTE keeps a malformed or truncated sequence from
                // turning the whole title into an empty string.
                $class = ' class="' . $kind . '" title="'
                    . htmlspecialchars($title, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . '"';
            } elseif ($ascii >= 0x80) {
                $class = ' class="trailing" title="trailing byte"';
            }
        } elseif ($ascii == 0x20) {
            // A space would be invisible in the plain column; show a grey dash.
            $class = ' class="whitespace" title="space"';
            $char = '<span class="space">-</span>';
        } else {
            // Printable ASCII. Escaped once and used for both the attribute and
            // the text, so that ", <, > and & cannot break the markup.
            $char = htmlspecialchars($byte, ENT_QUOTES, 'UTF-8');
            $class = ' class="one-byte" title="' . $char . '"';
        }

        // On a short final row the last cell is stretched over the unused
        // columns so the plain column stays aligned.
        $colspan = '';
        if ($i == $linelen - 1 && $linelen < 0x10) {
            $colspan = ' colspan="' . (0x11 - $linelen) . '"';
        }

        $hexpart .= "<td{$class}{$colspan}>" . sprintf('%02X', $ascii) . "</td>\n ";
        $plainpart .= $char;
    }

    echo $hexpart . '<td class="plain">' . $plainpart . "</td>\n </tr>\n";
}
?>
        </tbody>
    </table>
    <p>Legend: <span class="four-byte">four-byte</span> <span class="three-byte">three-byte</span> <span class="two-byte">two-byte</span> single-byte <span class="trailing">trailing-byte</span> <span class="invalid">invalid</span></p>
    <p><a href="https://gist.github.com/jonnybarnes/6951138">source code</a></p>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment