Created
April 8, 2011 11:52
-
-
Save rodneyrehm/909692 to your computer and use it in GitHub Desktop.
Reduce (UTF-8) strings to alphanumeric
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/* | |
consider decomposing the characters to "capture" more "obscure" characters such as ṩ | |
- http://www.php.net/manual/en/normalizer.normalize.php#92592 | |
*/ | |
/** | |
* Normalize a string to only contain alphanumeric characters and dashes. | |
* | |
* Replace accents by their entities. | |
* Replace everything else by - (dash). | |
* @note mb_internal_charset() must be set to whatever encoding $string had originally | |
* @param string $string String to normalize | |
* @param boolean $trim Trim string to not start/end with a dash and not contain dash sequences | |
* @param array|string $allow List of Characters that are to be ignored while urlifying array( unicode => true ) or string like "$é@" | |
* @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" ) | |
* @return string normalized string | |
* @author Christian Kruse <[email protected]> | |
* @author Rodney Rehm <[email protected]> | |
*/ | |
function urlify($string, $trim=true, $allow=null, $replace=null) | |
{ | |
if (!is_string($string)) { | |
throw new Exception('$string must be a string'); | |
} | |
$_replace = array( | |
0xE4 => "\0\0\0\x61\0\0\0\x65", // ä | |
0xC4 => "\0\0\0\x41\0\0\0\x65", // Ä | |
0xF6 => "\0\0\0\x6F\0\0\0\x65", // ö | |
0xD6 => "\0\0\0\x4F\0\0\0\x65", // Ö | |
0xFC => "\0\0\0\x75\0\0\0\x65", // ü | |
0xDC => "\0\0\0\x55\0\0\0\x65", // Ü | |
0xDF => "\0\0\0\x73\0\0\0\x73", // ß | |
0xE6 => "\0\0\0\x61\0\0\0\x65", // æ | |
0xC6 => "\0\0\0\x41\0\0\0\x65", // Æ | |
); | |
if ($replace && is_array($replace)) { | |
foreach ($replace as $k => $v) { | |
$_replace[$k] = mb_convert_encoding($v, "UTF-32BE"); | |
} | |
} | |
if ($allow && is_string($allow)) { | |
$t = mb_convert_encoding($allow, "UTF-32BE"); | |
$t = unpack("N*", $t); | |
$allow = array(); | |
foreach ($t as $k) { | |
$allow[$k] = true; | |
} | |
} elseif ($allow && !is_array($allow)) { | |
$allow = null; | |
} | |
$res = ''; | |
$string = mb_convert_encoding($string, "UTF-32BE"); | |
$unicodes = unpack("N*", $string); | |
$i = -1; | |
foreach ($unicodes as $code) { | |
$i++; | |
if (($code >= 97 && $code <= 122) || ($code >= 65 && $code <= 90) || ($code >= 48 && $code <= 57) || $code == 95 || $code == 45) { | |
// skip normalization for alphanumeric characters [a-zA-Z0-9_-] | |
$res .= mb_substr($string, $i, 1, "UTF-32BE"); | |
} elseif ($allow && isset($allow[$code])) { | |
// skip normalization for allowed characters | |
$res .= mb_substr($string, $i, 1, "UTF-32BE"); | |
} elseif (isset($_replace[$code])) { | |
// replace as defined | |
$res .= $_replace[$code]; | |
} elseif (($code >= 0xC0 && $code <= 0xC6) || ($code >= 0xE0 && $code <= 0xE6) || ($code >= 0x100 && $code <= 0x105)) { | |
$res .= "\0\0\0\x61"; // a | |
} elseif ($code == 0xC7 || $code == 0xE7 || ($code >= 0x106 && $code <= 0x10D)) { | |
$res .= "\0\0\0\x63"; // c | |
} elseif ($code == 0xD0 || ($code >= 0x10E && $code <= 0x111)) { | |
$res .= "\0\0\0\x64"; // d | |
} elseif (($code >= 0xC8 && $code <= 0xCB) || ($code >= 0xE8 && $code <= 0xEB) || ($code >= 0x112 && $code <= 0x11B)) { | |
$res .= "\0\0\0\x65"; // e | |
} elseif (($code >= 0xCC && $code <= 0xCF) || ($code >= 0xEC && $code <= 0xEF)) { | |
$res .= "\0\0\0\x69"; // i | |
} elseif ($code == 0xD1 || $code == 0xF1) { | |
$res .= "\0\0\0\x6E"; // n | |
} elseif (($code >= 0xD2 && $code <= 0xD8) || ($code >= 0xF2 && $code <= 0xF8)) { | |
$res .= "\0\0\0\x6F"; // o | |
} elseif (($code >= 0xD9 && $code <= 0xDB) || ($code >= 0xF9 && $code <= 0xFB)) { | |
$res .= "\0\0\0\x75"; // u | |
} elseif ($code == 0xDD || $code == 0xFD || $code == 0xFF) { | |
$res .= "\0\0\0\x79"; // y | |
} else { | |
$res .= "\0\0\0\x2D"; // - | |
} | |
} | |
if ($trim) { | |
$res = preg_replace('#(\0\0\0\x2D){2,}#', "\0\0\0\x2D", $res); | |
$res = preg_replace('#(^\0\0\0\x2D)|(\0\0\0\x2D$)#', "", $res); | |
} | |
return mb_convert_encoding($res, mb_internal_encoding(), "UTF-32BE"); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
include dirname(__FILE__) . '/urlify.php'; | |
mb_internal_encoding('UTF-8'); | |
$test = array( | |
'hällö wörld' => 'haelloe-woerld', | |
'hällö wörldß' => 'haelloe-woerldss', | |
'hällö wörldß' => 'haelloe-woerldss', | |
'hällö wörld ' => 'haelloe-woerld', | |
'hällö wörld %' => 'haelloe-woerld', | |
'héllò peôplë ÑO?' => 'hello-people-nO', | |
); | |
echo '<pre>'; | |
foreach ($test as $o => $r){ | |
$_r = urlify($o); | |
echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n"; | |
} | |
echo '</pre>'; | |
$test = array( | |
'hällö wörld' => 'haelloe-woerld', | |
'hällö wörldß' => 'haelloe-woerldss', | |
'hällö wörldß' => 'haelloe-woerldss', | |
'hällö wörld ' => 'haelloe-woerld-', | |
'hällö wörld %' => 'haelloe-woerld--', | |
'héllò peôplë ÑO?' => 'hello-people-nO-', | |
); | |
echo '<pre>'; | |
foreach ($test as $o => $r){ | |
$_r = urlify($o, false); | |
echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n"; | |
} | |
echo '</pre>'; | |
$test = array( | |
'hällö wörld' => 'haelloe-woerld', | |
'hällö wörldß' => 'haelloe-woerldss', | |
'hällö wörldß' => 'haelloe-woerldss', | |
'hällö wörld ' => 'haelloe-woerld', | |
'hällö wörld %' => 'haelloe-woerld', | |
'héllò peôplë ÑO?' => 'hello-people-nO', | |
'héllò peôplë ñO?' => 'hello-people-ñO', | |
); | |
echo '<pre>'; | |
foreach ($test as $o => $r){ | |
$_r = urlify($o, true, array(0xF1 => true)); | |
echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n"; | |
} | |
echo '</pre>'; | |
$test = array( | |
'hällö wörld' => 'haelloe-woerld', | |
'hällö wörldß' => 'haelloe-woerldss', | |
'hällö wörldß' => 'haelloe-woerldss', | |
'hällö wörld ' => 'haelloe-woerld', | |
'hällö wörld %' => 'haelloe-woerld', | |
'héllò peôplë ÑO?' => 'hello-people-nO', | |
'héllò peôplë ñO?' => 'hello-people-XXXO', | |
); | |
echo '<pre>'; | |
foreach ($test as $o => $r){ | |
$_r = urlify($o, true, null, array(0xF1 => 'XXX')); | |
echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n"; | |
} | |
echo '</pre>'; | |
$test = array( | |
'hällö wörld' => 'haelloe-woerld', | |
'hällö wörldß' => 'haelloe-woerldss', | |
'hällö wörldß' => 'haelloe-woerldss', | |
'hällö wörld ' => 'haelloe-woerld', | |
'hällö wörld %' => 'haelloe-woerld', | |
'héllò peôplë ÑO?' => 'hello-people-nO?', | |
'héllò peôplë ñO?' => 'hello-people-ñO?', | |
); | |
echo '<pre>'; | |
foreach ($test as $o => $r){ | |
$_r = urlify($o, true, 'ñ?'); | |
echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n"; | |
} | |
echo '</pre>'; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment