Created
April 8, 2011 11:52
-
-
Save rodneyrehm/909692 to your computer and use it in GitHub Desktop.
Reduce (UTF-8) strings to alphanumeric
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Normalize a string to only contain alphanumeric characters, underscores and dashes.
 *
 * Processing order per character (first match wins):
 *  1. ASCII [a-zA-Z0-9_-] is kept as is.
 *  2. Code points listed in $allow are kept as is.
 *  3. Code points listed in the replacement table are replaced; the built-in
 *     table maps German umlauts and sharp s (ä => ae, ö => oe, ü => ue, ß => ss).
 *  4. Known accented Latin letters are replaced by their lowercase ASCII base letter.
 *  5. Everything else (including whitespace) is replaced by - (dash).
 *
 * @note mb_internal_encoding() must be set to whatever encoding $string had originally
 * @param string $string String to normalize
 * @param boolean $trim Trim string to not start/end with a dash and not contain dash sequences
 * @param array $allow List of Characters that are to be ignored while urlifying array( unicode => true )
 * @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" );
 *                       entries take precedence over the built-in replacements
 * @return string normalized string
 * @throws InvalidArgumentException if $string is not a string
 * @author Christian Kruse <[email protected]>
 * @author Rodney Rehm <[email protected]>
 */
function urlify($string, $trim=true, $allow=null, $replace=null)
{
    if (!is_string($string)) {
        throw new InvalidArgumentException('$string must be a string');
    }
    if ($string === '') {
        // nothing to decode; avoids handing an empty buffer to unpack()
        return '';
    }

    // built-in multi-character replacements, keyed by Unicode code point
    $_replace = array(
        0xE4 => 'ae',
        0xC4 => 'Ae',
        0xF6 => 'oe',
        0xD6 => 'Oe',
        0xFC => 'ue',
        0xDC => 'Ue',
        0xDF => 'ss',
    );
    if ($replace) {
        // The keys are integers (code points): array_merge() would renumber them
        // to 0..n and break every mapping. The union operator preserves keys and
        // lets the caller's entries (left operand) win over the defaults.
        $_replace = $replace + $_replace;
    }

    // inclusive code point ranges array(first, last, ASCII base letter)
    $ranges = array(
        array(0xC0, 0xC6, 'a'), array(0xE0, 0xE6, 'a'), array(0x100, 0x105, 'a'),
        array(0xC7, 0xC7, 'c'), array(0xE7, 0xE7, 'c'), array(0x106, 0x10D, 'c'),
        array(0xD0, 0xD0, 'd'), array(0x10E, 0x111, 'd'),
        array(0xC8, 0xCB, 'e'), array(0xE8, 0xEB, 'e'), array(0x112, 0x11B, 'e'),
        array(0xCC, 0xCF, 'i'), array(0xEC, 0xEF, 'i'),
        array(0xD1, 0xD1, 'n'), array(0xF1, 0xF1, 'n'),
        // 0xD7 (multiplication sign) and 0xF7 (division sign) are deliberately
        // left out: they are not letters and must fall through to the dash
        array(0xD2, 0xD6, 'o'), array(0xD8, 0xD8, 'o'),
        array(0xF2, 0xF6, 'o'), array(0xF8, 0xF8, 'o'),
        array(0xD9, 0xDB, 'u'), array(0xF9, 0xFB, 'u'),
        array(0xDD, 0xDD, 'y'), array(0xFD, 0xFD, 'y'), array(0xFF, 0xFF, 'y'),
    );

    $res = '';
    $encoding = mb_internal_encoding();
    // UTF-32BE stores every code point as one big-endian 32 bit integer,
    // so unpack("N*") yields the list of code points
    $unicodes = unpack("N*", mb_convert_encoding($string, "UTF-32BE", $encoding));

    foreach ($unicodes as $code) {
        $isAsciiWord = ($code >= 97 && $code <= 122)    // a-z
            || ($code >= 65 && $code <= 90)             // A-Z
            || ($code >= 48 && $code <= 57)             // 0-9
            || $code == 95 || $code == 45;              // _ and -

        if ($isAsciiWord || ($allow && isset($allow[$code]))) {
            // keep the character untouched, converted back to the internal encoding
            $res .= mb_convert_encoding(pack("N", $code), $encoding, "UTF-32BE");
            continue;
        }

        if (isset($_replace[$code])) {
            // replace as defined
            $res .= $_replace[$code];
            continue;
        }

        // accented letter => base letter; anything else (whitespace,
        // punctuation, unknown characters) => dash
        $ascii = '-';
        foreach ($ranges as $range) {
            if ($code >= $range[0] && $code <= $range[1]) {
                $ascii = $range[2];
                break;
            }
        }
        $res .= $ascii;
    }

    if (!$trim) {
        return $res;
    }

    // collapse dash sequences and strip leading/trailing dashes
    $res = preg_replace('#-{2,}#', '-', $res);
    return trim($res, '-');
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Manual smoke test for urlify(): prints one line per test case with the
// input, the actual result, OK/FAILED and the hex dump of the result.

// require instead of include: without urlify.php nothing below can run
require dirname(__FILE__) . '/urlify.php';

// urlify() decodes its input using the mbstring internal encoding
mb_internal_encoding('UTF-8');

// input => expected result
$test = array(
    'hällö wörld' => 'haelloe-woerld',
    'hällö wörldß' => 'haelloe-woerldss',
    // NOTE(review): this key looks identical to the previous one, in which case
    // PHP keeps only a single entry. The original may have contained an
    // invisible character (e.g. a non-breaking space) here - confirm.
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörld ' => 'haelloe-woerld',
    'hällö wörld %' => 'haelloe-woerld',
    'héllò peôplë ÑO?' => 'hello-people-nO',
);

echo '<pre>';
foreach ($test as $o => $r) {
    $_r = urlify($o);
    // strict comparison: loose == compares numeric-looking strings by value
    // and could report OK for results that differ
    $status = ($r === $_r) ? 'OK' : 'FAILED';
    echo $o .' - '. $_r .' --- '. $status .' --- '. bin2hex($_r) ."\n";
}
echo '</pre>';
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment