Skip to content

Instantly share code, notes, and snippets.

@rodneyrehm
Created April 8, 2011 11:52
Show Gist options
  • Save rodneyrehm/909692 to your computer and use it in GitHub Desktop.
Save rodneyrehm/909692 to your computer and use it in GitHub Desktop.
Reduce (UTF-8) strings to alphanumeric
<?php
/**
 * Normalize a string to only contain alphanumeric characters, underscores and dashes.
 *
 * Each character of the input is handled by the first rule that matches:
 *  1. ASCII letters, digits, "_" and "-" are kept as they are.
 *  2. Characters listed in $allow are kept as they are.
 *  3. Characters listed in $replace or in the built-in map (German umlauts and
 *     sharp s, e.g. "ä" => "ae", "ß" => "ss") are replaced by the mapped string.
 *  4. Other accented Latin letters are reduced to their lowercase ASCII base
 *     letter (e.g. "é" => "e", "Ñ" => "n", "ç" => "c").
 *  5. Everything else, whitespace included, is replaced by - (dash).
 *
 * @note mb_internal_encoding() must be set to whatever encoding $string had originally
 * @param string $string String to normalize
 * @param boolean $trim Trim string to not start/end with a dash and not contain dash sequences
 * @param array $allow List of Characters that are to be ignored while urlifying array( unicode => true )
 * @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" );
 *                       entries take precedence over the built-in map for the same code point
 * @return string normalized string
 * @throws InvalidArgumentException if $string is not a string
 * @author Christian Kruse <[email protected]>
 * @author Rodney Rehm <[email protected]>
 */
function urlify($string, $trim=true, $allow=null, $replace=null)
{
    if (!is_string($string)) {
        throw new InvalidArgumentException('$string must be a string');
    }

    // built-in multi-letter transliterations, keyed by Unicode code point
    $_replace = array(
        0xE4 => 'ae',
        0xC4 => 'Ae',
        0xF6 => 'oe',
        0xD6 => 'Oe',
        0xFC => 'ue',
        0xDC => 'Ue',
        0xDF => 'ss',
    );
    if ($replace) {
        // Array union, not array_merge(): array_merge() renumbers integer keys
        // from 0, which would throw away the code points the map is keyed by.
        // The left operand wins, so caller entries override the defaults.
        $_replace = $replace + $_replace;
    }

    $encoding = mb_internal_encoding();
    // UTF-32BE is fixed width, so unpacking 32-bit big-endian integers yields
    // exactly one Unicode code point per character.
    $unicodes = unpack('N*', mb_convert_encoding($string, 'UTF-32BE', $encoding));

    $res = '';
    foreach ($unicodes as $code) {
        if (($code >= 97 && $code <= 122) || ($code >= 65 && $code <= 90) || ($code >= 48 && $code <= 57) || $code == 95 || $code == 45) {
            // skip normalization for alphanumeric characters [a-zA-Z0-9_-]
            $res .= mb_convert_encoding(pack('N', $code), $encoding, 'UTF-32BE');
        } elseif ($allow && isset($allow[$code])) {
            // skip normalization for allowed characters
            $res .= mb_convert_encoding(pack('N', $code), $encoding, 'UTF-32BE');
        } elseif (isset($_replace[$code])) {
            // replace as defined
            $res .= $_replace[$code];
        } elseif (($code >= 0xC0 && $code <= 0xC6) || ($code >= 0xE0 && $code <= 0xE6) || ($code >= 0x100 && $code <= 0x105)) {
            $res .= 'a';
        } elseif ($code == 0xC7 || $code == 0xE7 || ($code >= 0x106 && $code <= 0x10D)) {
            // C with cedilla / acute / circumflex / dot / caron
            $res .= 'c';
        } elseif ($code == 0xD0 || ($code >= 0x10E && $code <= 0x111)) {
            $res .= 'd';
        } elseif (($code >= 0xC8 && $code <= 0xCB) || ($code >= 0xE8 && $code <= 0xEB) || ($code >= 0x112 && $code <= 0x11B)) {
            $res .= 'e';
        } elseif (($code >= 0xCC && $code <= 0xCF) || ($code >= 0xEC && $code <= 0xEF)) {
            $res .= 'i';
        } elseif ($code == 0xD1 || $code == 0xF1) {
            $res .= 'n';
        } elseif (($code >= 0xD2 && $code <= 0xD8 && $code != 0xD7) || ($code >= 0xF2 && $code <= 0xF8 && $code != 0xF7)) {
            // 0xD7 (multiplication sign) and 0xF7 (division sign) sit inside
            // these ranges but are not letters; they fall through to the dash
            $res .= 'o';
        } elseif (($code >= 0xD9 && $code <= 0xDB) || ($code >= 0xF9 && $code <= 0xFB)) {
            $res .= 'u';
        } elseif ($code == 0xDD || $code == 0xFD || $code == 0xFF) {
            $res .= 'y';
        } else {
            // whitespace, punctuation and every character without a mapping
            $res .= '-';
        }
    }

    if (!$trim) {
        return $res;
    }

    // collapse dash sequences, then strip leading/trailing dashes
    $res = preg_replace('#-{2,}#', '-', $res);
    return trim($res, '-');
}
<?php
// Smoke test for urlify(): prints one line per case showing the input, the
// actual result, OK/FAILED and a hex dump of the result.
// require_once stops right here with a clear error if the file is missing,
// instead of failing later on an undefined function.
require_once dirname(__FILE__) . '/urlify.php';
// urlify() decodes its input using the mbstring internal encoding
mb_internal_encoding('UTF-8');

// input => expected result
$test = array(
    'hällö wörld' => 'haelloe-woerld',
    'hällö wörldß' => 'haelloe-woerldss',
    // NOTE(review): this key looks identical to the previous one, in which case
    // PHP keeps a single entry - confirm whether a different input was intended
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörld ' => 'haelloe-woerld',
    'hällö wörld %' => 'haelloe-woerld',
    'héllò peôplë ÑO?' => 'hello-people-nO',
);

echo '<pre>';
foreach ($test as $o => $r) {
    $_r = urlify($o);
    // strict comparison: == would treat numeric-looking strings such as
    // "1e3" and "1000" as equal and hide a failure
    echo $o .' - '. $_r .' --- '. ($r === $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
}
echo '</pre>';
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment