Created
April 8, 2011 11:52
Revisions
-
rodneyrehm revised this gist
Apr 9, 2011 . 1 changed file with 1 addition and 3 deletions.There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -2,9 +2,7 @@ /* consider decomposing the characters to "capture" more "obscure" characters such as ṩ - http://www.php.net/manual/en/normalizer.normalize.php#92592 */ /** -
rodneyrehm revised this gist
Apr 8, 2011 . 1 changed file with 1 addition and 0 deletions.There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -4,6 +4,7 @@ consider decomposing the characters to "capture" more "obscure" characters such as ṩ - http://unicode.org/reports/tr15/ - http://developer.apple.com/library/mac/#technotes/tn/tn1150table.html - https://gist.github.com/42793 */ /** -
rodneyrehm revised this gist
Apr 8, 2011 . 1 changed file with 6 additions and 0 deletions.There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,5 +1,11 @@ <?php /* consider decomposing the characters to "capture" more "obscure" characters such as ṩ - http://unicode.org/reports/tr15/ - http://developer.apple.com/library/mac/#technotes/tn/tn1150table.html */ /** * Normalize a string to only contain alphanumeric characters and dashes. * -
rodneyrehm revised this gist
Apr 8, 2011 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -11,7 +11,7 @@ * @param array|string $allow List of Characters that are to be ignored while urlifying array( unicode => true ) or string like "$é@" * @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" ) * @return string normalized string * @author Christian Kruse <cjk+os@wwwtech.de> * @author Rodney Rehm <rodney.rehm@medialize.de> */ function urlify($string, $trim=true, $allow=null, $replace=null) -
rodneyrehm revised this gist
Apr 8, 2011 . 2 changed files with 57 additions and 32 deletions.There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -8,7 +8,7 @@ * @note mb_internal_charset() must be set to whatever encoding $string had originally * @param string $string String to normalize * @param boolean $trim Trim string to not start/end with a dash and not contain dash sequences * @param array|string $allow List of Characters that are to be ignored while urlifying array( unicode => true ) or string like "$é@" * @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" ) * @return string normalized string * @author Christian Kruse <cjk@wwwtech.de> @@ -19,72 +19,80 @@ function urlify($string, $trim=true, $allow=null, $replace=null) if (!is_string($string)) { throw new Exception('$string must be a string'); } $_replace = array( 0xE4 => "\0\0\0\x61\0\0\0\x65", // ä 0xC4 => "\0\0\0\x41\0\0\0\x65", // Ä 0xF6 => "\0\0\0\x6F\0\0\0\x65", // ö 0xD6 => "\0\0\0\x4F\0\0\0\x65", // Ö 0xFC => "\0\0\0\x75\0\0\0\x65", // ü 0xDC => "\0\0\0\x55\0\0\0\x65", // Ü 0xDF => "\0\0\0\x73\0\0\0\x73", // ß 0xE6 => "\0\0\0\x61\0\0\0\x65", // æ 0xC6 => "\0\0\0\x41\0\0\0\x65", // Æ ); if ($replace && is_array($replace)) { foreach ($replace as $k => $v) { $_replace[$k] = mb_convert_encoding($v, "UTF-32BE"); } } if ($allow && is_string($allow)) { $t = mb_convert_encoding($allow, "UTF-32BE"); $t = unpack("N*", $t); $allow = array(); foreach ($t as $k) { $allow[$k] = true; } } elseif ($allow && !is_array($allow)) { $allow = null; } $res = ''; $string = mb_convert_encoding($string, "UTF-32BE"); $unicodes = unpack("N*", $string); $i = -1; foreach ($unicodes as $code) { $i++; if (($code >= 97 && $code <= 122) || ($code >= 65 && $code <= 90) || ($code >= 48 && $code <= 57) || $code == 95 
|| $code == 45) { // skip normalization for alphanumeric characters [a-zA-Z0-9_-] $res .= mb_substr($string, $i, 1, "UTF-32BE"); } elseif ($allow && isset($allow[$code])) { // skip normalization for allowed characters $res .= mb_substr($string, $i, 1, "UTF-32BE"); } elseif (isset($_replace[$code])) { // replace as defined $res .= $_replace[$code]; } elseif (($code >= 0xC0 && $code <= 0xC6) || ($code >= 0xE0 && $code <= 0xE6) || ($code >= 0x100 && $code <= 0x105)) { $res .= "\0\0\0\x61"; // a } elseif ($code == 0xC7 || $code == 0xE7 || ($code >= 0x106 && $code <= 0x10D)) { $res .= "\0\0\0\x63"; // c } elseif ($code == 0xD0 || ($code >= 0x10E && $code <= 0x111)) { $res .= "\0\0\0\x64"; // d } elseif (($code >= 0xC8 && $code <= 0xCB) || ($code >= 0xE8 && $code <= 0xEB) || ($code >= 0x112 && $code <= 0x11B)) { $res .= "\0\0\0\x65"; // e } elseif (($code >= 0xCC && $code <= 0xCF) || ($code >= 0xEC && $code <= 0xEF)) { $res .= "\0\0\0\x69"; // i } elseif ($code == 0xD1 || $code == 0xF1) { $res .= "\0\0\0\x6E"; // n } elseif (($code >= 0xD2 && $code <= 0xD8) || ($code >= 0xF2 && $code <= 0xF8)) { $res .= "\0\0\0\x6F"; // o } elseif (($code >= 0xD9 && $code <= 0xDB) || ($code >= 0xF9 && $code <= 0xFB)) { $res .= "\0\0\0\x75"; // u } elseif ($code == 0xDD || $code == 0xFD || $code == 0xFF) { $res .= "\0\0\0\x79"; // y } else { $res .= "\0\0\0\x2D"; // - } } if ($trim) { $res = preg_replace('#(\0\0\0\x2D){2,}#', "\0\0\0\x2D", $res); $res = preg_replace('#(^\0\0\0\x2D)|(\0\0\0\x2D$)#', "", $res); } return mb_convert_encoding($res, mb_internal_encoding(), "UTF-32BE"); } This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -70,4 +70,21 @@ $_r = urlify($o, true, null, array(0xF1 => 'XXX')); echo $o .' - '. $_r .' --- '. 
($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n"; } echo '</pre>'; $test = array( 'hällö wörld' => 'haelloe-woerld', 'hällö wörldß' => 'haelloe-woerldss', 'hällö wörldß' => 'haelloe-woerldss', 'hällö wörld ' => 'haelloe-woerld', 'hällö wörld %' => 'haelloe-woerld', 'héllò peôplë ÑO?' => 'hello-people-nO?', 'héllò peôplë ñO?' => 'hello-people-ñO?', ); echo '<pre>'; foreach ($test as $o => $r){ $_r = urlify($o, true, 'ñ?'); echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n"; } echo '</pre>'; -
rodneyrehm revised this gist
Apr 8, 2011 . 2 changed files with 55 additions and 1 deletion.There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -31,7 +31,9 @@ function urlify($string, $trim=true, $allow=null, $replace=null) ); if ($replace) { foreach ($replace as $k => $v) { $_replace[$k] = $v; } } $res = ''; This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -18,4 +18,56 @@ $_r = urlify($o); echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n"; } echo '</pre>'; $test = array( 'hällö wörld' => 'haelloe-woerld', 'hällö wörldß' => 'haelloe-woerldss', 'hällö wörldß' => 'haelloe-woerldss', 'hällö wörld ' => 'haelloe-woerld-', 'hällö wörld %' => 'haelloe-woerld--', 'héllò peôplë ÑO?' => 'hello-people-nO-', ); echo '<pre>'; foreach ($test as $o => $r){ $_r = urlify($o, false); echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n"; } echo '</pre>'; $test = array( 'hällö wörld' => 'haelloe-woerld', 'hällö wörldß' => 'haelloe-woerldss', 'hällö wörldß' => 'haelloe-woerldss', 'hällö wörld ' => 'haelloe-woerld', 'hällö wörld %' => 'haelloe-woerld', 'héllò peôplë ÑO?' => 'hello-people-nO', 'héllò peôplë ñO?' => 'hello-people-ñO', ); echo '<pre>'; foreach ($test as $o => $r){ $_r = urlify($o, true, array(0xF1 => true)); echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. 
bin2hex($_r) ."\n"; } echo '</pre>'; $test = array( 'hällö wörld' => 'haelloe-woerld', 'hällö wörldß' => 'haelloe-woerldss', 'hällö wörldß' => 'haelloe-woerldss', 'hällö wörld ' => 'haelloe-woerld', 'hällö wörld %' => 'haelloe-woerld', 'héllò peôplë ÑO?' => 'hello-people-nO', 'héllò peôplë ñO?' => 'hello-people-XXXO', ); echo '<pre>'; foreach ($test as $o => $r){ $_r = urlify($o, true, null, array(0xF1 => 'XXX')); echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n"; } echo '</pre>'; -
rodneyrehm revised this gist
Apr 8, 2011 . 1 changed file with 10 additions and 9 deletions.There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -5,7 +5,7 @@ * * Replace accents by their entities. * Replace everything else by - (dash). * @note mb_internal_charset() must be set to whatever encoding $string had originally * @param string $string String to normalize * @param boolean $trim Trim string to not start/end with a dash and not contain dash sequences * @param array $allow List of Characters that are to be ignored while urlifying array( unicode => true ) @@ -17,14 +17,9 @@ function urlify($string, $trim=true, $allow=null, $replace=null) { if (!is_string($string)) { throw new Exception('$string must be a string'); } $_replace = array( 0xE4 => 'ae', 0xC4 => 'Ae', @@ -39,16 +34,22 @@ function urlify($string, $trim=true, $allow=null, $replace=null) $_replace = array_merge($_replace, $replace); } $res = ''; $encoding = mb_internal_encoding(); $string = mb_convert_encoding($string, "UTF-32BE"); $unicodes = unpack("N*", $string); $i = -1; foreach ($unicodes as $code) { $i++; $character = mb_substr($string, $i, 1, "UTF-32BE"); if (($code >= 97 && $code <= 122) || ($code >= 65 && $code <= 90) || ($code >= 48 && $code <= 57) || $code == 95 || $code == 45) { // skip normalization for alphanumeric characters [a-zA-Z0-9_-] $res .= mb_convert_encoding($character, $encoding, "UTF-32BE"); } elseif ($allow && isset($allow[$code])) { // skip normalization for allowed characters $res .= mb_convert_encoding($character, $encoding, "UTF-32BE"); } elseif (isset($_replace[$code])) { // replace as defined $res .= $_replace[$code]; -
rodneyrehm revised this gist
Apr 8, 2011 . 1 changed file with 2 additions and 2 deletions.There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -11,8 +11,8 @@ * @param array $allow List of Characters that are to be ignored while urlifying array( unicode => true ) * @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" ) * @return string normalized string * @author Christian Kruse <cjk@wwwtech.de> * @author Rodney Rehm <rodney.rehm@medialize.de> */ function urlify($string, $trim=true, $allow=null, $replace=null) { -
rodneyrehm created this gist
Apr 8, 2011 .There are no files selected for viewing
/**
 * Normalize a string so it only contains ASCII alphanumerics, underscores and dashes.
 *
 * Each character of the input is handled by the first rule that matches:
 *  1. [a-zA-Z0-9_-] is copied unchanged.
 *  2. Characters listed in $allow are copied unchanged.
 *  3. Characters listed in the replacement table (built-in German umlauts / sharp s,
 *     overridden or extended by $replace) are substituted by their replacement string.
 *  4. Accented Latin letters from the Latin-1 Supplement and the first part of
 *     Latin Extended-A are reduced to their lower-case base letter (e.g. "Ñ" becomes "n").
 *  5. Everything else (spaces, punctuation, combining marks, other scripts) becomes a dash.
 *
 * @note The input (and a string $allow) is decoded as UTF-8, and the result is UTF-8.
 *       Replacement strings from $replace are appended verbatim.
 * @param string $string String to normalize (UTF-8)
 * @param boolean $trim Collapse dash sequences and strip leading/trailing dashes
 * @param array|string $allow Characters to keep as they are: array( unicode => true ) or a string like "$é@"
 * @param array $replace Characters to replace: array( unicode => "replacement" ); takes precedence over the built-in table
 * @return string normalized string
 * @throws InvalidArgumentException if $string is not a string
 * @author Christian Kruse
 * @author Rodney Rehm
 */
function urlify($string, $trim=true, $allow=null, $replace=null)
{
    if (!is_string($string)) {
        throw new InvalidArgumentException('$string must be a string');
    }

    if ($string === '') {
        return '';
    }

    // Built-in replacements, keyed by Unicode code point.
    $_replace = array(
        0xE4 => 'ae', // ä
        0xC4 => 'Ae', // Ä
        0xF6 => 'oe', // ö
        0xD6 => 'Oe', // Ö
        0xFC => 'ue', // ü
        0xDC => 'Ue', // Ü
        0xDF => 'ss', // ß
    );

    if ($replace && is_array($replace)) {
        // The union operator keeps the integer keys and lets the caller's entries win.
        // array_merge() must not be used here: it renumbers integer keys from 0,
        // which would destroy the code point => replacement mapping.
        $_replace = $replace + $_replace;
    }

    if ($allow && is_string($allow)) {
        // Turn a string of allowed characters into a code point lookup table.
        $allowed = array();
        foreach (unpack('N*', mb_convert_encoding($allow, 'UTF-32BE', 'UTF-8')) as $code) {
            $allowed[$code] = true;
        }
        $allow = $allowed;
    } elseif (!is_array($allow)) {
        $allow = array();
    }

    // UTF-32BE is fixed width, so unpack() yields exactly one integer per character.
    $unicodes = unpack('N*', mb_convert_encoding($string, 'UTF-32BE', 'UTF-8'));

    $res = '';
    foreach ($unicodes as $code) {
        if (($code >= 97 && $code <= 122) || ($code >= 65 && $code <= 90) || ($code >= 48 && $code <= 57) || $code == 95 || $code == 45) {
            // [a-zA-Z0-9_-] is plain ASCII, one byte in UTF-8
            $res .= chr($code);
        } elseif (isset($allow[$code])) {
            // explicitly allowed character: re-encode the single code point as UTF-8
            $res .= mb_convert_encoding(pack('N', $code), 'UTF-8', 'UTF-32BE');
        } elseif (isset($_replace[$code])) {
            // replace as defined
            $res .= $_replace[$code];
        } elseif (($code >= 0xC0 && $code <= 0xC6) || ($code >= 0xE0 && $code <= 0xE6) || ($code >= 0x100 && $code <= 0x105)) {
            $res .= 'a';
        } elseif ($code == 0xC7 || $code == 0xE7 || ($code >= 0x106 && $code <= 0x10D)) {
            $res .= 'c';
        } elseif ($code == 0xD0 || ($code >= 0x10E && $code <= 0x111)) {
            $res .= 'd';
        } elseif (($code >= 0xC8 && $code <= 0xCB) || ($code >= 0xE8 && $code <= 0xEB) || ($code >= 0x112 && $code <= 0x11B)) {
            $res .= 'e';
        } elseif (($code >= 0xCC && $code <= 0xCF) || ($code >= 0xEC && $code <= 0xEF)) {
            $res .= 'i';
        } elseif ($code == 0xD1 || $code == 0xF1) {
            $res .= 'n';
        } elseif ($code != 0xD7 && $code != 0xF7 && (($code >= 0xD2 && $code <= 0xD8) || ($code >= 0xF2 && $code <= 0xF8))) {
            // U+00D7 (multiplication sign) and U+00F7 (division sign) sit inside
            // these ranges but are not letters, so they fall through to the dash.
            $res .= 'o';
        } elseif (($code >= 0xD9 && $code <= 0xDB) || ($code >= 0xF9 && $code <= 0xFB)) {
            $res .= 'u';
        } elseif ($code == 0xDD || $code == 0xFD || $code == 0xFF) {
            $res .= 'y';
        } else {
            // whitespace, punctuation and anything unknown
            $res .= '-';
        }
    }

    if (!$trim) {
        return $res;
    }

    // collapse runs of dashes, then strip dashes from both ends
    return trim(preg_replace('#-{2,}#', '-', $res), '-');
}

// ---------------------------------------------------------------------------
// Demo / smoke test: prints input, output, OK/FAILED and the output's hex dump.
// ---------------------------------------------------------------------------

// Only pull in urlify.php when the function is not already defined in this unit;
// including it unconditionally would be a fatal "cannot redeclare" error.
if (!function_exists('urlify')) {
    include dirname(__FILE__) . '/urlify.php';
}

mb_internal_encoding('UTF-8');

// input => expected output with default arguments
$test = array(
    'hällö wörld' => 'haelloe-woerld',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörld ' => 'haelloe-woerld',
    'hällö wörld %' => 'haelloe-woerld',
    'héllò peôplë ÑO?' => 'hello-people-nO',
);

echo '<pre>';
foreach ($test as $o => $r) {
    $_r = urlify($o);
    // strict comparison: both sides are strings and must match byte for byte
    echo $o .' - '. $_r .' --- '. ($r === $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
}
echo '</pre>';