rodneyrehm · April 8, 2011 11:52 · Apr 9, 2011 · Apr 8, 2011 · Apr 8, 2011 · Apr 8, 2011
diff --git a/urlify.php b/urlify.php
@@ -2,9 +2,7 @@
 
 /*
   consider decomposing the characters to "capture" more "obscure" characters such as ṩ
-  - http://unicode.org/reports/tr15/
-  - http://developer.apple.com/library/mac/#technotes/tn/tn1150table.html
-  - https://gist.github.com/42793
+    - http://www.php.net/manual/en/normalizer.normalize.php#92592
 */
 
 /**

diff --git a/urlify.php b/urlify.php
@@ -4,6 +4,7 @@
   consider decomposing the characters to "capture" more "obscure" characters such as ṩ
   - http://unicode.org/reports/tr15/
   - http://developer.apple.com/library/mac/#technotes/tn/tn1150table.html
+  - https://gist.github.com/42793
 */
 
 /**

diff --git a/urlify.php b/urlify.php
@@ -1,5 +1,11 @@
 <?php
 
+/*
+  consider decomposing the characters to "capture" more "obscure" characters such as ṩ
+  - http://unicode.org/reports/tr15/
+  - http://developer.apple.com/library/mac/#technotes/tn/tn1150table.html
+*/
+
 /**
  * Normalize a string to only contain alphanumeric characters and dashes.
  *

diff --git a/urlify.php b/urlify.php
@@ -11,7 +11,7 @@
  * @param array|string $allow List of Characters that are to be ignored while urlifying array( unicode => true ) or string like "$é@"
  * @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" )
  * @return string normalized string
- * @author Christian Kruse <cjk@wwwtech.de>
+ * @author Christian Kruse <cjk+os@wwwtech.de>
  * @author Rodney Rehm <rodney.rehm@medialize.de>
  */
 function urlify($string, $trim=true, $allow=null, $replace=null)

diff --git a/urlify.php b/urlify.php
@@ -8,7 +8,7 @@
  * @note mb_internal_charset() must be set to whatever encoding $string had originally
  * @param string $string String to normalize
  * @param boolean $trim Trim string to not start/end with a dash and not contain dash sequences
- * @param array $allow List of Characters that are to be ignored while urlifying array( unicode => true )
+ * @param array|string $allow List of Characters that are to be ignored while urlifying array( unicode => true ) or string like "$é@"
  * @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" )
  * @return string normalized string
  * @author Christian Kruse <cjk@wwwtech.de>
@@ -19,72 +19,80 @@ function urlify($string, $trim=true, $allow=null, $replace=null)
     if (!is_string($string)) {
         throw new Exception('$string must be a string');
     }
-
+    
     $_replace = array(
-        0xE4 => 'ae',
-        0xC4 => 'Ae',
-        0xF6 => 'oe',
-        0xD6 => 'Oe',
-        0xFC => 'ue',
-        0xDC => 'Ue',
-        0xDF => 'ss',
+        0xE4 => "\0\0\0\x61\0\0\0\x65", // ä
+        0xC4 => "\0\0\0\x41\0\0\0\x65", // Ä
+        0xF6 => "\0\0\0\x6F\0\0\0\x65", // ö
+        0xD6 => "\0\0\0\x4F\0\0\0\x65", // Ö
+        0xFC => "\0\0\0\x75\0\0\0\x65", // ü
+        0xDC => "\0\0\0\x55\0\0\0\x65", // Ü
+        0xDF => "\0\0\0\x73\0\0\0\x73", // ß
+        0xE6 => "\0\0\0\x61\0\0\0\x65", // æ
+        0xC6 => "\0\0\0\x41\0\0\0\x65", // Æ
     );
 
-    if ($replace) {
+    if ($replace && is_array($replace)) {
         foreach ($replace as $k => $v) {
-            $_replace[$k] = $v;
+            $_replace[$k] = mb_convert_encoding($v, "UTF-32BE");
+        }
+    }
+
+    if ($allow && is_string($allow)) {
+        $t = mb_convert_encoding($allow, "UTF-32BE");
+        $t = unpack("N*", $t);
+        $allow = array();
+        foreach ($t as $k) {
+            $allow[$k] = true;
         }
+    } elseif ($allow && !is_array($allow)) {
+        $allow = null;
     }
 
     $res = '';
-    $encoding = mb_internal_encoding();
     $string = mb_convert_encoding($string, "UTF-32BE");
     $unicodes = unpack("N*", $string);
     $i = -1;
 
     foreach ($unicodes as $code) {
         $i++;
-        $character = mb_substr($string, $i, 1, "UTF-32BE");
 
         if (($code >= 97 && $code <= 122) || ($code >= 65 && $code <= 90) || ($code >= 48 && $code <= 57) || $code == 95 || $code == 45) {
             // skip normalization for alphanumeric characters [a-zA-Z0-9_-]
-            $res .= mb_convert_encoding($character, $encoding, "UTF-32BE");
+            $res .= mb_substr($string, $i, 1, "UTF-32BE");
         } elseif ($allow && isset($allow[$code])) {
             // skip normalization for allowed characters
-            $res .= mb_convert_encoding($character, $encoding, "UTF-32BE");
+            $res .= mb_substr($string, $i, 1, "UTF-32BE");
         } elseif (isset($_replace[$code])) {
             // replace as defined
             $res .= $_replace[$code];
-        } elseif (ctype_space($character)) {
-            // replace spaces by dash
-            $res .= '-';
         } elseif (($code >= 0xC0 && $code <= 0xC6) || ($code >= 0xE0 && $code <= 0xE6) || ($code >= 0x100 && $code <= 0x105)) {
-            $res .= 'a';
+            $res .= "\0\0\0\x61"; // a
         } elseif ($code == 0xC7 || $code == 0xE7 || ($code >= 0x106 && $code <= 0x10D)) {
-            $res .= 'a';
+            $res .= "\0\0\0\x63"; // c
         } elseif ($code == 0xD0 || ($code >= 0x10E && $code <= 0x111)) {
-            $res .= 'd';
+            $res .= "\0\0\0\x64"; // d
         } elseif (($code >= 0xC8 && $code <= 0xCB) || ($code >= 0xE8 && $code <= 0xEB) || ($code >= 0x112 && $code <= 0x11B)) {
-            $res .= 'e';
+            $res .= "\0\0\0\x65"; // e
         } elseif (($code >= 0xCC && $code <= 0xCF) || ($code >= 0xEC && $code <= 0xEF)) {
-            $res .= 'i';
+            $res .= "\0\0\0\x69"; // i
         } elseif ($code == 0xD1 || $code == 0xF1) {
-            $res .= 'n';
+            $res .= "\0\0\0\x6E"; // n
         } elseif (($code >= 0xD2 && $code <= 0xD8) || ($code >= 0xF2 && $code <= 0xF8)) {
-            $res .= 'o';
+            $res .= "\0\0\0\x6F"; // o
         } elseif (($code >= 0xD9 && $code <= 0xDB) || ($code >= 0xF9 && $code <= 0xFB)) {
-            $res .= 'u';
+            $res .= "\0\0\0\x75"; // u
         } elseif ($code == 0xDD || $code == 0xFD || $code == 0xFF) {
-            $res .= 'y';
+            $res .= "\0\0\0\x79"; // y
         } else {
-            $res .= '-';
+            $res .= "\0\0\0\x2D"; // -
         }
     }
 
-    if (!$trim) {
-        return $res;
+    if ($trim) {
+        $res = preg_replace('#(\0\0\0\x2D){2,}#', "\0\0\0\x2D", $res);
+        $res = preg_replace('#(^\0\0\0\x2D)|(\0\0\0\x2D$)#', "", $res);
     }
 
-    $res = preg_replace('#-{2,}#', '-', $res);
-    return trim($res, '-');
+    return mb_convert_encoding($res, mb_internal_encoding(), "UTF-32BE");
 }
diff --git a/urlify.test.php b/urlify.test.php
@@ -70,4 +70,21 @@
     $_r = urlify($o, true, null, array(0xF1 => 'XXX'));
     echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
 }
+echo '</pre>';
+
+$test = array(
+    'hällö wörld' => 'haelloe-woerld',
+    'hällö wörldß' => 'haelloe-woerldss',
+    'hällö wörldß' => 'haelloe-woerldss',
+    'hällö wörld ' => 'haelloe-woerld',
+    'hällö wörld %' => 'haelloe-woerld',
+    'héllò peôplë ÑO?' => 'hello-people-nO?',
+    'héllò peôplë ñO?' => 'hello-people-ñO?',
+);
+
+echo '<pre>';
+foreach ($test as $o => $r){
+    $_r = urlify($o, true, 'ñ?');
+    echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
+}
 echo '</pre>';
diff --git a/urlify.php b/urlify.php
@@ -31,7 +31,9 @@ function urlify($string, $trim=true, $allow=null, $replace=null)
     );
 
     if ($replace) {
-        $_replace = array_merge($_replace, $replace);
+        foreach ($replace as $k => $v) {
+            $_replace[$k] = $v;
+        }
     }
 
     $res = '';

diff --git a/urlify.test.php b/urlify.test.php
@@ -18,4 +18,56 @@
     $_r = urlify($o);
     echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
 }
+echo '</pre>';
+
+$test = array(
+    'hällö wörld' => 'haelloe-woerld',
+    'hällö wörldß' => 'haelloe-woerldss',
+    'hällö wörldß' => 'haelloe-woerldss',
+    'hällö wörld ' => 'haelloe-woerld-',
+    'hällö wörld %' => 'haelloe-woerld--',
+    'héllò peôplë ÑO?' => 'hello-people-nO-',
+);
+
+echo '<pre>';
+foreach ($test as $o => $r){
+    $_r = urlify($o, false);
+    echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
+}
+echo '</pre>';
+
+
+$test = array(
+    'hällö wörld' => 'haelloe-woerld',
+    'hällö wörldß' => 'haelloe-woerldss',
+    'hällö wörldß' => 'haelloe-woerldss',
+    'hällö wörld ' => 'haelloe-woerld',
+    'hällö wörld %' => 'haelloe-woerld',
+    'héllò peôplë ÑO?' => 'hello-people-nO',
+    'héllò peôplë ñO?' => 'hello-people-ñO',
+);
+
+echo '<pre>';
+foreach ($test as $o => $r){
+    $_r = urlify($o, true, array(0xF1 => true));
+    echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
+}
+echo '</pre>';
+
+
+$test = array(
+    'hällö wörld' => 'haelloe-woerld',
+    'hällö wörldß' => 'haelloe-woerldss',
+    'hällö wörldß' => 'haelloe-woerldss',
+    'hällö wörld ' => 'haelloe-woerld',
+    'hällö wörld %' => 'haelloe-woerld',
+    'héllò peôplë ÑO?' => 'hello-people-nO',
+    'héllò peôplë ñO?' => 'hello-people-XXXO',
+);
+
+echo '<pre>';
+foreach ($test as $o => $r){
+    $_r = urlify($o, true, null, array(0xF1 => 'XXX'));
+    echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
+}
 echo '</pre>';
diff --git a/urlify.php b/urlify.php
@@ -5,7 +5,7 @@
  *
  * Replace accents by their entities. 
  * Replace everything else by - (dash).
- * @note mb_internal_charset() must be set to whatever encoding $str had originally
+ * @note mb_internal_charset() must be set to whatever encoding $string had originally
  * @param string $string String to normalize
  * @param boolean $trim Trim string to not start/end with a dash and not contain dash sequences
  * @param array $allow List of Characters that are to be ignored while urlifying array( unicode => true )
@@ -17,14 +17,9 @@
 function urlify($string, $trim=true, $allow=null, $replace=null)
 {
     if (!is_string($string)) {
-        throw new Exception('$str must be a string');
+        throw new Exception('$string must be a string');
     }
 
-    $res = '';
-    $string = mb_convert_encoding($string, "UTF-32BE", "UTF-8");
-    $unicodes = unpack("N*", $string);
-    $i = -1;
-
     $_replace = array(
         0xE4 => 'ae',
         0xC4 => 'Ae',
@@ -39,16 +34,22 @@ function urlify($string, $trim=true, $allow=null, $replace=null)
         $_replace = array_merge($_replace, $replace);
     }
 
+    $res = '';
+    $encoding = mb_internal_encoding();
+    $string = mb_convert_encoding($string, "UTF-32BE");
+    $unicodes = unpack("N*", $string);
+    $i = -1;
+
     foreach ($unicodes as $code) {
         $i++;
         $character = mb_substr($string, $i, 1, "UTF-32BE");
 
         if (($code >= 97 && $code <= 122) || ($code >= 65 && $code <= 90) || ($code >= 48 && $code <= 57) || $code == 95 || $code == 45) {
             // skip normalization for alphanumeric characters [a-zA-Z0-9_-]
-            $res .= mb_convert_encoding($character, "UTF-8", "UTF-32BE");
+            $res .= mb_convert_encoding($character, $encoding, "UTF-32BE");
         } elseif ($allow && isset($allow[$code])) {
             // skip normalization for allowed characters
-            $res .= mb_convert_encoding($character, "UTF-8", "UTF-32BE");
+            $res .= mb_convert_encoding($character, $encoding, "UTF-32BE");
         } elseif (isset($_replace[$code])) {
             // replace as defined
             $res .= $_replace[$code];

diff --git a/urlify.php b/urlify.php
@@ -11,8 +11,8 @@
  * @param array $allow List of Characters that are to be ignored while urlifying array( unicode => true )
  * @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" )
  * @return string normalized string
- * @author Christian Kruse
- * @author Rodney Rehm
+ * @author Christian Kruse <cjk@wwwtech.de>
+ * @author Rodney Rehm <rodney.rehm@medialize.de>
  */
 function urlify($string, $trim=true, $allow=null, $replace=null)
 {

diff --git a/urlify.php b/urlify.php
@@ -0,0 +1,87 @@
+<?php
+
+/**
+ * Normalize a string to only contain alphanumeric characters and dashes.
+ *
+ * Replace accents by their entities. 
+ * Replace everything else by - (dash).
+ * @note mb_internal_charset() must be set to whatever encoding $str had originally
+ * @param string $string String to normalize
+ * @param boolean $trim Trim string to not start/end with a dash and not contain dash sequences
+ * @param array $allow List of Characters that are to be ignored while urlifying array( unicode => true )
+ * @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" )
+ * @return string normalized string
+ * @author Christian Kruse
+ * @author Rodney Rehm
+ */
+function urlify($string, $trim=true, $allow=null, $replace=null)
+{
+    if (!is_string($string)) {
+        throw new Exception('$str must be a string');
+    }
+
+    $res = '';
+    $string = mb_convert_encoding($string, "UTF-32BE", "UTF-8");
+    $unicodes = unpack("N*", $string);
+    $i = -1;
+
+    $_replace = array(
+        0xE4 => 'ae',
+        0xC4 => 'Ae',
+        0xF6 => 'oe',
+        0xD6 => 'Oe',
+        0xFC => 'ue',
+        0xDC => 'Ue',
+        0xDF => 'ss',
+    );
+
+    if ($replace) {
+        $_replace = array_merge($_replace, $replace);
+    }
+
+    foreach ($unicodes as $code) {
+        $i++;
+        $character = mb_substr($string, $i, 1, "UTF-32BE");
+
+        if (($code >= 97 && $code <= 122) || ($code >= 65 && $code <= 90) || ($code >= 48 && $code <= 57) || $code == 95 || $code == 45) {
+            // skip normalization for alphanumeric characters [a-zA-Z0-9_-]
+            $res .= mb_convert_encoding($character, "UTF-8", "UTF-32BE");
+        } elseif ($allow && isset($allow[$code])) {
+            // skip normalization for allowed characters
+            $res .= mb_convert_encoding($character, "UTF-8", "UTF-32BE");
+        } elseif (isset($_replace[$code])) {
+            // replace as defined
+            $res .= $_replace[$code];
+        } elseif (ctype_space($character)) {
+            // replace spaces by dash
+            $res .= '-';
+        } elseif (($code >= 0xC0 && $code <= 0xC6) || ($code >= 0xE0 && $code <= 0xE6) || ($code >= 0x100 && $code <= 0x105)) {
+            $res .= 'a';
+        } elseif ($code == 0xC7 || $code == 0xE7 || ($code >= 0x106 && $code <= 0x10D)) {
+            $res .= 'a';
+        } elseif ($code == 0xD0 || ($code >= 0x10E && $code <= 0x111)) {
+            $res .= 'd';
+        } elseif (($code >= 0xC8 && $code <= 0xCB) || ($code >= 0xE8 && $code <= 0xEB) || ($code >= 0x112 && $code <= 0x11B)) {
+            $res .= 'e';
+        } elseif (($code >= 0xCC && $code <= 0xCF) || ($code >= 0xEC && $code <= 0xEF)) {
+            $res .= 'i';
+        } elseif ($code == 0xD1 || $code == 0xF1) {
+            $res .= 'n';
+        } elseif (($code >= 0xD2 && $code <= 0xD8) || ($code >= 0xF2 && $code <= 0xF8)) {
+            $res .= 'o';
+        } elseif (($code >= 0xD9 && $code <= 0xDB) || ($code >= 0xF9 && $code <= 0xFB)) {
+            $res .= 'u';
+        } elseif ($code == 0xDD || $code == 0xFD || $code == 0xFF) {
+            $res .= 'y';
+        } else {
+            $res .= '-';
+        }
+    }
+
+    if (!$trim) {
+        return $res;
+    }
+
+    $res = preg_replace('#-{2,}#', '-', $res);
+    return trim($res, '-');
+}
diff --git a/urlify.test.php b/urlify.test.php
@@ -0,0 +1,21 @@
+<?php
+
+include dirname(__FILE__) . '/urlify.php';
+
+mb_internal_encoding('UTF-8');
+
+$test = array(
+    'hällö wörld' => 'haelloe-woerld',
+    'hällö wörldß' => 'haelloe-woerldss',
+    'hällö wörldß' => 'haelloe-woerldss',
+    'hällö wörld ' => 'haelloe-woerld',
+    'hällö wörld %' => 'haelloe-woerld',
+    'héllò peôplë ÑO?' => 'hello-people-nO',
+);
+
+echo '<pre>';
+foreach ($test as $o => $r){
+    $_r = urlify($o);
+    echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
+}
+echo '</pre>';