Skip to content

Instantly share code, notes, and snippets.

@rodneyrehm
Created April 8, 2011 11:52

Revisions

  1. rodneyrehm revised this gist Apr 9, 2011. 1 changed file with 1 addition and 3 deletions.
    4 changes: 1 addition & 3 deletions urlify.php
    Original file line number Diff line number Diff line change
    @@ -2,9 +2,7 @@

    /*
    consider decomposing the characters to "capture" more "obscure" characters such as ṩ
    - http://unicode.org/reports/tr15/
    - http://developer.apple.com/library/mac/#technotes/tn/tn1150table.html
    - https://gist.github.com/42793
    - http://www.php.net/manual/en/normalizer.normalize.php#92592
    */

    /**
  2. rodneyrehm revised this gist Apr 8, 2011. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions urlify.php
    Original file line number Diff line number Diff line change
    @@ -4,6 +4,7 @@
    consider decomposing the characters to "capture" more "obscure" characters such as ṩ
    - http://unicode.org/reports/tr15/
    - http://developer.apple.com/library/mac/#technotes/tn/tn1150table.html
    - https://gist.github.com/42793
    */

    /**
  3. rodneyrehm revised this gist Apr 8, 2011. 1 changed file with 6 additions and 0 deletions.
    6 changes: 6 additions & 0 deletions urlify.php
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,11 @@
    <?php

    /*
    consider decomposing the characters to "capture" more "obscure" characters such as ṩ
    - http://unicode.org/reports/tr15/
    - http://developer.apple.com/library/mac/#technotes/tn/tn1150table.html
    */

    /**
    * Normalize a string to only contain alphanumeric characters and dashes.
    *
  4. rodneyrehm revised this gist Apr 8, 2011. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion urlify.php
    Original file line number Diff line number Diff line change
    @@ -11,7 +11,7 @@
    * @param array|string $allow List of Characters that are to be ignored while urlifying array( unicode => true ) or string like "$é@"
    * @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" )
    * @return string normalized string
    * @author Christian Kruse <cjk@wwwtech.de>
    * @author Christian Kruse <cjk+os@wwwtech.de>
    * @author Rodney Rehm <rodney.rehm@medialize.de>
    */
    function urlify($string, $trim=true, $allow=null, $replace=null)
  5. rodneyrehm revised this gist Apr 8, 2011. 2 changed files with 57 additions and 32 deletions.
    72 changes: 40 additions & 32 deletions urlify.php
    Original file line number Diff line number Diff line change
    @@ -8,7 +8,7 @@
    * @note mb_internal_charset() must be set to whatever encoding $string had originally
    * @param string $string String to normalize
    * @param boolean $trim Trim string to not start/end with a dash and not contain dash sequences
    * @param array $allow List of Characters that are to be ignored while urlifying array( unicode => true )
    * @param array|string $allow List of Characters that are to be ignored while urlifying array( unicode => true ) or string like "$é@"
    * @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" )
    * @return string normalized string
    * @author Christian Kruse <cjk@wwwtech.de>
    @@ -19,72 +19,80 @@ function urlify($string, $trim=true, $allow=null, $replace=null)
    if (!is_string($string)) {
    throw new Exception('$string must be a string');
    }

    $_replace = array(
    0xE4 => 'ae',
    0xC4 => 'Ae',
    0xF6 => 'oe',
    0xD6 => 'Oe',
    0xFC => 'ue',
    0xDC => 'Ue',
    0xDF => 'ss',
    0xE4 => "\0\0\0\x61\0\0\0\x65", // ä
    0xC4 => "\0\0\0\x41\0\0\0\x65", // Ä
    0xF6 => "\0\0\0\x6F\0\0\0\x65", // ö
    0xD6 => "\0\0\0\x4F\0\0\0\x65", // Ö
    0xFC => "\0\0\0\x75\0\0\0\x65", // ü
    0xDC => "\0\0\0\x55\0\0\0\x65", // Ü
    0xDF => "\0\0\0\x73\0\0\0\x73", // ß
    0xE6 => "\0\0\0\x61\0\0\0\x65", // æ
    0xC6 => "\0\0\0\x41\0\0\0\x65", // Æ
    );

    if ($replace) {
    if ($replace && is_array($replace)) {
    foreach ($replace as $k => $v) {
    $_replace[$k] = $v;
    $_replace[$k] = mb_convert_encoding($v, "UTF-32BE");
    }
    }

    if ($allow && is_string($allow)) {
    $t = mb_convert_encoding($allow, "UTF-32BE");
    $t = unpack("N*", $t);
    $allow = array();
    foreach ($t as $k) {
    $allow[$k] = true;
    }
    } elseif ($allow && !is_array($allow)) {
    $allow = null;
    }

    $res = '';
    $encoding = mb_internal_encoding();
    $string = mb_convert_encoding($string, "UTF-32BE");
    $unicodes = unpack("N*", $string);
    $i = -1;

    foreach ($unicodes as $code) {
    $i++;
    $character = mb_substr($string, $i, 1, "UTF-32BE");

    if (($code >= 97 && $code <= 122) || ($code >= 65 && $code <= 90) || ($code >= 48 && $code <= 57) || $code == 95 || $code == 45) {
    // skip normalization for alphanumeric characters [a-zA-Z0-9_-]
    $res .= mb_convert_encoding($character, $encoding, "UTF-32BE");
    $res .= mb_substr($string, $i, 1, "UTF-32BE");
    } elseif ($allow && isset($allow[$code])) {
    // skip normalization for allowed characters
    $res .= mb_convert_encoding($character, $encoding, "UTF-32BE");
    $res .= mb_substr($string, $i, 1, "UTF-32BE");
    } elseif (isset($_replace[$code])) {
    // replace as defined
    $res .= $_replace[$code];
    } elseif (ctype_space($character)) {
    // replace spaces by dash
    $res .= '-';
    } elseif (($code >= 0xC0 && $code <= 0xC6) || ($code >= 0xE0 && $code <= 0xE6) || ($code >= 0x100 && $code <= 0x105)) {
    $res .= 'a';
    $res .= "\0\0\0\x61"; // a
    } elseif ($code == 0xC7 || $code == 0xE7 || ($code >= 0x106 && $code <= 0x10D)) {
    $res .= 'a';
    $res .= "\0\0\0\x63"; // c
    } elseif ($code == 0xD0 || ($code >= 0x10E && $code <= 0x111)) {
    $res .= 'd';
    $res .= "\0\0\0\x64"; // d
    } elseif (($code >= 0xC8 && $code <= 0xCB) || ($code >= 0xE8 && $code <= 0xEB) || ($code >= 0x112 && $code <= 0x11B)) {
    $res .= 'e';
    $res .= "\0\0\0\x65"; // e
    } elseif (($code >= 0xCC && $code <= 0xCF) || ($code >= 0xEC && $code <= 0xEF)) {
    $res .= 'i';
    $res .= "\0\0\0\x69"; // i
    } elseif ($code == 0xD1 || $code == 0xF1) {
    $res .= 'n';
    $res .= "\0\0\0\x6E"; // n
    } elseif (($code >= 0xD2 && $code <= 0xD8) || ($code >= 0xF2 && $code <= 0xF8)) {
    $res .= 'o';
    $res .= "\0\0\0\x6F"; // o
    } elseif (($code >= 0xD9 && $code <= 0xDB) || ($code >= 0xF9 && $code <= 0xFB)) {
    $res .= 'u';
    $res .= "\0\0\0\x75"; // u
    } elseif ($code == 0xDD || $code == 0xFD || $code == 0xFF) {
    $res .= 'y';
    $res .= "\0\0\0\x79"; // y
    } else {
    $res .= '-';
    $res .= "\0\0\0\x2D"; // -
    }
    }

    if (!$trim) {
    return $res;
    if ($trim) {
    $res = preg_replace('#(\0\0\0\x2D){2,}#', "\0\0\0\x2D", $res);
    $res = preg_replace('#(^\0\0\0\x2D)|(\0\0\0\x2D$)#', "", $res);
    }

    $res = preg_replace('#-{2,}#', '-', $res);
    return trim($res, '-');
    return mb_convert_encoding($res, mb_internal_encoding(), "UTF-32BE");
    }
    17 changes: 17 additions & 0 deletions urlify.test.php
    Original file line number Diff line number Diff line change
    @@ -70,4 +70,21 @@
    $_r = urlify($o, true, null, array(0xF1 => 'XXX'));
    echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
    }
    echo '</pre>';

    $test = array(
    'hällö wörld' => 'haelloe-woerld',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörld ' => 'haelloe-woerld',
    'hällö wörld %' => 'haelloe-woerld',
    'héllò peôplë ÑO?' => 'hello-people-nO?',
    'héllò peôplë ñO?' => 'hello-people-ñO?',
    );

    echo '<pre>';
    foreach ($test as $o => $r){
    $_r = urlify($o, true, 'ñ?');
    echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
    }
    echo '</pre>';
  6. rodneyrehm revised this gist Apr 8, 2011. 2 changed files with 55 additions and 1 deletion.
    4 changes: 3 additions & 1 deletion urlify.php
    Original file line number Diff line number Diff line change
    @@ -31,7 +31,9 @@ function urlify($string, $trim=true, $allow=null, $replace=null)
    );

    if ($replace) {
    $_replace = array_merge($_replace, $replace);
    foreach ($replace as $k => $v) {
    $_replace[$k] = $v;
    }
    }

    $res = '';
    52 changes: 52 additions & 0 deletions urlify.test.php
    Original file line number Diff line number Diff line change
    @@ -18,4 +18,56 @@
    $_r = urlify($o);
    echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
    }
    echo '</pre>';

    $test = array(
    'hällö wörld' => 'haelloe-woerld',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörld ' => 'haelloe-woerld-',
    'hällö wörld %' => 'haelloe-woerld--',
    'héllò peôplë ÑO?' => 'hello-people-nO-',
    );

    echo '<pre>';
    foreach ($test as $o => $r){
    $_r = urlify($o, false);
    echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
    }
    echo '</pre>';


    $test = array(
    'hällö wörld' => 'haelloe-woerld',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörld ' => 'haelloe-woerld',
    'hällö wörld %' => 'haelloe-woerld',
    'héllò peôplë ÑO?' => 'hello-people-nO',
    'héllò peôplë ñO?' => 'hello-people-ñO',
    );

    echo '<pre>';
    foreach ($test as $o => $r){
    $_r = urlify($o, true, array(0xF1 => true));
    echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
    }
    echo '</pre>';


    $test = array(
    'hällö wörld' => 'haelloe-woerld',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörld ' => 'haelloe-woerld',
    'hällö wörld %' => 'haelloe-woerld',
    'héllò peôplë ÑO?' => 'hello-people-nO',
    'héllò peôplë ñO?' => 'hello-people-XXXO',
    );

    echo '<pre>';
    foreach ($test as $o => $r){
    $_r = urlify($o, true, null, array(0xF1 => 'XXX'));
    echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
    }
    echo '</pre>';
  7. rodneyrehm revised this gist Apr 8, 2011. 1 changed file with 10 additions and 9 deletions.
    19 changes: 10 additions & 9 deletions urlify.php
    Original file line number Diff line number Diff line change
    @@ -5,7 +5,7 @@
    *
    * Replace accents by their entities.
    * Replace everything else by - (dash).
    * @note mb_internal_charset() must be set to whatever encoding $str had originally
    * @note mb_internal_charset() must be set to whatever encoding $string had originally
    * @param string $string String to normalize
    * @param boolean $trim Trim string to not start/end with a dash and not contain dash sequences
    * @param array $allow List of Characters that are to be ignored while urlifying array( unicode => true )
    @@ -17,14 +17,9 @@
    function urlify($string, $trim=true, $allow=null, $replace=null)
    {
    if (!is_string($string)) {
    throw new Exception('$str must be a string');
    throw new Exception('$string must be a string');
    }

    $res = '';
    $string = mb_convert_encoding($string, "UTF-32BE", "UTF-8");
    $unicodes = unpack("N*", $string);
    $i = -1;

    $_replace = array(
    0xE4 => 'ae',
    0xC4 => 'Ae',
    @@ -39,16 +34,22 @@ function urlify($string, $trim=true, $allow=null, $replace=null)
    $_replace = array_merge($_replace, $replace);
    }

    $res = '';
    $encoding = mb_internal_encoding();
    $string = mb_convert_encoding($string, "UTF-32BE");
    $unicodes = unpack("N*", $string);
    $i = -1;

    foreach ($unicodes as $code) {
    $i++;
    $character = mb_substr($string, $i, 1, "UTF-32BE");

    if (($code >= 97 && $code <= 122) || ($code >= 65 && $code <= 90) || ($code >= 48 && $code <= 57) || $code == 95 || $code == 45) {
    // skip normalization for alphanumeric characters [a-zA-Z0-9_-]
    $res .= mb_convert_encoding($character, "UTF-8", "UTF-32BE");
    $res .= mb_convert_encoding($character, $encoding, "UTF-32BE");
    } elseif ($allow && isset($allow[$code])) {
    // skip normalization for allowed characters
    $res .= mb_convert_encoding($character, "UTF-8", "UTF-32BE");
    $res .= mb_convert_encoding($character, $encoding, "UTF-32BE");
    } elseif (isset($_replace[$code])) {
    // replace as defined
    $res .= $_replace[$code];
  8. rodneyrehm revised this gist Apr 8, 2011. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions urlify.php
    Original file line number Diff line number Diff line change
    @@ -11,8 +11,8 @@
    * @param array $allow List of Characters that are to be ignored while urlifying array( unicode => true )
    * @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" )
    * @return string normalized string
    * @author Christian Kruse
    * @author Rodney Rehm
    * @author Christian Kruse <cjk@wwwtech.de>
    * @author Rodney Rehm <rodney.rehm@medialize.de>
    */
    function urlify($string, $trim=true, $allow=null, $replace=null)
    {
  9. rodneyrehm created this gist Apr 8, 2011.
    87 changes: 87 additions & 0 deletions urlify.php
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,87 @@
    <?php

    /**
     * Normalize a string to only contain alphanumeric characters and dashes.
     *
     * ASCII letters, digits, "_" and "-" are kept unchanged. German umlauts and
     * sharp s are expanded (ä => ae, ß => ss), other accented Latin letters are
     * replaced by their unaccented lowercase base letter, and every remaining
     * character (whitespace, punctuation, symbols, ...) becomes a dash.
     *
     * @note $string is decoded as UTF-8, independent of mb_internal_encoding().
     *       Kept and allowed characters are emitted as UTF-8; replacement
     *       strings are appended exactly as given.
     * @param string $string String to normalize (UTF-8 encoded)
     * @param boolean $trim Trim string to not start/end with a dash and not contain dash sequences
     * @param array $allow List of Characters that are to be ignored while urlifying array( unicode => true )
     * @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" ),
     *                       entries take precedence over the built-in replacements
     * @return string normalized string
     * @throws InvalidArgumentException if $string is not a string
     * @author Christian Kruse
     * @author Rodney Rehm
     */
    function urlify($string, $trim=true, $allow=null, $replace=null)
    {
        if (!is_string($string)) {
            // InvalidArgumentException extends Exception, so existing catch blocks keep working
            throw new InvalidArgumentException('$string must be a string');
        }

        if ($string === '') {
            return '';
        }

        // built-in multi-character replacements, keyed by unicode code point
        $_replace = array(
            0xE4 => 'ae',
            0xC4 => 'Ae',
            0xF6 => 'oe',
            0xD6 => 'Oe',
            0xFC => 'ue',
            0xDC => 'Ue',
            0xDF => 'ss',
        );

        if ($replace && is_array($replace)) {
            // array union keeps the integer code point keys (array_merge() would
            // renumber them from 0) and lets the caller's entries win
            $_replace = $replace + $_replace;
        }

        // only an array can be probed by code point; isset() on a string would test a byte offset
        $allowed = is_array($allow) ? $allow : array();

        $res = '';
        // one 32 bit big endian integer per character == list of unicode code points
        $unicodes = unpack("N*", mb_convert_encoding($string, "UTF-32BE", "UTF-8"));

        foreach ($unicodes as $code) {
            if (($code >= 97 && $code <= 122) || ($code >= 65 && $code <= 90) || ($code >= 48 && $code <= 57) || $code == 95 || $code == 45) {
                // skip normalization for alphanumeric characters [a-zA-Z0-9_-]
                // (plain ASCII, so the code point is the byte value)
                $res .= chr($code);
            } elseif (isset($allowed[$code])) {
                // skip normalization for allowed characters
                $res .= mb_convert_encoding(pack("N", $code), "UTF-8", "UTF-32BE");
            } elseif (isset($_replace[$code])) {
                // replace as defined
                $res .= $_replace[$code];
            } elseif (($code >= 0xC0 && $code <= 0xC6) || ($code >= 0xE0 && $code <= 0xE6) || ($code >= 0x100 && $code <= 0x105)) {
                $res .= 'a';
            } elseif ($code == 0xC7 || $code == 0xE7 || ($code >= 0x106 && $code <= 0x10D)) {
                $res .= 'c';
            } elseif ($code == 0xD0 || $code == 0xF0 || ($code >= 0x10E && $code <= 0x111)) {
                $res .= 'd';
            } elseif (($code >= 0xC8 && $code <= 0xCB) || ($code >= 0xE8 && $code <= 0xEB) || ($code >= 0x112 && $code <= 0x11B)) {
                $res .= 'e';
            } elseif (($code >= 0xCC && $code <= 0xCF) || ($code >= 0xEC && $code <= 0xEF)) {
                $res .= 'i';
            } elseif ($code == 0xD1 || $code == 0xF1) {
                $res .= 'n';
            } elseif (($code >= 0xD2 && $code <= 0xD6) || $code == 0xD8 || ($code >= 0xF2 && $code <= 0xF6) || $code == 0xF8) {
                // 0xD7 (multiplication sign) and 0xF7 (division sign) sit inside
                // these ranges but are not letters, so they are left to the dash default
                $res .= 'o';
            } elseif (($code >= 0xD9 && $code <= 0xDB) || ($code >= 0xF9 && $code <= 0xFB)) {
                $res .= 'u';
            } elseif ($code == 0xDD || $code == 0xFD || $code == 0xFF) {
                $res .= 'y';
            } else {
                // whitespace and everything else that is not handled above
                $res .= '-';
            }
        }

        if (!$trim) {
            return $res;
        }

        // collapse dash sequences, then strip leading/trailing dashes
        $res = preg_replace('#-{2,}#', '-', $res);
        return trim($res, '-');
    }
    21 changes: 21 additions & 0 deletions urlify.test.php
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,21 @@
    <?php

    include dirname(__FILE__) . '/urlify.php';

    // Set the mbstring internal encoding used for the rest of this script.
    mb_internal_encoding('UTF-8');

    // Fixture table: input string => slug expected from urlify() with default arguments.
    $cases = array(
    'hällö wörld' => 'haelloe-woerld',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörld ' => 'haelloe-woerld',
    'hällö wörld %' => 'haelloe-woerld',
    'héllò peôplë ÑO?' => 'hello-people-nO',
    );

    // One report line per fixture: input, actual slug, verdict and a hex dump of the slug.
    echo '<pre>';
    foreach ($cases as $input => $expected) {
        $actual = urlify($input);
        $verdict = ($expected == $actual) ? 'OK' : 'FAILED';
        echo $input, ' - ', $actual, ' --- ', $verdict, ' --- ', bin2hex($actual), "\n";
    }
    echo '</pre>';