jbroadway · May 2, 2012 22:29
diff --git a/URLify.php b/URLify.php
 <?php

 /**
 * A PHP port of URLify.js from the Django project
 * (https://github.com/django/django/blob/master/django/contrib/admin/static/admin/js/urlify.js).
 * Handles symbols from Latin languages, Greek, Turkish, Russian, Ukrainian,
 * Czech, Polish, and Latvian. Symbols it cannot transliterate
 * it will simply omit.
 *
 * Usage:
 *
 *     echo URLify::filter (' J\'étudie le français ');
 *     // "jetudie-le-francais"
 *     
 *     echo URLify::filter ('Lo siento, no hablo español.');
 *     // "lo-siento-no-hablo-espanol"
 */
 class URLify {
 	/**
 	 * Transliteration tables, grouped by language. Keys are the characters
 	 * to replace, values are their ASCII replacements. When a character
 	 * appears in more than one table, the table listed last wins, so the
 	 * case of a replacement must agree between tables.
 	 */
 	public static $maps = array (
 		'latin_map' => array (
 			'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A', 'Æ' => 'AE', 'Ç' =>
 			'C', 'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E', 'Ì' => 'I', 'Í' => 'I', 'Î' => 'I',
 			'Ï' => 'I', 'Ð' => 'D', 'Ñ' => 'N', 'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' =>
 			'O', 'Ő' => 'O', 'Ø' => 'O', 'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U', 'Ű' => 'U',
 			'Ý' => 'Y', 'Þ' => 'TH', 'ß' => 'ss', 'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' =>
 			'a', 'å' => 'a', 'æ' => 'ae', 'ç' => 'c', 'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e',
 			'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i', 'ð' => 'd', 'ñ' => 'n', 'ò' => 'o', 'ó' =>
 			'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ő' => 'o', 'ø' => 'o', 'ù' => 'u', 'ú' => 'u',
 			'û' => 'u', 'ü' => 'u', 'ű' => 'u', 'ý' => 'y', 'þ' => 'th', 'ÿ' => 'y'
 		),
 		'latin_symbols_map' => array (
 			'©' => '(c)'
 		),
 		'greek_map' => array (
 			'α' => 'a', 'β' => 'b', 'γ' => 'g', 'δ' => 'd', 'ε' => 'e', 'ζ' => 'z', 'η' => 'h', 'θ' => '8',
 			'ι' => 'i', 'κ' => 'k', 'λ' => 'l', 'μ' => 'm', 'ν' => 'n', 'ξ' => '3', 'ο' => 'o', 'π' => 'p',
 			'ρ' => 'r', 'σ' => 's', 'τ' => 't', 'υ' => 'y', 'φ' => 'f', 'χ' => 'x', 'ψ' => 'ps', 'ω' => 'w',
 			'ά' => 'a', 'έ' => 'e', 'ί' => 'i', 'ό' => 'o', 'ύ' => 'y', 'ή' => 'h', 'ώ' => 'w', 'ς' => 's',
 			'ϊ' => 'i', 'ΰ' => 'y', 'ϋ' => 'y', 'ΐ' => 'i',
 			'Α' => 'A', 'Β' => 'B', 'Γ' => 'G', 'Δ' => 'D', 'Ε' => 'E', 'Ζ' => 'Z', 'Η' => 'H', 'Θ' => '8',
 			'Ι' => 'I', 'Κ' => 'K', 'Λ' => 'L', 'Μ' => 'M', 'Ν' => 'N', 'Ξ' => '3', 'Ο' => 'O', 'Π' => 'P',
 			'Ρ' => 'R', 'Σ' => 'S', 'Τ' => 'T', 'Υ' => 'Y', 'Φ' => 'F', 'Χ' => 'X', 'Ψ' => 'PS', 'Ω' => 'W',
 			'Ά' => 'A', 'Έ' => 'E', 'Ί' => 'I', 'Ό' => 'O', 'Ύ' => 'Y', 'Ή' => 'H', 'Ώ' => 'W', 'Ϊ' => 'I',
 			'Ϋ' => 'Y'
 		),
 		'turkish_map' => array (
 			'ş' => 's', 'Ş' => 'S', 'ı' => 'i', 'İ' => 'I', 'ç' => 'c', 'Ç' => 'C', 'ü' => 'u', 'Ü' => 'U',
 			'ö' => 'o', 'Ö' => 'O', 'ğ' => 'g', 'Ğ' => 'G'
 		),
 		'russian_map' => array (
 			'а' => 'a', 'б' => 'b', 'в' => 'v', 'г' => 'g', 'д' => 'd', 'е' => 'e', 'ё' => 'yo', 'ж' => 'zh',
 			'з' => 'z', 'и' => 'i', 'й' => 'j', 'к' => 'k', 'л' => 'l', 'м' => 'm', 'н' => 'n', 'о' => 'o',
 			'п' => 'p', 'р' => 'r', 'с' => 's', 'т' => 't', 'у' => 'u', 'ф' => 'f', 'х' => 'h', 'ц' => 'c',
 			'ч' => 'ch', 'ш' => 'sh', 'щ' => 'sh', 'ъ' => '', 'ы' => 'y', 'ь' => '', 'э' => 'e', 'ю' => 'yu',
 			'я' => 'ya',
 			'А' => 'A', 'Б' => 'B', 'В' => 'V', 'Г' => 'G', 'Д' => 'D', 'Е' => 'E', 'Ё' => 'Yo', 'Ж' => 'Zh',
 			'З' => 'Z', 'И' => 'I', 'Й' => 'J', 'К' => 'K', 'Л' => 'L', 'М' => 'M', 'Н' => 'N', 'О' => 'O',
 			'П' => 'P', 'Р' => 'R', 'С' => 'S', 'Т' => 'T', 'У' => 'U', 'Ф' => 'F', 'Х' => 'H', 'Ц' => 'C',
 			'Ч' => 'Ch', 'Ш' => 'Sh', 'Щ' => 'Sh', 'Ъ' => '', 'Ы' => 'Y', 'Ь' => '', 'Э' => 'E', 'Ю' => 'Yu',
 			'Я' => 'Ya'
 		),
 		'ukrainian_map' => array (
 			'Є' => 'Ye', 'І' => 'I', 'Ї' => 'Yi', 'Ґ' => 'G', 'є' => 'ye', 'і' => 'i', 'ї' => 'yi', 'ґ' => 'g'
 		),
 		'czech_map' => array (
 			'č' => 'c', 'ď' => 'd', 'ě' => 'e', 'ň' => 'n', 'ř' => 'r', 'š' => 's', 'ť' => 't', 'ů' => 'u',
 			'ž' => 'z', 'Č' => 'C', 'Ď' => 'D', 'Ě' => 'E', 'Ň' => 'N', 'Ř' => 'R', 'Š' => 'S', 'Ť' => 'T',
 			'Ů' => 'U', 'Ž' => 'Z'
 		),
 		// Upper-case letters map to upper-case ASCII; 'Ó' also lives in
 		// latin_map and this later table would otherwise override it.
 		'polish_map' => array (
 			'ą' => 'a', 'ć' => 'c', 'ę' => 'e', 'ł' => 'l', 'ń' => 'n', 'ó' => 'o', 'ś' => 's', 'ź' => 'z',
 			'ż' => 'z', 'Ą' => 'A', 'Ć' => 'C', 'Ę' => 'E', 'Ł' => 'L', 'Ń' => 'N', 'Ó' => 'O', 'Ś' => 'S',
 			'Ź' => 'Z', 'Ż' => 'Z'
 		),
 		'latvian_map' => array (
 			'ā' => 'a', 'č' => 'c', 'ē' => 'e', 'ģ' => 'g', 'ī' => 'i', 'ķ' => 'k', 'ļ' => 'l', 'ņ' => 'n',
 			'š' => 's', 'ū' => 'u', 'ž' => 'z', 'Ā' => 'A', 'Č' => 'C', 'Ē' => 'E', 'Ģ' => 'G', 'Ī' => 'I',
 			'Ķ' => 'K', 'Ļ' => 'L', 'Ņ' => 'N', 'Š' => 'S', 'Ū' => 'U', 'Ž' => 'Z'
 		)
 	);

 	/**
 	 * List of words to remove from URLs.
 	 */
 	public static $remove_list = array (
 		'a', 'an', 'as', 'at', 'before', 'but', 'by', 'for', 'from',
 		'is', 'in', 'into', 'like', 'of', 'off', 'on', 'onto', 'per',
 		'since', 'than', 'the', 'this', 'that', 'to', 'up', 'via',
 		'with'
 	);

 	/**
 	 * The flattened character map (all tables of `$maps` merged).
 	 * Built lazily by `init()` and cleared by `add_chars()`.
 	 */
 	private static $map = array ();

 	/**
 	 * Builds the flattened character map on first use.
 	 */
 	private static function init () {
 		if (count (self::$map) > 0) {
 			return;
 		}

 		foreach (self::$maps as $map) {
 			foreach ($map as $orig => $conv) {
 				// strtr() cannot search for an empty string, so such a
 				// key (only possible via add_chars()) is ignored.
 				if ((string) $orig === '') {
 					continue;
 				}
 				self::$map[$orig] = $conv;
 			}
 		}
 	}

 	/**
 	 * Add new characters to the list. `$map` should be a hash of
 	 * character => replacement pairs.
 	 *
 	 * Throws a LogicException when `$map` is not an array.
 	 */
 	public static function add_chars ($map) {
 		if (! is_array ($map)) {
 			throw new LogicException ('$map must be an associative array.');
 		}
 		self::$maps[] = $map;
 		// Drop the cached map so the next init() includes the new table.
 		self::$map = array ();
 	}

 	/**
 	 * Append words to the remove list. Accepts either single words
 	 * or an array of words.
 	 */
 	public static function remove_words ($words) {
 		$words = is_array ($words) ? $words : array ($words);
 		self::$remove_list = array_merge (self::$remove_list, $words);
 	}

 	/**
 	 * Transliterates characters to their ASCII equivalents.
 	 * Characters without an entry in the map are left untouched.
 	 */
 	public static function downcode ($text) {
 		self::init ();

 		if (count (self::$map) === 0) {
 			return $text;
 		}

 		// One pass over the string: no regular expression has to be built
 		// from the map keys, and replaced output is never scanned again.
 		return strtr ($text, self::$map);
 	}

 	/**
 	 * Filters a string, e.g., "Petty theft" to "petty-theft".
 	 * The result is cut to at most `$length` characters.
 	 */
 	public static function filter ($text, $length = 60) {
 		$text = self::downcode ($text);

 		// remove all these words from the string before urlifying;
 		// each word is quoted so it is matched literally
 		if (count (self::$remove_list) > 0) {
 			$words = array ();
 			foreach (self::$remove_list as $word) {
 				$words[] = preg_quote ($word, '/');
 			}
 			$text = preg_replace ('/\b(' . join ('|', $words) . ')\b/i', '', $text);
 		}

 		// if downcode doesn't hit, the char will be stripped here
 		$text = preg_replace ('/[^-\w\s]/', '', $text);		// remove unneeded chars
 		$text = preg_replace ('/^\s+|\s+$/', '', $text);	// trim leading/trailing spaces
 		$text = preg_replace ('/[-\s]+/', '-', $text);		// convert spaces to hyphens
 		$text = strtolower ($text);							// convert to lowercase
 		return trim (substr ($text, 0, $length), '-');	// trim to first $length chars
 	}

 	/**
 	 * Alias of `URLify::downcode()`.
 	 */
 	public static function transliterate ($text) {
 		return self::downcode ($text);
 	}
 }

 ?>
diff --git a/urlify_test.php b/urlify_test.php
 <?php

 // Timestamp taken before the library is loaded so load time is included.
 $began = microtime (true);

 require 'URLify.php';

 // Sample inputs: French, Spanish and Greek text.
 $samples = array (
 	'  J\'étudie le français  ',
 	'Lo siento, no hablo español.',
 	'ΦΞΠΏΣ'
 );

 foreach ($samples as $sample) {
 	echo URLify::filter ($sample), PHP_EOL;
 }

 // Elapsed seconds, then peak memory in bytes.
 $elapsed = microtime (true) - $began;
 echo $elapsed, PHP_EOL;
 echo memory_get_peak_usage (), PHP_EOL;

 ?>
diff --git a/Urlizer.php b/Urlizer.php
 <?php

 namespace Gedmo\Sluggable\Util;

 /**
 * This is the part taken from Doctrine 1.2.3
 * Doctrine inflector has static methods for inflecting text
 *
 * The methods in these classes are from several different sources collected
 * across several different php projects and several different authors. The
 * original author names and emails are not known
 *
 * Uses 3rd party libraries and functions:
 *         http://sourceforge.net/projects/phputf8
 *
 * @package     Gedmo.Sluggable.Util
 * @subpackage  Urlizer
 * @license     http://www.opensource.org/licenses/lgpl-license.php LGPL
 * @link        www.doctrine-project.org
 * @since       1.0
 * @version     $Revision: 3189 $
 * @author      Konsta Vesterinen <[email protected]>
 * @author      Jonathan H. Wage <[email protected]>
 * @author         <[email protected]>
 */
 class Urlizer
 {
    /**
     * Check if a string has utf8 characters in it
     *
     * By bmorel at ssi dot fr
     *
     * @param  string $string
     * @return boolean $bool
     */
    public static function seemsUtf8($string)
    {
        $length = strlen($string);

        for ($pos = 0; $pos < $length; $pos++) {
            $lead = ord($string[$pos]);

            // Work out how many continuation bytes the lead byte announces.
            if ($lead < 0x80) {
                continue; // 0bbbbbbb
            } elseif (($lead & 0xE0) == 0xC0) {
                $trailing = 1; // 110bbbbb
            } elseif (($lead & 0xF0) == 0xE0) {
                $trailing = 2; // 1110bbbb
            } elseif (($lead & 0xF8) == 0xF0) {
                $trailing = 3; // 11110bbb
            } elseif (($lead & 0xFC) == 0xF8) {
                $trailing = 4; // 111110bb
            } elseif (($lead & 0xFE) == 0xFC) {
                $trailing = 5; // 1111110b
            } else {
                return false; // Does not match any model
            }

            // Each announced byte must exist and look like 10bbbbbb.
            while ($trailing-- > 0) {
                $pos++;
                if ($pos == $length) {
                    return false;
                }
                if ((ord($string[$pos]) & 0xC0) != 0x80) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Remove any illegal characters, accents, etc.
     *
     * @param  string $string  String to unaccent
     * @return string $string  Unaccented string
     */
    public static function unaccent($string)
    {
        // Fast path: nothing to do when the string has no byte >= 0x80.
        if (!preg_match('/[\x80-\xff]/', $string)) {
            return $string;
        }

        if (self::seemsUtf8($string)) {
            // Replacement table keyed by raw UTF-8 byte sequences.
            // A key that is repeated keeps the value given last; this is how
            // the literal entries near the end (e.g. 'Ä' => 'Ae') are meant to
            // override earlier chr() entries such as chr(195).chr(132) => 'A'.
            // NOTE(review): that only holds if this file is saved as UTF-8 — confirm.
            $chars = array(
            // Decompositions for Latin-1 Supplement
            chr(195).chr(128) => 'A', chr(195).chr(129) => 'A',
            chr(195).chr(130) => 'A', chr(195).chr(131) => 'A',
            chr(195).chr(132) => 'A', chr(195).chr(133) => 'A',
            chr(195).chr(135) => 'C', chr(195).chr(136) => 'E',
            chr(195).chr(137) => 'E', chr(195).chr(138) => 'E',
            chr(195).chr(139) => 'E', chr(195).chr(140) => 'I',
            chr(195).chr(141) => 'I', chr(195).chr(142) => 'I',
            chr(195).chr(143) => 'I', chr(195).chr(145) => 'N',
            chr(195).chr(146) => 'O', chr(195).chr(147) => 'O',
            chr(195).chr(148) => 'O', chr(195).chr(149) => 'O',
            chr(195).chr(150) => 'O', chr(195).chr(153) => 'U',
            chr(195).chr(154) => 'U', chr(195).chr(155) => 'U',
            chr(195).chr(156) => 'U', chr(195).chr(157) => 'Y',
            chr(195).chr(159) => 's', chr(195).chr(160) => 'a',
            chr(195).chr(161) => 'a', chr(195).chr(162) => 'a',
            chr(195).chr(163) => 'a', chr(195).chr(164) => 'a',
            chr(195).chr(165) => 'a', chr(195).chr(167) => 'c',
            chr(195).chr(168) => 'e', chr(195).chr(169) => 'e',
            chr(195).chr(170) => 'e', chr(195).chr(171) => 'e',
            chr(195).chr(172) => 'i', chr(195).chr(173) => 'i',
            chr(195).chr(174) => 'i', chr(195).chr(175) => 'i',
            chr(195).chr(177) => 'n', chr(195).chr(178) => 'o',
            chr(195).chr(179) => 'o', chr(195).chr(180) => 'o',
            chr(195).chr(181) => 'o', chr(195).chr(182) => 'o',
            // NOTE(review): chr(195).chr(182) is listed twice (here and on the
            // previous line); the repeat is harmless but may be a typo — confirm.
            chr(195).chr(182) => 'o', chr(195).chr(185) => 'u',
            chr(195).chr(186) => 'u', chr(195).chr(187) => 'u',
            chr(195).chr(188) => 'u', chr(195).chr(189) => 'y',
            chr(195).chr(191) => 'y',
            // Decompositions for Latin Extended-A
            chr(196).chr(128) => 'A', chr(196).chr(129) => 'a',
            chr(196).chr(130) => 'A', chr(196).chr(131) => 'a',
            chr(196).chr(132) => 'A', chr(196).chr(133) => 'a',
            chr(196).chr(134) => 'C', chr(196).chr(135) => 'c',
            chr(196).chr(136) => 'C', chr(196).chr(137) => 'c',
            chr(196).chr(138) => 'C', chr(196).chr(139) => 'c',
            chr(196).chr(140) => 'C', chr(196).chr(141) => 'c',
            chr(196).chr(142) => 'D', chr(196).chr(143) => 'd',
            chr(196).chr(144) => 'D', chr(196).chr(145) => 'd',
            chr(196).chr(146) => 'E', chr(196).chr(147) => 'e',
            chr(196).chr(148) => 'E', chr(196).chr(149) => 'e',
            chr(196).chr(150) => 'E', chr(196).chr(151) => 'e',
            chr(196).chr(152) => 'E', chr(196).chr(153) => 'e',
            chr(196).chr(154) => 'E', chr(196).chr(155) => 'e',
            chr(196).chr(156) => 'G', chr(196).chr(157) => 'g',
            chr(196).chr(158) => 'G', chr(196).chr(159) => 'g',
            chr(196).chr(160) => 'G', chr(196).chr(161) => 'g',
            chr(196).chr(162) => 'G', chr(196).chr(163) => 'g',
            chr(196).chr(164) => 'H', chr(196).chr(165) => 'h',
            chr(196).chr(166) => 'H', chr(196).chr(167) => 'h',
            chr(196).chr(168) => 'I', chr(196).chr(169) => 'i',
            chr(196).chr(170) => 'I', chr(196).chr(171) => 'i',
            chr(196).chr(172) => 'I', chr(196).chr(173) => 'i',
            chr(196).chr(174) => 'I', chr(196).chr(175) => 'i',
            chr(196).chr(176) => 'I', chr(196).chr(177) => 'i',
            chr(196).chr(178) => 'IJ',chr(196).chr(179) => 'ij',
            chr(196).chr(180) => 'J', chr(196).chr(181) => 'j',
            chr(196).chr(182) => 'K', chr(196).chr(183) => 'k',
            chr(196).chr(184) => 'k', chr(196).chr(185) => 'L',
            chr(196).chr(186) => 'l', chr(196).chr(187) => 'L',
            chr(196).chr(188) => 'l', chr(196).chr(189) => 'L',
            chr(196).chr(190) => 'l', chr(196).chr(191) => 'L',
            chr(197).chr(128) => 'l', chr(197).chr(129) => 'L',
            chr(197).chr(130) => 'l', chr(197).chr(131) => 'N',
            chr(197).chr(132) => 'n', chr(197).chr(133) => 'N',
            chr(197).chr(134) => 'n', chr(197).chr(135) => 'N',
            chr(197).chr(136) => 'n', chr(197).chr(137) => 'N',
            chr(197).chr(138) => 'n', chr(197).chr(139) => 'N',
            chr(197).chr(140) => 'O', chr(197).chr(141) => 'o',
            chr(197).chr(142) => 'O', chr(197).chr(143) => 'o',
            chr(197).chr(144) => 'O', chr(197).chr(145) => 'o',
            chr(197).chr(146) => 'OE',chr(197).chr(147) => 'oe',
            chr(197).chr(148) => 'R', chr(197).chr(149) => 'r',
            chr(197).chr(150) => 'R', chr(197).chr(151) => 'r',
            chr(197).chr(152) => 'R', chr(197).chr(153) => 'r',
            chr(197).chr(154) => 'S', chr(197).chr(155) => 's',
            chr(197).chr(156) => 'S', chr(197).chr(157) => 's',
            chr(197).chr(158) => 'S', chr(197).chr(159) => 's',
            chr(197).chr(160) => 'S', chr(197).chr(161) => 's',
            chr(197).chr(162) => 'T', chr(197).chr(163) => 't',
            chr(197).chr(164) => 'T', chr(197).chr(165) => 't',
            chr(197).chr(166) => 'T', chr(197).chr(167) => 't',
            chr(197).chr(168) => 'U', chr(197).chr(169) => 'u',
            chr(197).chr(170) => 'U', chr(197).chr(171) => 'u',
            chr(197).chr(172) => 'U', chr(197).chr(173) => 'u',
            chr(197).chr(174) => 'U', chr(197).chr(175) => 'u',
            chr(197).chr(176) => 'U', chr(197).chr(177) => 'u',
            chr(197).chr(178) => 'U', chr(197).chr(179) => 'u',
            chr(197).chr(180) => 'W', chr(197).chr(181) => 'w',
            chr(197).chr(182) => 'Y', chr(197).chr(183) => 'y',
            chr(197).chr(184) => 'Y', chr(197).chr(185) => 'Z',
            chr(197).chr(186) => 'z', chr(197).chr(187) => 'Z',
            chr(197).chr(188) => 'z', chr(197).chr(189) => 'Z',
            chr(197).chr(190) => 'z', chr(197).chr(191) => 's',
            // Euro Sign
            chr(226).chr(130).chr(172) => 'E',
            // GBP (Pound) Sign (removed: replaced by the empty string)
            chr(194).chr(163) => '',
            // Two-letter replacements for umlauts and sharp s
            'Ä' => 'Ae', 'ä' => 'ae', 'Ü' => 'Ue', 'ü' => 'ue',
            'Ö' => 'Oe', 'ö' => 'oe', 'ß' => 'ss',
            // Norwegian characters
            // NOTE(review): 'æ' => 'a' disagrees with 'Æ' => 'Ae' here and with
            // the 'ae' used by the non-UTF-8 branch below — confirm it is intended.
            'Å'=>'Aa','Æ'=>'Ae','Ø'=>'O','æ'=>'a','ø'=>'o','å'=>'aa'
            );

            // Array form of strtr(): every key found in the string is replaced.
            $string = strtr($string, $chars);
        } else {
            // Assume ISO-8859-1 if not UTF-8
            // 'in' and 'out' are parallel strings: the byte at position N of
            // 'in' is replaced by the character at position N of 'out'.
            $chars['in'] = chr(128).chr(131).chr(138).chr(142).chr(154).chr(158)
            .chr(159).chr(162).chr(165).chr(181).chr(192).chr(193).chr(194)
            .chr(195).chr(196).chr(197).chr(199).chr(200).chr(201).chr(202)
            .chr(203).chr(204).chr(205).chr(206).chr(207).chr(209).chr(210)
            .chr(211).chr(212).chr(213).chr(214).chr(216).chr(217).chr(218)
            .chr(219).chr(220).chr(221).chr(224).chr(225).chr(226).chr(227)
            .chr(228).chr(229).chr(231).chr(232).chr(233).chr(234).chr(235)
            .chr(236).chr(237).chr(238).chr(239).chr(241).chr(242).chr(243)
            .chr(244).chr(245).chr(246).chr(248).chr(249).chr(250).chr(251)
            .chr(252).chr(253).chr(255);

            $chars['out'] = "EfSZszYcYuAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy";

            $string = strtr($string, $chars['in'], $chars['out']);
            // Bytes that expand to two letters cannot go through the
            // one-to-one strtr() above, so they are handled separately.
            $doubleChars['in'] = array(chr(140), chr(156), chr(198), chr(208), chr(222), chr(223), chr(230), chr(240), chr(254));
            $doubleChars['out'] = array('OE', 'oe', 'AE', 'DH', 'TH', 'ss', 'ae', 'dh', 'th');
            $string = str_replace($doubleChars['in'], $doubleChars['out'], $string);
        }

        return $string;
    }

    /**
    * US-ASCII transliterations of Unicode text
    * Ported Sean M. Burke's Text::Unidecode Perl module (He did all the hard work!)
    * Warning: you should only pass this well formed UTF-8!
    * Be aware it works by splitting the input string into an array of characters and
    * replacing the non-ASCII entries in place before joining them again - it means memory
    * use will increase with the length of the input string
    *
    * @see http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm
    * @param string UTF-8 string to convert
    * @author <[email protected]>
    * @param string (default = ?) Character use if character unknown
    * @return string US-ASCII string
    */
    public static function utf8ToAscii($str, $unknown = '?')
    {
        // Transliteration banks loaded so far, kept across calls and indexed
        // by the upper bits of the code point ($ord >> 8).
        static $UTF8_TO_ASCII = array();

        if (strlen($str) == 0) {
            return '';
        }

        // Split the input into one array entry per UTF-8 character.
        if (!preg_match_all('/.{1}|[^\x00]{1,1}$/us', $str, $ar)) {
            // With the `u` modifier the match fails on malformed UTF-8. The
            // bytes cannot be decoded then, so keep the ASCII ones and
            // substitute $unknown for everything else.
            $ascii = '';
            $len = strlen($str);
            for ($i = 0; $i < $len; $i++) {
                $ascii .= ord($str[$i]) < 128 ? $str[$i] : $unknown;
            }
            return $ascii;
        }
        $chars = $ar[0];

        foreach ($chars as $i => $c) {
            $lead = ord($c[0]);

            if ($lead <= 127) {
                continue; // ASCII - next please
            }

            // Decode the code point from the lead byte and its continuation bytes.
            if ($lead >= 192 && $lead <= 223) {
                $ord = ($lead-192)*64 + (ord($c[1])-128);
            } elseif ($lead >= 224 && $lead <= 239) {
                $ord = ($lead-224)*4096 + (ord($c[1])-128)*64 + (ord($c[2])-128);
            } elseif ($lead >= 240 && $lead <= 247) {
                $ord = ($lead-240)*262144 + (ord($c[1])-128)*4096 + (ord($c[2])-128)*64 + (ord($c[3])-128);
            } elseif ($lead >= 248 && $lead <= 251) {
                $ord = ($lead-248)*16777216 + (ord($c[1])-128)*262144 + (ord($c[2])-128)*4096 + (ord($c[3])-128)*64 + (ord($c[4])-128);
            } elseif ($lead >= 252 && $lead <= 253) {
                $ord = ($lead-252)*1073741824 + (ord($c[1])-128)*16777216 + (ord($c[2])-128)*262144 + (ord($c[3])-128)*4096 + (ord($c[4])-128)*64 + (ord($c[5])-128);
            } else {
                // 128-191 (stray continuation byte) or 254-255: not a lead byte.
                $chars[$i] = $unknown;
                continue;
            }

            $bank = $ord >> 8;

            if (!array_key_exists($bank, $UTF8_TO_ASCII)) {
                $bankfile = __DIR__. '/data/'. sprintf("x%02x",$bank).'.php';
                if (file_exists($bankfile)) {
                    // The bank file is expected to fill $UTF8_TO_ASCII[$bank].
                    include $bankfile;
                }
                // Missing file, or a file that did not define the bank.
                if (!isset($UTF8_TO_ASCII[$bank]) || !is_array($UTF8_TO_ASCII[$bank])) {
                    $UTF8_TO_ASCII[$bank] = array();
                }
            }

            $newchar = $ord & 255;
            if (array_key_exists($newchar, $UTF8_TO_ASCII[$bank])) {
                $chars[$i] = $UTF8_TO_ASCII[$bank][$newchar];
            } else {
                $chars[$i] = $unknown;
            }
        }

        return implode('', $chars);
    }

    /**
     * Does not transliterate correctly eastern languages
     *
     * @param string $text
     * @param string $separator
     * @return string
     */
    public static function urlize($text, $separator = '-')
    {
        // Strip accents first, then hand the result to the shared clean-up
        // step that lower-cases the text and inserts the separator.
        return self::postProcessText(self::unaccent($text), $separator);
    }

    /**
     * Uses transliteration tables to convert any kind of utf8 character
     *
     * @param string $text
     * @param string $separator
     * @return string $text
     */
    public static function transliterate($text, $separator = '-')
    {
        // Only text that contains high bytes and passes UTF-8 validation is
        // run through the transliteration tables; anything else is returned
        // untouched.
        // NOTE(review): $separator is accepted but never used here — confirm
        // whether the result was meant to go through postProcessText().
        $hasHighBytes = preg_match('/[\x80-\xff]/', $text);
        if (!$hasHighBytes || !self::validUtf8($text)) {
            return $text;
        }

        return self::utf8ToAscii($text);
    }

    /**
    * Tests a string as to whether it's valid UTF-8 and supported by the
    * Unicode standard
    * Note: this function has been modified to simply return true or false
    * @author <[email protected]>
    * @param string UTF-8 encoded string
    * @return boolean true if valid
    * @see http://hsivonen.iki.fi/php-utf8/
    */
    public static function validUtf8($str)
    {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // Square-bracket offset: the curly-brace form is rejected by PHP 8.
            $in = ord($str[$i]);
            if ($mState == 0) {
                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $mBytes = 1;
                } elseif (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                    *
                    * This is illegal because the encoded codepoint must be either
                    * (a) not the shortest form or
                    * (b) outside the Unicode range of 0-0x10FFFF.
                    * Rather than trying to resynchronize, we will carry on until the end
                    * of the sequence and let the later error handling code catch it.
                    */
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    return false;
                }
            } else {
                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 != (0xC0 & ($in))) {
                    // Not a continuation octet: incomplete multi-octet sequence.
                    return false;
                }

                // Legal continuation: merge its six payload bits into the codepoint.
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                if (0 == --$mState) {
                    /*
                    * End of the multi-octet sequence. mUcs4 now contains the final
                    * Unicode codepoint; check for illegal sequences and codepoints.
                    */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)
                    ) {
                        return false;
                    }
                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }
            }
        }

        // A non-zero state here means the string ended in the middle of a
        // multi-octet sequence, which is not valid UTF-8.
        return 0 == $mState;
    }

    /**
     * Cleans up the text and adds separator
     *
     * @param string $text
     * @param string $separator
     * @return string
     */
    private static function postProcessText($text, $separator)
    {
        // Lower-case with the multibyte-aware function when it is available.
        $text = function_exists('mb_strtolower') ? mb_strtolower($text) : strtolower($text);

        // Turn every non-word character into a space
        $text = preg_replace('/\W/', ' ', $text);

        // Inflector-style passes, applied one after another.
        // NOTE(review): the text is already lower-case and free of ':' at this
        // point, so the first three patterns look unable to match — confirm
        // before removing them.
        $text = preg_replace('/::/', '/', $text);
        $text = preg_replace('/([A-Z]+)([A-Z][a-z])/', '\1_\2', $text);
        $text = preg_replace('/([a-z\d])([A-Z])/', '\1_\2', $text);

        // Collapse each remaining run of other characters into one separator
        $text = preg_replace('/[^A-Z^a-z^0-9^\/]+/', $separator, $text);
        $text = strtolower($text);

        // No separator at either end of the result
        return trim($text, $separator);
    }
 }
diff --git a/urlizer_test.php b/urlizer_test.php
 <?php

 // Timestamp taken before the library is loaded so load time is included.
 $began = microtime (true);

 require 'Urlizer.php';

 use Gedmo\Sluggable\Util\Urlizer;

 // Sample inputs: French, Spanish and Greek text.
 $samples = array (
 	'  J\'étudie le français  ',
 	'Lo siento, no hablo español.',
 	'ΦΞΠΏΣ'
 );

 foreach ($samples as $sample) {
 	echo Urlizer::urlize ($sample), PHP_EOL;
 }

 // Elapsed seconds, then peak memory in bytes.
 $elapsed = microtime (true) - $began;
 echo $elapsed, PHP_EOL;
 echo memory_get_peak_usage (), PHP_EOL;

 ?>
	<?php

	/**
	* A PHP port of URLify.js from the Django project
	* (https://github.com/django/django/blob/master/django/contrib/admin/static/admin/js/urlify.js).
	* Handles symbols from Latin languages, Greek, Turkish, Russian, Ukrainian,
	* Czech, Polish, and Latvian. Symbols it cannot transliterate
	* it will simply omit.
	*
	* Usage:
	*
	* echo URLify::filter (' J\'étudie le français ');
	* // "jetudie-le-francais"
	*
	* echo URLify::filter ('Lo siento, no hablo español.');
	* // "lo-siento-no-hablo-espanol"
	*/
	class URLify {
	public static $maps = array (
	'latin_map' => array (
	'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A', 'Æ' => 'AE', 'Ç' =>
	'C', 'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E', 'Ì' => 'I', 'Í' => 'I', 'Î' => 'I',
	'Ï' => 'I', 'Ð' => 'D', 'Ñ' => 'N', 'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' =>
	'O', 'Ő' => 'O', 'Ø' => 'O', 'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U', 'Ű' => 'U',
	'Ý' => 'Y', 'Þ' => 'TH', 'ß' => 'ss', 'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' =>
	'a', 'å' => 'a', 'æ' => 'ae', 'ç' => 'c', 'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e',
	'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i', 'ð' => 'd', 'ñ' => 'n', 'ò' => 'o', 'ó' =>
	'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ő' => 'o', 'ø' => 'o', 'ù' => 'u', 'ú' => 'u',
	'û' => 'u', 'ü' => 'u', 'ű' => 'u', 'ý' => 'y', 'þ' => 'th', 'ÿ' => 'y'
	),
	'latin_symbols_map' => array (
	'©' => '(c)'
	),
	'greek_map' => array (
	'α' => 'a', 'β' => 'b', 'γ' => 'g', 'δ' => 'd', 'ε' => 'e', 'ζ' => 'z', 'η' => 'h', 'θ' => '8',
	'ι' => 'i', 'κ' => 'k', 'λ' => 'l', 'μ' => 'm', 'ν' => 'n', 'ξ' => '3', 'ο' => 'o', 'π' => 'p',
	'ρ' => 'r', 'σ' => 's', 'τ' => 't', 'υ' => 'y', 'φ' => 'f', 'χ' => 'x', 'ψ' => 'ps', 'ω' => 'w',
	'ά' => 'a', 'έ' => 'e', 'ί' => 'i', 'ό' => 'o', 'ύ' => 'y', 'ή' => 'h', 'ώ' => 'w', 'ς' => 's',
	'ϊ' => 'i', 'ΰ' => 'y', 'ϋ' => 'y', 'ΐ' => 'i',
	'Α' => 'A', 'Β' => 'B', 'Γ' => 'G', 'Δ' => 'D', 'Ε' => 'E', 'Ζ' => 'Z', 'Η' => 'H', 'Θ' => '8',
	'Ι' => 'I', 'Κ' => 'K', 'Λ' => 'L', 'Μ' => 'M', 'Ν' => 'N', 'Ξ' => '3', 'Ο' => 'O', 'Π' => 'P',
	'Ρ' => 'R', 'Σ' => 'S', 'Τ' => 'T', 'Υ' => 'Y', 'Φ' => 'F', 'Χ' => 'X', 'Ψ' => 'PS', 'Ω' => 'W',
	'Ά' => 'A', 'Έ' => 'E', 'Ί' => 'I', 'Ό' => 'O', 'Ύ' => 'Y', 'Ή' => 'H', 'Ώ' => 'W', 'Ϊ' => 'I',
	'Ϋ' => 'Y'
	),
	'turkish_map' => array (
	'ş' => 's', 'Ş' => 'S', 'ı' => 'i', 'İ' => 'I', 'ç' => 'c', 'Ç' => 'C', 'ü' => 'u', 'Ü' => 'U',
	'ö' => 'o', 'Ö' => 'O', 'ğ' => 'g', 'Ğ' => 'G'
	),
	'russian_map' => array (
	'а' => 'a', 'б' => 'b', 'в' => 'v', 'г' => 'g', 'д' => 'd', 'е' => 'e', 'ё' => 'yo', 'ж' => 'zh',
	'з' => 'z', 'и' => 'i', 'й' => 'j', 'к' => 'k', 'л' => 'l', 'м' => 'm', 'н' => 'n', 'о' => 'o',
	'п' => 'p', 'р' => 'r', 'с' => 's', 'т' => 't', 'у' => 'u', 'ф' => 'f', 'х' => 'h', 'ц' => 'c',
	'ч' => 'ch', 'ш' => 'sh', 'щ' => 'sh', 'ъ' => '', 'ы' => 'y', 'ь' => '', 'э' => 'e', 'ю' => 'yu',
	'я' => 'ya',
	'А' => 'A', 'Б' => 'B', 'В' => 'V', 'Г' => 'G', 'Д' => 'D', 'Е' => 'E', 'Ё' => 'Yo', 'Ж' => 'Zh',
	'З' => 'Z', 'И' => 'I', 'Й' => 'J', 'К' => 'K', 'Л' => 'L', 'М' => 'M', 'Н' => 'N', 'О' => 'O',
	'П' => 'P', 'Р' => 'R', 'С' => 'S', 'Т' => 'T', 'У' => 'U', 'Ф' => 'F', 'Х' => 'H', 'Ц' => 'C',
	'Ч' => 'Ch', 'Ш' => 'Sh', 'Щ' => 'Sh', 'Ъ' => '', 'Ы' => 'Y', 'Ь' => '', 'Э' => 'E', 'Ю' => 'Yu',
	'Я' => 'Ya'
	),
	'ukrainian_map' => array (
	'Є' => 'Ye', 'І' => 'I', 'Ї' => 'Yi', 'Ґ' => 'G', 'є' => 'ye', 'і' => 'i', 'ї' => 'yi', 'ґ' => 'g'
	),
	'czech_map' => array (
	'č' => 'c', 'ď' => 'd', 'ě' => 'e', 'ň' => 'n', 'ř' => 'r', 'š' => 's', 'ť' => 't', 'ů' => 'u',
	'ž' => 'z', 'Č' => 'C', 'Ď' => 'D', 'Ě' => 'E', 'Ň' => 'N', 'Ř' => 'R', 'Š' => 'S', 'Ť' => 'T',
	'Ů' => 'U', 'Ž' => 'Z'
	),
	'polish_map' => array (
	'ą' => 'a', 'ć' => 'c', 'ę' => 'e', 'ł' => 'l', 'ń' => 'n', 'ó' => 'o', 'ś' => 's', 'ź' => 'z',
	'ż' => 'z', 'Ą' => 'A', 'Ć' => 'C', 'Ę' => 'e', 'Ł' => 'L', 'Ń' => 'N', 'Ó' => 'o', 'Ś' => 'S',
	'Ź' => 'Z', 'Ż' => 'Z'
	),
	'latvian_map' => array (
	'ā' => 'a', 'č' => 'c', 'ē' => 'e', 'ģ' => 'g', 'ī' => 'i', 'ķ' => 'k', 'ļ' => 'l', 'ņ' => 'n',
	'š' => 's', 'ū' => 'u', 'ž' => 'z', 'Ā' => 'A', 'Č' => 'C', 'Ē' => 'E', 'Ģ' => 'G', 'Ī' => 'i',
	'Ķ' => 'k', 'Ļ' => 'L', 'Ņ' => 'N', 'Š' => 'S', 'Ū' => 'u', 'Ž' => 'Z'
	)
	);

	/**
	* List of words to remove from URLs.
	*/
	public static $remove_list = array (
	'a', 'an', 'as', 'at', 'before', 'but', 'by', 'for', 'from',
	'is', 'in', 'into', 'like', 'of', 'off', 'on', 'onto', 'per',
	'since', 'than', 'the', 'this', 'that', 'to', 'up', 'via',
	'with'
	);

	/**
	* The character map: every table in `$maps` flattened into a single
	* original-character => replacement lookup. Filled by `init()` and
	* emptied by `add_chars()` so that the next `init()` rebuilds it.
	*/
	private static $map = array ();

	/**
	* The character list as a string: the original characters appended
	* one after another by `init()`. Emptied by `add_chars()`.
	*/
	private static $chars = '';

	/**
	* The character list as a regular expression: a character class with
	* the `u` modifier, compiled by `init()` and used by `downcode()` to
	* find the characters that need transliterating.
	*/
	private static $regex = '';

	/**
	 * Initializes the character map.
	 *
	 * Flattens every table in `self::$maps` into `self::$map`, appends each
	 * original character to `self::$chars`, and compiles `self::$regex`, a
	 * character class (with the `u` modifier) matching any single mapped
	 * character. Does nothing when `self::$map` is already populated.
	 */
	private static function init () {
		if (count (self::$map) > 0) {
			return;
		}

		$class = '';
		foreach (self::$maps as $map) {
			foreach ($map as $orig => $conv) {
				// PHP stores numeric string keys as integers; normalise back.
				$orig = (string) $orig;
				self::$map[$orig] = $conv;
				self::$chars .= $orig;
				// Escape the key so that characters such as ']', '\', '^',
				// '-' or '/' (e.g. supplied through add_chars()) cannot break
				// or change the meaning of the character class.
				$class .= preg_quote ($orig, '/');
			}
		}

		// `[]` is not a valid pattern; `(?!)` is a pattern that never matches.
		self::$regex = ($class === '') ? '/(?!)/u' : '/[' . $class . ']/u';
	}

	/**
	 * Registers an additional transliteration table.
	 *
	 * `$map` should be a hash of original character => replacement. The
	 * table is appended to `self::$maps`, and the cached map and character
	 * list are cleared so that they are rebuilt on the next `init()`.
	 *
	 * Throws a LogicException when `$map` is not an array.
	 */
	public static function add_chars ($map) {
		if (is_array ($map) === false) {
			throw new LogicException ('$map must be an associative array.');
		}

		// Drop the cached lookup data, then record the new table.
		self::$chars = '';
		self::$map = array ();
		self::$maps[] = $map;
	}

	/**
	 * Extends the remove list with more words.
	 *
	 * `$words` may be a single word or an array of words; a non-array
	 * value is wrapped in a one-element array before being merged onto
	 * the end of `self::$remove_list`.
	 */
	public static function remove_words ($words) {
		if (! is_array ($words)) {
			$words = array ($words);
		}

		$merged = array_merge (self::$remove_list, $words);
		self::$remove_list = $merged;
	}

	/**
	 * Transliterates characters to their ASCII equivalents.
	 *
	 * Every character matched by `self::$regex` that has an entry in
	 * `self::$map` is replaced by that entry; everything else is left
	 * untouched. The text is processed in a single pass, so a replacement
	 * is never scanned again: a replacement that itself contains a mapped
	 * character is emitted as-is instead of being transliterated a second
	 * time (or growing on every repeated occurrence).
	 *
	 * Returns the input unchanged when the pattern cannot be applied,
	 * e.g. because `$text` is not valid UTF-8.
	 */
	public static function downcode ($text) {
		self::init ();

		// Local copy so the closure does not depend on `self::` binding.
		$map = self::$map;

		$result = preg_replace_callback (
			self::$regex,
			function ($match) use ($map) {
				$char = $match[0];
				return isset ($map[$char]) ? $map[$char] : $char;
			},
			$text
		);

		// preg_replace_callback() returns null on failure.
		return ($result === null) ? $text : $result;
	}

	/**
	 * Filters a string, e.g., "Petty theft" to "petty-theft"
	 *
	 * Steps: transliterate with `downcode()`, delete the words listed in
	 * `self::$remove_list`, strip everything that is not a word character,
	 * whitespace or hyphen, trim, collapse runs of whitespace/hyphens into
	 * one hyphen, lowercase, and finally cut to `$length` bytes.
	 *
	 * `$text` is the string to convert. `$length` is the maximum length of
	 * the result (default 60); hyphens left dangling at either end after
	 * the cut are trimmed, so the result may be shorter.
	 */
	public static function filter ($text, $length = 60) {
		$text = self::downcode ($text);

		// remove all these words from the string before urlifying
		// The separator must be a bare `|` (alternation). An escaped `\|`
		// matches a literal pipe character, so no word would be removed.
		$text = preg_replace ('/\b(' . join ('|', self::$remove_list) . ')\b/i', '', $text);

		// if downcode doesn't hit, the char will be stripped here
		$text = preg_replace ('/[^-\w\s]/', '', $text); // remove unneeded chars
		$text = preg_replace ('/^\s+|\s+$/', '', $text); // trim leading/trailing spaces
		$text = preg_replace ('/[-\s]+/', '-', $text); // convert spaces to hyphens
		$text = strtolower ($text); // convert to lowercase
		return trim (substr ($text, 0, $length), '-'); // trim to first $length chars
	}

	/**
	 * Alias of `URLify::downcode()`: passes `$text` straight through and
	 * returns whatever `downcode()` returns.
	 */
	public static function transliterate ($text) {
		$converted = self::downcode ($text);
		return $converted;
	}
	}

	?>
	<?php

	// Start the timer before URLify.php is loaded, so the cost of the
	// require is included in the elapsed time printed at the end.
	$start = microtime (true);

	require 'URLify.php';

	// Slug three sample strings (French, Spanish, Greek), one per line.
	echo URLify::filter (' J\'étudie le français ') . PHP_EOL;
	echo URLify::filter ('Lo siento, no hablo español.') . PHP_EOL;
	echo URLify::filter ('ΦΞΠΏΣ') . PHP_EOL;

	// Print the elapsed time since $start, then the peak memory usage
	// reported by PHP.
	echo microtime (true) - $start . PHP_EOL;
	echo memory_get_peak_usage () . PHP_EOL;

	?>
	<?php

	namespace Gedmo\Sluggable\Util;

	/**
	* This is the part taken from Doctrine 1.2.3
	* Doctrine inflector has static methods for inflecting text
	*
	* The methods in these classes are from several different sources collected
	* across several different php projects and several different authors. The
	* original author names and emails are not known
	*
	* Uses 3rd party libraries and functions:
	* http://sourceforge.net/projects/phputf8
	*
	* @package Gedmo.Sluggable.Util
	* @subpackage Urlizer
	* @license http://www.opensource.org/licenses/lgpl-license.php LGPL
	* @link www.doctrine-project.org
	* @since 1.0
	* @version $Revision: 3189 $
	* @author Konsta Vesterinen <[email protected]>
	* @author Jonathan H. Wage <[email protected]>
	* @author <[email protected]>
	*/
	class Urlizer
	{
		/**
		 * Check if a string looks like UTF-8.
		 *
		 * Walks the bytes and verifies that every lead byte is followed by
		 * the number of continuation bytes (10bbbbbb) its bit pattern
		 * announces. Only the structure is checked; overlong forms and
		 * surrogates are not rejected (see validUtf8() for a strict check).
		 *
		 * By bmorel at ssi dot fr
		 *
		 * @param string $string
		 * @return boolean $bool
		 */
		public static function seemsUtf8($string)
		{
			$length = strlen($string);
			for ($i = 0; $i < $length; $i++) {
				$byte = ord($string[$i]);
				if ($byte < 0x80) {
					continue; // 0bbbbbbb
				} elseif (($byte & 0xE0) == 0xC0) {
					$n = 1; // 110bbbbb
				} elseif (($byte & 0xF0) == 0xE0) {
					$n = 2; // 1110bbbb
				} elseif (($byte & 0xF8) == 0xF0) {
					$n = 3; // 11110bbb
				} elseif (($byte & 0xFC) == 0xF8) {
					$n = 4; // 111110bb
				} elseif (($byte & 0xFE) == 0xFC) {
					$n = 5; // 1111110b
				} else {
					return false; // Does not match any model
				}
				// n bytes matching 10bbbbbb follow ?
				for ($j = 0; $j < $n; $j++) {
					if ((++$i == $length) || ((ord($string[$i]) & 0xC0) != 0x80)) {
						return false;
					}
				}
			}
			return true;
		}

		/**
		 * Remove any illegal characters, accents, etc.
		 *
		 * Strings without bytes in the 0x80-0xFF range are returned as-is.
		 * Input that passes seemsUtf8() is converted with a UTF-8 lookup
		 * table; anything else is assumed to be ISO-8859-1.
		 *
		 * @param string $string String to unaccent
		 * @return string $string Unaccented string
		 */
		public static function unaccent($string)
		{
			if (!preg_match('/[\x80-\xff]/', $string)) {
				return $string;
			}

			if (self::seemsUtf8($string)) {
				// When a key appears twice, the later entry wins, so the
				// language-specific entries at the end of the table override
				// the generic decompositions above them.
				$chars = array(
					// Decompositions for Latin-1 Supplement
					chr(195).chr(128) => 'A', chr(195).chr(129) => 'A',
					chr(195).chr(130) => 'A', chr(195).chr(131) => 'A',
					chr(195).chr(132) => 'A', chr(195).chr(133) => 'A',
					chr(195).chr(135) => 'C', chr(195).chr(136) => 'E',
					chr(195).chr(137) => 'E', chr(195).chr(138) => 'E',
					chr(195).chr(139) => 'E', chr(195).chr(140) => 'I',
					chr(195).chr(141) => 'I', chr(195).chr(142) => 'I',
					chr(195).chr(143) => 'I', chr(195).chr(145) => 'N',
					chr(195).chr(146) => 'O', chr(195).chr(147) => 'O',
					chr(195).chr(148) => 'O', chr(195).chr(149) => 'O',
					chr(195).chr(150) => 'O', chr(195).chr(153) => 'U',
					chr(195).chr(154) => 'U', chr(195).chr(155) => 'U',
					chr(195).chr(156) => 'U', chr(195).chr(157) => 'Y',
					chr(195).chr(159) => 's', chr(195).chr(160) => 'a',
					chr(195).chr(161) => 'a', chr(195).chr(162) => 'a',
					chr(195).chr(163) => 'a', chr(195).chr(164) => 'a',
					chr(195).chr(165) => 'a', chr(195).chr(167) => 'c',
					chr(195).chr(168) => 'e', chr(195).chr(169) => 'e',
					chr(195).chr(170) => 'e', chr(195).chr(171) => 'e',
					chr(195).chr(172) => 'i', chr(195).chr(173) => 'i',
					chr(195).chr(174) => 'i', chr(195).chr(175) => 'i',
					chr(195).chr(177) => 'n', chr(195).chr(178) => 'o',
					chr(195).chr(179) => 'o', chr(195).chr(180) => 'o',
					chr(195).chr(181) => 'o', chr(195).chr(182) => 'o',
					chr(195).chr(182) => 'o', chr(195).chr(185) => 'u',
					chr(195).chr(186) => 'u', chr(195).chr(187) => 'u',
					chr(195).chr(188) => 'u', chr(195).chr(189) => 'y',
					chr(195).chr(191) => 'y',
					// Decompositions for Latin Extended-A
					chr(196).chr(128) => 'A', chr(196).chr(129) => 'a',
					chr(196).chr(130) => 'A', chr(196).chr(131) => 'a',
					chr(196).chr(132) => 'A', chr(196).chr(133) => 'a',
					chr(196).chr(134) => 'C', chr(196).chr(135) => 'c',
					chr(196).chr(136) => 'C', chr(196).chr(137) => 'c',
					chr(196).chr(138) => 'C', chr(196).chr(139) => 'c',
					chr(196).chr(140) => 'C', chr(196).chr(141) => 'c',
					chr(196).chr(142) => 'D', chr(196).chr(143) => 'd',
					chr(196).chr(144) => 'D', chr(196).chr(145) => 'd',
					chr(196).chr(146) => 'E', chr(196).chr(147) => 'e',
					chr(196).chr(148) => 'E', chr(196).chr(149) => 'e',
					chr(196).chr(150) => 'E', chr(196).chr(151) => 'e',
					chr(196).chr(152) => 'E', chr(196).chr(153) => 'e',
					chr(196).chr(154) => 'E', chr(196).chr(155) => 'e',
					chr(196).chr(156) => 'G', chr(196).chr(157) => 'g',
					chr(196).chr(158) => 'G', chr(196).chr(159) => 'g',
					chr(196).chr(160) => 'G', chr(196).chr(161) => 'g',
					chr(196).chr(162) => 'G', chr(196).chr(163) => 'g',
					chr(196).chr(164) => 'H', chr(196).chr(165) => 'h',
					chr(196).chr(166) => 'H', chr(196).chr(167) => 'h',
					chr(196).chr(168) => 'I', chr(196).chr(169) => 'i',
					chr(196).chr(170) => 'I', chr(196).chr(171) => 'i',
					chr(196).chr(172) => 'I', chr(196).chr(173) => 'i',
					chr(196).chr(174) => 'I', chr(196).chr(175) => 'i',
					chr(196).chr(176) => 'I', chr(196).chr(177) => 'i',
					chr(196).chr(178) => 'IJ',chr(196).chr(179) => 'ij',
					chr(196).chr(180) => 'J', chr(196).chr(181) => 'j',
					chr(196).chr(182) => 'K', chr(196).chr(183) => 'k',
					chr(196).chr(184) => 'k', chr(196).chr(185) => 'L',
					chr(196).chr(186) => 'l', chr(196).chr(187) => 'L',
					chr(196).chr(188) => 'l', chr(196).chr(189) => 'L',
					chr(196).chr(190) => 'l', chr(196).chr(191) => 'L',
					chr(197).chr(128) => 'l', chr(197).chr(129) => 'L',
					chr(197).chr(130) => 'l', chr(197).chr(131) => 'N',
					chr(197).chr(132) => 'n', chr(197).chr(133) => 'N',
					chr(197).chr(134) => 'n', chr(197).chr(135) => 'N',
					chr(197).chr(136) => 'n', chr(197).chr(137) => 'N',
					chr(197).chr(138) => 'n', chr(197).chr(139) => 'N',
					chr(197).chr(140) => 'O', chr(197).chr(141) => 'o',
					chr(197).chr(142) => 'O', chr(197).chr(143) => 'o',
					chr(197).chr(144) => 'O', chr(197).chr(145) => 'o',
					chr(197).chr(146) => 'OE',chr(197).chr(147) => 'oe',
					chr(197).chr(148) => 'R', chr(197).chr(149) => 'r',
					chr(197).chr(150) => 'R', chr(197).chr(151) => 'r',
					chr(197).chr(152) => 'R', chr(197).chr(153) => 'r',
					chr(197).chr(154) => 'S', chr(197).chr(155) => 's',
					chr(197).chr(156) => 'S', chr(197).chr(157) => 's',
					chr(197).chr(158) => 'S', chr(197).chr(159) => 's',
					chr(197).chr(160) => 'S', chr(197).chr(161) => 's',
					chr(197).chr(162) => 'T', chr(197).chr(163) => 't',
					chr(197).chr(164) => 'T', chr(197).chr(165) => 't',
					chr(197).chr(166) => 'T', chr(197).chr(167) => 't',
					chr(197).chr(168) => 'U', chr(197).chr(169) => 'u',
					chr(197).chr(170) => 'U', chr(197).chr(171) => 'u',
					chr(197).chr(172) => 'U', chr(197).chr(173) => 'u',
					chr(197).chr(174) => 'U', chr(197).chr(175) => 'u',
					chr(197).chr(176) => 'U', chr(197).chr(177) => 'u',
					chr(197).chr(178) => 'U', chr(197).chr(179) => 'u',
					chr(197).chr(180) => 'W', chr(197).chr(181) => 'w',
					chr(197).chr(182) => 'Y', chr(197).chr(183) => 'y',
					chr(197).chr(184) => 'Y', chr(197).chr(185) => 'Z',
					chr(197).chr(186) => 'z', chr(197).chr(187) => 'Z',
					chr(197).chr(188) => 'z', chr(197).chr(189) => 'Z',
					chr(197).chr(190) => 'z', chr(197).chr(191) => 's',
					// Euro Sign
					chr(226).chr(130).chr(172) => 'E',
					// GBP (Pound) Sign
					chr(194).chr(163) => '',
					'Ä' => 'Ae', 'ä' => 'ae', 'Ü' => 'Ue', 'ü' => 'ue',
					'Ö' => 'Oe', 'ö' => 'oe', 'ß' => 'ss',
					// Norwegian characters
					'Å'=>'Aa','Æ'=>'Ae','Ø'=>'O','æ'=>'a','ø'=>'o','å'=>'aa'
				);

				$string = strtr($string, $chars);
			} else {
				// Assume ISO-8859-1 if not UTF-8
				// Single-byte characters that map to exactly one ASCII
				// character; 'in' and 'out' correspond position by position.
				$chars['in'] = chr(128).chr(131).chr(138).chr(142).chr(154).chr(158)
					.chr(159).chr(162).chr(165).chr(181).chr(192).chr(193).chr(194)
					.chr(195).chr(196).chr(197).chr(199).chr(200).chr(201).chr(202)
					.chr(203).chr(204).chr(205).chr(206).chr(207).chr(209).chr(210)
					.chr(211).chr(212).chr(213).chr(214).chr(216).chr(217).chr(218)
					.chr(219).chr(220).chr(221).chr(224).chr(225).chr(226).chr(227)
					.chr(228).chr(229).chr(231).chr(232).chr(233).chr(234).chr(235)
					.chr(236).chr(237).chr(238).chr(239).chr(241).chr(242).chr(243)
					.chr(244).chr(245).chr(246).chr(248).chr(249).chr(250).chr(251)
					.chr(252).chr(253).chr(255);

				$chars['out'] = "EfSZszYcYuAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy";

				$string = strtr($string, $chars['in'], $chars['out']);

				// Characters that expand to two ASCII characters.
				$doubleChars['in'] = array(chr(140), chr(156), chr(198), chr(208), chr(222), chr(223), chr(230), chr(240), chr(254));
				$doubleChars['out'] = array('OE', 'oe', 'AE', 'DH', 'TH', 'ss', 'ae', 'dh', 'th');
				$string = str_replace($doubleChars['in'], $doubleChars['out'], $string);
			}

			return $string;
		}

		/**
		 * US-ASCII transliterations of Unicode text
		 * Ported Sean M. Burke's Text::Unidecode Perl module (He did all the hard work!)
		 * Warning: you should only pass this well formed UTF-8!
		 *
		 * The string is split into characters; each non-ASCII character is
		 * decoded to its code point and looked up in a per-bank table
		 * (bank = code point >> 8) loaded on demand from
		 * `data/x<bank in hex>.php` next to this file. Characters without
		 * a table entry are replaced by `$unknown`.
		 *
		 * @see http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm
		 * @param string $str UTF-8 string to convert
		 * @param string $unknown (default = ?) Character use if character unknown
		 * @return string US-ASCII string ('' for empty input)
		 */
		public static function utf8ToAscii($str, $unknown = '?')
		{
			// NOTE: the name is significant - the included bank files are
			// presumably written to assign into $UTF8_TO_ASCII[<bank>]
			// (TODO confirm against the data files).
			static $UTF8_TO_ASCII;

			if (strlen($str) == 0) {
				return '';
			}

			preg_match_all('/.{1}|[^\x00]{1,1}$/us', $str, $ar);
			$chars = $ar[0];

			foreach ($chars as $i => $c) {
				$lead = ord($c[0]);

				if ($lead <= 127) {
					continue; // ASCII - next please
				}

				if ($lead >= 192 && $lead <= 223) {
					$ord = ($lead - 192) * 64
						+ (ord($c[1]) - 128);
				} elseif ($lead >= 224 && $lead <= 239) {
					$ord = ($lead - 224) * 4096
						+ (ord($c[1]) - 128) * 64
						+ (ord($c[2]) - 128);
				} elseif ($lead >= 240 && $lead <= 247) {
					$ord = ($lead - 240) * 262144
						+ (ord($c[1]) - 128) * 4096
						+ (ord($c[2]) - 128) * 64
						+ (ord($c[3]) - 128);
				} elseif ($lead >= 248 && $lead <= 251) {
					$ord = ($lead - 248) * 16777216
						+ (ord($c[1]) - 128) * 262144
						+ (ord($c[2]) - 128) * 4096
						+ (ord($c[3]) - 128) * 64
						+ (ord($c[4]) - 128);
				} elseif ($lead >= 252 && $lead <= 253) {
					$ord = ($lead - 252) * 1073741824
						+ (ord($c[1]) - 128) * 16777216
						+ (ord($c[2]) - 128) * 262144
						+ (ord($c[3]) - 128) * 4096
						+ (ord($c[4]) - 128) * 64
						+ (ord($c[5]) - 128);
				} else {
					// 254/255 are never valid lead bytes and 128-191 is a
					// stray continuation byte: no code point can be decoded,
					// so do not fall through with a stale or undefined $ord.
					$chars[$i] = $unknown;
					continue;
				}

				$bank = $ord >> 8;

				if (!array_key_exists($bank, (array) $UTF8_TO_ASCII)) {
					$bankfile = __DIR__. '/data/'. sprintf("x%02x",$bank).'.php';
					if (file_exists($bankfile)) {
						include $bankfile;
					}
					// Missing file, or a file that did not define the bank.
					if (!isset($UTF8_TO_ASCII[$bank])) {
						$UTF8_TO_ASCII[$bank] = array();
					}
				}

				$newchar = $ord & 255;
				if (array_key_exists($newchar, $UTF8_TO_ASCII[$bank])) {
					$chars[$i] = $UTF8_TO_ASCII[$bank][$newchar];
				} else {
					$chars[$i] = $unknown;
				}
			}

			return implode('', $chars);
		}

		/**
		 * Does not transliterate correctly eastern languages
		 *
		 * Runs unaccent() on the text and then postProcessText().
		 *
		 * @param string $text
		 * @param string $separator
		 * @return string
		 */
		public static function urlize($text, $separator = '-')
		{
			$text = self::unaccent($text);
			return self::postProcessText($text, $separator);
		}

		/**
		 * Uses transliteration tables to convert any kind of utf8 character
		 *
		 * The text is passed through utf8ToAscii() only when it contains
		 * bytes in the 0x80-0xFF range and passes validUtf8(); otherwise it
		 * is returned unchanged.
		 *
		 * @param string $text
		 * @param string $separator Accepted but not used by this method
		 * @return string $text
		 */
		public static function transliterate($text, $separator = '-')
		{
			if (preg_match('/[\x80-\xff]/', $text) && self::validUtf8($text)) {
				$text = self::utf8ToAscii($text);
			}
			return $text;
		}

		/**
		 * Tests a string as to whether it's valid UTF-8 and supported by the
		 * Unicode standard
		 * Note: this function has been modified to simple return true or false
		 *
		 * Rejects illegal lead bytes, missing continuation bytes, 5 and 6
		 * octet sequences, non-shortest forms, surrogates, code points above
		 * 0x10FFFF, and input that ends in the middle of a sequence.
		 *
		 * @param string $str UTF-8 encoded string
		 * @return boolean true if valid
		 * @see http://hsivonen.iki.fi/php-utf8/
		 */
		public static function validUtf8($str)
		{
			$mState = 0; // cached expected number of octets after the current octet
			             // until the beginning of the next UTF8 character sequence
			$mUcs4 = 0; // cached Unicode character
			$mBytes = 1; // cached expected number of octets in the current sequence

			$len = strlen($str);
			for ($i = 0; $i < $len; $i++) {
				$in = ord($str[$i]);
				if ($mState == 0) {
					// When mState is zero we expect either a US-ASCII character or a
					// multi-octet sequence.
					if (0 == (0x80 & ($in))) {
						// US-ASCII, pass straight through.
						$mBytes = 1;
					} elseif (0xC0 == (0xE0 & ($in))) {
						// First octet of 2 octet sequence
						$mUcs4 = ($in & 0x1F) << 6;
						$mState = 1;
						$mBytes = 2;
					} elseif (0xE0 == (0xF0 & ($in))) {
						// First octet of 3 octet sequence
						$mUcs4 = ($in & 0x0F) << 12;
						$mState = 2;
						$mBytes = 3;
					} elseif (0xF0 == (0xF8 & ($in))) {
						// First octet of 4 octet sequence
						$mUcs4 = ($in & 0x07) << 18;
						$mState = 3;
						$mBytes = 4;
					} elseif (0xF8 == (0xFC & ($in))) {
						/* First octet of 5 octet sequence.
						 *
						 * This is illegal because the encoded codepoint must be either
						 * (a) not the shortest form or
						 * (b) outside the Unicode range of 0-0x10FFFF.
						 * Rather than trying to resynchronize, we will carry on until the end
						 * of the sequence and let the later error handling code catch it.
						 */
						$mUcs4 = ($in & 0x03) << 24;
						$mState = 4;
						$mBytes = 5;
					} elseif (0xFC == (0xFE & ($in))) {
						// First octet of 6 octet sequence, see comments for 5 octet sequence.
						$mUcs4 = ($in & 1) << 30;
						$mState = 5;
						$mBytes = 6;
					} else {
						/* Current octet is neither in the US-ASCII range nor a legal first
						 * octet of a multi-octet sequence.
						 */
						return false;
					}
				} else {
					// When mState is non-zero, we expect a continuation of the multi-octet
					// sequence
					if (0x80 == (0xC0 & ($in))) {
						// Legal continuation.
						$shift = ($mState - 1) * 6;
						$tmp = ($in & 0x0000003F) << $shift;
						$mUcs4 |= $tmp;
						/**
						 * End of the multi-octet sequence. mUcs4 now contains the final
						 * Unicode codepoint to be output
						 */
						if (0 == --$mState) {
							/*
							 * Check for illegal sequences and codepoints.
							 */
							// From Unicode 3.1, non-shortest form is illegal
							if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
								((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
								((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
								(4 < $mBytes) ||
								// From Unicode 3.2, surrogate characters are illegal
								(($mUcs4 & 0xFFFFF800) == 0xD800) ||
								// Codepoints outside the Unicode range are illegal
								($mUcs4 > 0x10FFFF)
							) {
								return false;
							}
							//initialize UTF8 cache
							$mState = 0;
							$mUcs4 = 0;
							$mBytes = 1;
						}
					} else {
						/**
						 * ((0xC0 & (in) != 0x80) && (mState != 0))
						 * Incomplete multi-octet sequence.
						 */
						return false;
					}
				}
			}

			// A non-zero state here means the string ended in the middle of a
			// multi-octet sequence, which is not valid UTF-8.
			return 0 == $mState;
		}

		/**
		 * Cleans up the text and adds separator
		 *
		 * Lowercases the text, turns every non-word character into a space,
		 * then collapses each run of characters outside the allowed set into
		 * one `$separator` and trims the separator from both ends.
		 *
		 * @param string $text
		 * @param string $separator
		 * @return string
		 */
		private static function postProcessText($text, $separator)
		{
			if (function_exists('mb_strtolower')) {
				$text = mb_strtolower($text);
			} else {
				$text = strtolower($text);
			}

			// Remove all none word characters
			$text = preg_replace('/\W/', ' ', $text);

			// More stripping. Replace spaces with dashes
			// (the same four replacements as before, applied innermost first)
			$text = preg_replace('/::/', '/', $text);
			$text = preg_replace('/([A-Z]+)([A-Z][a-z])/', '\1_\2', $text);
			$text = preg_replace('/([a-z\d])([A-Z])/', '\1_\2', $text);
			$text = preg_replace('/[^A-Z^a-z^0-9^\/]+/', $separator, $text);
			$text = strtolower($text);

			return trim($text, $separator);
		}
	}
	<?php

	// Start the timer before Urlizer.php is loaded, so the cost of the
	// require is included in the elapsed time printed at the end.
	$start = microtime (true);

	require 'Urlizer.php';

	// Import the namespaced class so it can be referred to as `Urlizer`.
	use Gedmo\Sluggable\Util\Urlizer;

	// Slug three sample strings (French, Spanish, Greek), one per line.
	echo Urlizer::urlize (' J\'étudie le français ') . PHP_EOL;
	echo Urlizer::urlize ('Lo siento, no hablo español.') . PHP_EOL;
	echo Urlizer::urlize ('ΦΞΠΏΣ') . PHP_EOL;

	// Print the elapsed time since $start, then the peak memory usage
	// reported by PHP.
	echo microtime (true) - $start . PHP_EOL;
	echo memory_get_peak_usage () . PHP_EOL;

	?>