Created
July 12, 2012 15:52
-
-
Save sgmurphy/3098978 to your computer and use it in GitHub Desktop.
URL Slugs in PHP (with UTF-8 and Transliteration Support)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Create a web friendly URL slug from a string.
 *
 * Although supported, transliteration is discouraged because
 *     1) most web browsers support UTF-8 characters in URLs
 *     2) transliteration causes a loss of information
 *
 * Processing order: normalise to valid UTF-8, apply the caller's regex
 * replacements, optionally transliterate to ASCII, collapse every run of
 * characters that are neither letters nor decimal digits into the delimiter,
 * truncate, trim the delimiter from both ends, and finally lowercase.
 *
 * Recognised keys of $options (missing keys fall back to the defaults):
 *     'delimiter'     string  Separator placed between words. Default '-'.
 *     'limit'         int     Maximum slug length in characters; null (or any
 *                             other falsy value) means no limit. Default null.
 *     'lowercase'     bool    Lowercase the result. Default true.
 *     'replacements'  array   Map of PCRE pattern => replacement, applied with
 *                             preg_replace() before anything else. Default array().
 *     'transliterate' bool    Replace the accented Latin, Greek, Cyrillic, etc.
 *                             characters listed in the built-in map with ASCII.
 *                             Default false.
 *
 * @author Sean Murphy <[email protected]>
 * @copyright Copyright 2012 Sean Murphy. All rights reserved.
 * @license http://creativecommons.org/publicdomain/zero/1.0/
 *
 * @param string $str
 * @param array $options
 * @return string
 */
function url_slug($str, $options = array()) {
	// Make sure the string is valid UTF-8. Converting UTF-8 -> UTF-8 replaces
	// malformed byte sequences with mbstring's substitute character ('?' by
	// default), which the non-alphanumeric pass below turns into the delimiter.
	// The source encoding is named explicitly: letting mbstring guess from
	// mb_list_encodings() has been reported to return an empty string on PHP 8.1+.
	$str = mb_convert_encoding((string)$str, 'UTF-8', 'UTF-8');

	$defaults = array(
		'delimiter' => '-',
		'limit' => null,
		'lowercase' => true,
		'replacements' => array(),
		'transliterate' => false,
	);

	// Merge options
	$options = array_merge($defaults, $options);

	// Every key appears exactly once: PHP keeps only the last value of a
	// duplicated array key, so a repeated entry can silently override an
	// earlier one. Uppercase letters always map to uppercase ASCII.
	$char_map = array(
		// Latin
		'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A', 'Æ' => 'AE', 'Ç' => 'C',
		'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E', 'Ì' => 'I', 'Í' => 'I', 'Î' => 'I', 'Ï' => 'I',
		'Ð' => 'D', 'Ñ' => 'N', 'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O', 'Ő' => 'O',
		'Ø' => 'O', 'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U', 'Ű' => 'U', 'Ý' => 'Y', 'Þ' => 'TH',
		'ß' => 'ss',
		'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a', 'å' => 'a', 'æ' => 'ae', 'ç' => 'c',
		'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e', 'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i',
		'ð' => 'd', 'ñ' => 'n', 'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ő' => 'o',
		'ø' => 'o', 'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u', 'ű' => 'u', 'ý' => 'y', 'þ' => 'th',
		'ÿ' => 'y',

		// Latin symbols
		'©' => '(c)',

		// Greek
		'Α' => 'A', 'Β' => 'B', 'Γ' => 'G', 'Δ' => 'D', 'Ε' => 'E', 'Ζ' => 'Z', 'Η' => 'H', 'Θ' => '8',
		'Ι' => 'I', 'Κ' => 'K', 'Λ' => 'L', 'Μ' => 'M', 'Ν' => 'N', 'Ξ' => '3', 'Ο' => 'O', 'Π' => 'P',
		'Ρ' => 'R', 'Σ' => 'S', 'Τ' => 'T', 'Υ' => 'Y', 'Φ' => 'F', 'Χ' => 'X', 'Ψ' => 'PS', 'Ω' => 'W',
		'Ά' => 'A', 'Έ' => 'E', 'Ί' => 'I', 'Ό' => 'O', 'Ύ' => 'Y', 'Ή' => 'H', 'Ώ' => 'W', 'Ϊ' => 'I',
		'Ϋ' => 'Y',
		'α' => 'a', 'β' => 'b', 'γ' => 'g', 'δ' => 'd', 'ε' => 'e', 'ζ' => 'z', 'η' => 'h', 'θ' => '8',
		'ι' => 'i', 'κ' => 'k', 'λ' => 'l', 'μ' => 'm', 'ν' => 'n', 'ξ' => '3', 'ο' => 'o', 'π' => 'p',
		'ρ' => 'r', 'σ' => 's', 'τ' => 't', 'υ' => 'y', 'φ' => 'f', 'χ' => 'x', 'ψ' => 'ps', 'ω' => 'w',
		'ά' => 'a', 'έ' => 'e', 'ί' => 'i', 'ό' => 'o', 'ύ' => 'y', 'ή' => 'h', 'ώ' => 'w', 'ς' => 's',
		'ϊ' => 'i', 'ΰ' => 'y', 'ϋ' => 'y', 'ΐ' => 'i',

		// Turkish (Ç, Ü, Ö and their lowercase forms are covered by the Latin section)
		'Ş' => 'S', 'İ' => 'I', 'Ğ' => 'G',
		'ş' => 's', 'ı' => 'i', 'ğ' => 'g',

		// Russian
		'А' => 'A', 'Б' => 'B', 'В' => 'V', 'Г' => 'G', 'Д' => 'D', 'Е' => 'E', 'Ё' => 'Yo', 'Ж' => 'Zh',
		'З' => 'Z', 'И' => 'I', 'Й' => 'J', 'К' => 'K', 'Л' => 'L', 'М' => 'M', 'Н' => 'N', 'О' => 'O',
		'П' => 'P', 'Р' => 'R', 'С' => 'S', 'Т' => 'T', 'У' => 'U', 'Ф' => 'F', 'Х' => 'H', 'Ц' => 'C',
		'Ч' => 'Ch', 'Ш' => 'Sh', 'Щ' => 'Sh', 'Ъ' => '', 'Ы' => 'Y', 'Ь' => '', 'Э' => 'E', 'Ю' => 'Yu',
		'Я' => 'Ya',
		'а' => 'a', 'б' => 'b', 'в' => 'v', 'г' => 'g', 'д' => 'd', 'е' => 'e', 'ё' => 'yo', 'ж' => 'zh',
		'з' => 'z', 'и' => 'i', 'й' => 'j', 'к' => 'k', 'л' => 'l', 'м' => 'm', 'н' => 'n', 'о' => 'o',
		'п' => 'p', 'р' => 'r', 'с' => 's', 'т' => 't', 'у' => 'u', 'ф' => 'f', 'х' => 'h', 'ц' => 'c',
		'ч' => 'ch', 'ш' => 'sh', 'щ' => 'sh', 'ъ' => '', 'ы' => 'y', 'ь' => '', 'э' => 'e', 'ю' => 'yu',
		'я' => 'ya',

		// Ukrainian
		'Є' => 'Ye', 'І' => 'I', 'Ї' => 'Yi', 'Ґ' => 'G',
		'є' => 'ye', 'і' => 'i', 'ї' => 'yi', 'ґ' => 'g',

		// Czech
		'Č' => 'C', 'Ď' => 'D', 'Ě' => 'E', 'Ň' => 'N', 'Ř' => 'R', 'Š' => 'S', 'Ť' => 'T', 'Ů' => 'U',
		'Ž' => 'Z',
		'č' => 'c', 'ď' => 'd', 'ě' => 'e', 'ň' => 'n', 'ř' => 'r', 'š' => 's', 'ť' => 't', 'ů' => 'u',
		'ž' => 'z',

		// Polish (Ó and ó are covered by the Latin section)
		'Ą' => 'A', 'Ć' => 'C', 'Ę' => 'E', 'Ł' => 'L', 'Ń' => 'N', 'Ś' => 'S', 'Ź' => 'Z',
		'Ż' => 'Z',
		'ą' => 'a', 'ć' => 'c', 'ę' => 'e', 'ł' => 'l', 'ń' => 'n', 'ś' => 's', 'ź' => 'z',
		'ż' => 'z',

		// Latvian (Č, Š, Ž and their lowercase forms are covered by the Czech section)
		'Ā' => 'A', 'Ē' => 'E', 'Ģ' => 'G', 'Ī' => 'I', 'Ķ' => 'K', 'Ļ' => 'L', 'Ņ' => 'N',
		'Ū' => 'U',
		'ā' => 'a', 'ē' => 'e', 'ģ' => 'g', 'ī' => 'i', 'ķ' => 'k', 'ļ' => 'l', 'ņ' => 'n',
		'ū' => 'u'
	);

	// Make custom replacements. preg_replace() returns null on a PCRE error;
	// the cast keeps $str a string for the calls that follow.
	$str = (string)preg_replace(array_keys($options['replacements']), $options['replacements'], $str);

	// Transliterate characters to ASCII (single pass over the string)
	if ($options['transliterate']) {
		$str = strtr($str, $char_map);
	}

	// Replace non-alphanumeric characters with our delimiter
	$str = (string)preg_replace('/[^\p{L}\p{Nd}]+/u', $options['delimiter'], $str);

	// Remove duplicate delimiters
	$str = (string)preg_replace('/(' . preg_quote($options['delimiter'], '/') . '){2,}/', '$1', $str);

	// Truncate slug to max. characters (counted in characters, not bytes)
	if ($options['limit']) {
		$str = mb_substr($str, 0, $options['limit'], 'UTF-8');
	}

	// Remove delimiter from ends; done after truncation so a cut that lands
	// on a delimiter does not leave it dangling
	$str = trim($str, $options['delimiter']);

	return $options['lowercase'] ? mb_strtolower($str, 'UTF-8') : $str;
}
?> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
include('url_slug.php');

header('Content-type: text/plain; charset=utf-8');

// Each demo is a pair: the input text and the options handed to url_slug().
$demos = array(
	// Basic usage
	array("This is an example string. Nothing fancy.", array()),

	// Example using French with unwanted characters ('?)
	array("Qu'en est-il français? Ça marche alors?", array()),

	// Example using transliteration
	array("Что делать, если я не хочу, UTF-8?", array('transliterate' => true)),

	// Example using transliteration on an unsupported language
	array("מה אם אני לא רוצה UTF-8 תווים?", array('transliterate' => true)),

	// Some other options
	array(
		"This is an Example String. What's Going to Happen to Me?",
		array(
			'delimiter' => '_',
			'limit' => 40,
			'lowercase' => false,
			'replacements' => array(
				'/\b(an)\b/i' => 'a',
				'/\b(example)\b/i' => 'Test'
			)
		)
	),
);

// Print the input on one line and its slug on the next. Demos are separated
// by a blank line; nothing is printed after the final slug.
$final = count($demos) - 1;
foreach ($demos as $index => $demo) {
	list($text, $slug_options) = $demo;

	echo $text . "\n";
	echo url_slug($text, $slug_options) . ($index < $final ? "\n\n" : '');
}

/*
Output:

This is an example string. Nothing fancy.
this-is-an-example-string-nothing-fancy

Qu'en est-il français? Ça marche alors?
qu-en-est-il-français-ça-marche-alors

Что делать, если я не хочу, UTF-8?
chto-delat-esli-ya-ne-hochu-utf-8

מה אם אני לא רוצה UTF-8 תווים?
מה-אם-אני-לא-רוצה-utf-8-תווים

This is an Example String. What's Going to Happen to Me?
This_is_a_Test_String_What_s_Going_to_Ha
*/
?> |
I was using this but noticed it does not work for all inputs.
Example:
$str = "Ruka";
// Minimal reproduction: url_slug() reduced to its encoding-normalisation step only.
// Reported in this thread to return an empty string for "Ruka" (on PHP 8.1 and above,
// according to the follow-up comment).
function url_slug($str, $options = array()) {
// Passes every encoding mbstring knows as the candidate source encodings, so
// mbstring has to guess which one the input is in before converting to UTF-8.
$str = mb_convert_encoding((string)$str, 'UTF-8', mb_list_encodings());
return $str;
}
This returns an empty string.
Looks like it breaks in PHP 8.1 and above.
For PHP 8.1 and above, you need to change line 19 (the mb_convert_encoding call at the top of url_slug) to
$str = mb_convert_encoding((string)$str, 'UTF-8');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Cool, I like how it keeps most UTF-8 characters so it works internationally. I want to remind everyone reading this that UTF-8 characters aren't allowed in HTTP message headers (Body is fine of course), which includes the HTTP method and DOCUMENT PATH line. So, when sending an HTTP message with a URL path that contains non-ASCII special UTF-8 characters, or when doing a
Location:
redirect header, please be sure to properly URL-encode the data. Example:
That looks like a typical GET request. What's wrong? The
GET
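line includes non-ASCII characters: Что-делать.html. The correct request percent-encodes the UTF-8 bytes of the path, for example: GET /%D0%A7%D1%82%D0%BE-%D0%B4%D0%B5%D0%BB%D0%B0%D1%82%D1%8C.html HTTP/1.1
In general, I would recommend using only plain ASCII in URLs for maximum compatibility, though sometimes that cannot be avoided.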
For HTTP responses, be sure to also use plain ASCII only:
Notice the
Location:
header on the response does not include any non-ASCII characters. Some browsers and systems will fudge it and allow this, but others will be strict and refuse to accept the request or response.