-
-
Save tallesairan/6f169e567f42b3ba15af4de001a3e26b to your computer and use it in GitHub Desktop.
URL Slugs in PHP (with UTF-8 and Transliteration Support)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Create a web friendly URL slug from a string.
 *
 * Although supported, transliteration is discouraged because
 *     1) most web browsers support UTF-8 characters in URLs
 *     2) transliteration causes a loss of information
 *
 * Processing order: custom regex replacements, optional transliteration,
 * collapsing every run of non-letter/non-digit characters into the delimiter,
 * truncation to the limit, trimming delimiters from both ends, lowercasing.
 *
 * Recognised keys of $options (anything omitted falls back to its default):
 *   - 'delimiter'     string  Separator placed between words. Default '-'.
 *   - 'limit'         int     Maximum slug length in characters; a falsy
 *                             value (null, 0) means no limit. Default null.
 *   - 'lowercase'     bool    Lowercase the result. Default true.
 *   - 'replacements'  array   Map of PCRE pattern => replacement, applied
 *                             before anything else. Default array().
 *   - 'transliterate' bool    Replace known non-ASCII letters with ASCII
 *                             approximations. Default false.
 *
 * @author Sean Murphy <[email protected]>
 * @copyright Copyright 2012 Sean Murphy. All rights reserved.
 * @license http://creativecommons.org/publicdomain/zero/1.0/
 *
 * @param string $str
 * @param array $options
 * @return string
 */
function url_slug($str, $options = array()) {
    $str = (string) $str;

    // Only touch the encoding when the input is not already valid UTF-8.
    // Running detection over every encoding mbstring knows can mis-detect
    // valid UTF-8 and pulls in non-text encodings (Base64, Quoted-Printable,
    // ...) that newer PHP versions deprecate.
    if (!mb_check_encoding($str, 'UTF-8')) {
        // Honour the configured detection order; when nothing matches,
        // re-encode UTF-8 => UTF-8, which replaces ill-formed byte sequences
        // with the mbstring substitute character.
        $detected = mb_detect_encoding($str, mb_detect_order(), true);
        $str = mb_convert_encoding($str, 'UTF-8', $detected !== false ? $detected : 'UTF-8');
    }

    $defaults = array(
        'delimiter' => '-',
        'limit' => null,
        'lowercase' => true,
        'replacements' => array(),
        'transliterate' => false,
    );

    // Merge options
    $options = array_merge($defaults, $options);
    $delimiter = $options['delimiter'];

    // Make custom replacements
    if (!empty($options['replacements'])) {
        $str = preg_replace(
            array_keys($options['replacements']),
            array_values($options['replacements']),
            $str
        );
    }

    // Transliterate characters to ASCII
    if ($options['transliterate']) {
        // Every key appears exactly once and uppercase letters map to
        // uppercase ASCII, so the result is correct with 'lowercase' => false.
        $char_map = array(
            // Latin
            'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A', 'Æ' => 'AE', 'Ç' => 'C',
            'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E', 'Ì' => 'I', 'Í' => 'I', 'Î' => 'I', 'Ï' => 'I',
            'Ð' => 'D', 'Ñ' => 'N', 'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O', 'Ő' => 'O',
            'Ø' => 'O', 'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U', 'Ű' => 'U', 'Ý' => 'Y', 'Þ' => 'TH',
            'ß' => 'ss',
            'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a', 'å' => 'a', 'æ' => 'ae', 'ç' => 'c',
            'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e', 'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i',
            'ð' => 'd', 'ñ' => 'n', 'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ő' => 'o',
            'ø' => 'o', 'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u', 'ű' => 'u', 'ý' => 'y', 'þ' => 'th',
            'ÿ' => 'y',
            // Latin symbols
            '©' => '(c)',
            // Greek
            'Α' => 'A', 'Β' => 'B', 'Γ' => 'G', 'Δ' => 'D', 'Ε' => 'E', 'Ζ' => 'Z', 'Η' => 'H', 'Θ' => '8',
            'Ι' => 'I', 'Κ' => 'K', 'Λ' => 'L', 'Μ' => 'M', 'Ν' => 'N', 'Ξ' => '3', 'Ο' => 'O', 'Π' => 'P',
            'Ρ' => 'R', 'Σ' => 'S', 'Τ' => 'T', 'Υ' => 'Y', 'Φ' => 'F', 'Χ' => 'X', 'Ψ' => 'PS', 'Ω' => 'W',
            'Ά' => 'A', 'Έ' => 'E', 'Ί' => 'I', 'Ό' => 'O', 'Ύ' => 'Y', 'Ή' => 'H', 'Ώ' => 'W', 'Ϊ' => 'I',
            'Ϋ' => 'Y',
            'α' => 'a', 'β' => 'b', 'γ' => 'g', 'δ' => 'd', 'ε' => 'e', 'ζ' => 'z', 'η' => 'h', 'θ' => '8',
            'ι' => 'i', 'κ' => 'k', 'λ' => 'l', 'μ' => 'm', 'ν' => 'n', 'ξ' => '3', 'ο' => 'o', 'π' => 'p',
            'ρ' => 'r', 'σ' => 's', 'τ' => 't', 'υ' => 'y', 'φ' => 'f', 'χ' => 'x', 'ψ' => 'ps', 'ω' => 'w',
            'ά' => 'a', 'έ' => 'e', 'ί' => 'i', 'ό' => 'o', 'ύ' => 'y', 'ή' => 'h', 'ώ' => 'w', 'ς' => 's',
            'ϊ' => 'i', 'ΰ' => 'y', 'ϋ' => 'y', 'ΐ' => 'i',
            // Turkish (letters shared with Latin are listed there)
            'Ş' => 'S', 'İ' => 'I', 'Ğ' => 'G',
            'ş' => 's', 'ı' => 'i', 'ğ' => 'g',
            // Russian
            'А' => 'A', 'Б' => 'B', 'В' => 'V', 'Г' => 'G', 'Д' => 'D', 'Е' => 'E', 'Ё' => 'Yo', 'Ж' => 'Zh',
            'З' => 'Z', 'И' => 'I', 'Й' => 'J', 'К' => 'K', 'Л' => 'L', 'М' => 'M', 'Н' => 'N', 'О' => 'O',
            'П' => 'P', 'Р' => 'R', 'С' => 'S', 'Т' => 'T', 'У' => 'U', 'Ф' => 'F', 'Х' => 'H', 'Ц' => 'C',
            'Ч' => 'Ch', 'Ш' => 'Sh', 'Щ' => 'Sh', 'Ъ' => '', 'Ы' => 'Y', 'Ь' => '', 'Э' => 'E', 'Ю' => 'Yu',
            'Я' => 'Ya',
            'а' => 'a', 'б' => 'b', 'в' => 'v', 'г' => 'g', 'д' => 'd', 'е' => 'e', 'ё' => 'yo', 'ж' => 'zh',
            'з' => 'z', 'и' => 'i', 'й' => 'j', 'к' => 'k', 'л' => 'l', 'м' => 'm', 'н' => 'n', 'о' => 'o',
            'п' => 'p', 'р' => 'r', 'с' => 's', 'т' => 't', 'у' => 'u', 'ф' => 'f', 'х' => 'h', 'ц' => 'c',
            'ч' => 'ch', 'ш' => 'sh', 'щ' => 'sh', 'ъ' => '', 'ы' => 'y', 'ь' => '', 'э' => 'e', 'ю' => 'yu',
            'я' => 'ya',
            // Ukrainian
            'Є' => 'Ye', 'І' => 'I', 'Ї' => 'Yi', 'Ґ' => 'G',
            'є' => 'ye', 'і' => 'i', 'ї' => 'yi', 'ґ' => 'g',
            // Czech
            'Č' => 'C', 'Ď' => 'D', 'Ě' => 'E', 'Ň' => 'N', 'Ř' => 'R', 'Š' => 'S', 'Ť' => 'T', 'Ů' => 'U',
            'Ž' => 'Z',
            'č' => 'c', 'ď' => 'd', 'ě' => 'e', 'ň' => 'n', 'ř' => 'r', 'š' => 's', 'ť' => 't', 'ů' => 'u',
            'ž' => 'z',
            // Polish (Ó/ó are listed under Latin)
            'Ą' => 'A', 'Ć' => 'C', 'Ę' => 'E', 'Ł' => 'L', 'Ń' => 'N', 'Ś' => 'S', 'Ź' => 'Z',
            'Ż' => 'Z',
            'ą' => 'a', 'ć' => 'c', 'ę' => 'e', 'ł' => 'l', 'ń' => 'n', 'ś' => 's', 'ź' => 'z',
            'ż' => 'z',
            // Latvian (Č/Š/Ž and č/š/ž are listed under Czech)
            'Ā' => 'A', 'Ē' => 'E', 'Ģ' => 'G', 'Ī' => 'I', 'Ķ' => 'K', 'Ļ' => 'L', 'Ņ' => 'N',
            'Ū' => 'U',
            'ā' => 'a', 'ē' => 'e', 'ģ' => 'g', 'ī' => 'i', 'ķ' => 'k', 'ļ' => 'l', 'ņ' => 'n',
            'ū' => 'u',
        );

        // Keys are complete UTF-8 sequences and values are plain ASCII, so a
        // single pass cannot re-replace its own output.
        $str = strtr($str, $char_map);
    }

    // Replace non-alphanumeric characters with our delimiter
    $str = preg_replace('/[^\p{L}\p{Nd}]+/u', $delimiter, $str);

    if ($delimiter !== '') {
        // Remove duplicate delimiters
        $quoted = preg_quote($delimiter, '/');
        $str = preg_replace('/(' . $quoted . '){2,}/', '$1', $str);
    }

    // Truncate slug to max. characters
    if ($options['limit']) {
        $str = mb_substr($str, 0, $options['limit'], 'UTF-8');
    }

    // Remove delimiter from ends. trim() would treat the delimiter as a set
    // of single bytes, which can cut into neighbouring multi-byte characters
    // (or misread '..' as a range), so match whole delimiter occurrences.
    if ($delimiter !== '') {
        $str = preg_replace('/^(?:' . $quoted . ')+|(?:' . $quoted . ')+\z/', '', $str);
    }

    return $options['lowercase'] ? mb_strtolower($str, 'UTF-8') : $str;
}
?> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
include('url_slug.php');

header('Content-type: text/plain; charset=utf-8');

// Each entry pairs an input string with the options handed to url_slug().
// An empty options array means "use the defaults".
$examples = array(
    // Basic usage
    array("This is an example string. Nothing fancy.", array()),
    // French text containing unwanted characters (' and ?)
    array("Qu'en est-il français? Ça marche alors?", array()),
    // Transliteration of a supported language
    array("Что делать, если я не хочу, UTF-8?", array('transliterate' => true)),
    // Transliteration requested for a language missing from the character map
    array("מה אם אני לא רוצה UTF-8 תווים?", array('transliterate' => true)),
    // Custom delimiter, length limit, preserved case and regex replacements
    array(
        "This is an Example String. What's Going to Happen to Me?",
        array(
            'delimiter' => '_',
            'limit' => 40,
            'lowercase' => false,
            'replacements' => array(
                '/\b(an)\b/i' => 'a',
                '/\b(example)\b/i' => 'Test'
            )
        )
    ),
);

// Print every input followed by its slug; examples are separated by a blank
// line and nothing is printed after the final slug.
$last_index = count($examples) - 1;
foreach ($examples as $index => $example) {
    list($input, $slug_options) = $example;
    echo $input . "\n";
    echo url_slug($input, $slug_options) . ($index < $last_index ? "\n\n" : '');
}

/*
Output:

This is an example string. Nothing fancy.
this-is-an-example-string-nothing-fancy

Qu'en est-il français? Ça marche alors?
qu-en-est-il-français-ça-marche-alors

Что делать, если я не хочу, UTF-8?
chto-delat-esli-ya-ne-hochu-utf-8

מה אם אני לא רוצה UTF-8 תווים?
מה-אם-אני-לא-רוצה-utf-8-תווים

This is an Example String. What's Going to Happen to Me?
This_is_a_Test_String_What_s_Going_to_Ha
*/
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment