Last active
August 29, 2015 14:13
-
-
Save PEKTOP/f22d21aac6f5815df80b to your computer and use it in GitHub Desktop.
Slug
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Builds URL-friendly slugs from arbitrary (primarily Russian) text.
 *
 * Input is lower-cased, Cyrillic letters are transliterated to Latin, and the
 * result is reduced to the characters a-z, 0-9, "_", "-" plus percent-encoded
 * octets for any remaining non-ASCII characters.
 *
 * NOTE(review): sanitize(), seems_utf8() and utf8_uri_encode() appear to be
 * ported from the WordPress functions sanitize_title_with_dashes(),
 * seems_utf8() and utf8_uri_encode() - confirm before diverging from them.
 */
class Slug
{
    /**
     * Converts a string into a slug.
     *
     * @param string $string Text to convert.
     * @return string|null The slug, or null when $string is not a string.
     */
    public function make($string)
    {
        if (!is_string($string)) {
            // Kept for backward compatibility: callers receive null rather
            // than an exception for non-string input.
            return null;
        }

        return $this->sanitize($this->ruToLat($string));
    }

    /**
     * Lower-cases the text and transliterates Russian letters to Latin.
     *
     * Characters that are not keys of the selected table are left untouched.
     *
     * @param string $str    UTF-8 text to transliterate.
     * @param string $schema Transliteration table to use: 'gost' or 'passport'.
     * @return string The lower-cased, transliterated text.
     * @throws InvalidArgumentException When $schema names no known table.
     */
    protected function ruToLat($str, $schema = 'gost')
    {
        $map = [
            'passport' => [
                'а' => 'a',
                'б' => 'b',
                'в' => 'v',
                'г' => 'g',
                'д' => 'd',
                'е' => 'e',
                'ё' => 'e',
                'ж' => 'zh',
                'з' => 'z',
                'и' => 'i',
                'й' => 'i',
                'к' => 'k',
                'л' => 'l',
                'м' => 'm',
                'н' => 'n',
                'о' => 'o',
                'п' => 'p',
                'р' => 'r',
                'с' => 's',
                'т' => 't',
                'у' => 'u',
                'ф' => 'f',
                'х' => 'kh',
                'ц' => 'tc',
                'ч' => 'ch',
                'ш' => 'sh',
                'щ' => 'shch',
                'ъ' => '',
                'ь' => '',
                'ы' => 'y',
                'э' => 'e',
                'ю' => 'iu',
                'я' => 'ia',
            ],
            'gost' => [
                'а' => 'a',
                'б' => 'b',
                'в' => 'v',
                'г' => 'g',
                'д' => 'd',
                'е' => 'e',
                'ё' => 'yo',
                'ж' => 'zh',
                'з' => 'z',
                'и' => 'i',
                'й' => 'j',
                'к' => 'k',
                'л' => 'l',
                'м' => 'm',
                'н' => 'n',
                'о' => 'o',
                'п' => 'p',
                'р' => 'r',
                'с' => 's',
                'т' => 't',
                'у' => 'u',
                'ф' => 'f',
                'х' => 'x',
                'ц' => 'c',
                'ч' => 'ch',
                'ш' => 'sh',
                'щ' => 'shh',
                'ъ' => '',
                'ь' => '',
                'ы' => 'y',
                'э' => 'e',
                'ю' => 'yu',
                'я' => 'ya',
            ],
        ];

        // Fail loudly on an unknown schema instead of handing strtr() a
        // missing table (notice + false on PHP 7, TypeError on PHP 8).
        if (!is_string($schema) || !isset($map[$schema])) {
            throw new InvalidArgumentException(sprintf(
                'Unknown transliteration schema "%s"; expected one of: %s.',
                is_string($schema) ? $schema : gettype($schema),
                implode(', ', array_keys($map))
            ));
        }

        // The tables only contain lower-case keys, so lower-case first.
        return strtr(mb_strtolower($str, 'UTF-8'), $map[$schema]);
    }

    /**
     * Sanitizes a title, replacing whitespace and a few other characters with dashes.
     *
     * Limits the output to lower-case alphanumeric characters, underscore (_),
     * dash (-) and percent-encoded octets. Whitespace and dots become a dash;
     * runs of dashes are collapsed and leading/trailing dashes removed.
     *
     * @param string $title The title to be sanitized.
     * @return string The sanitized title.
     */
    protected function sanitize($title)
    {
        $title = strip_tags($title);

        // Protect already-escaped octets (%XX), drop every other percent
        // sign, then put the protected octets back.
        $title = preg_replace('|%([a-fA-F0-9][a-fA-F0-9])|', '---$1---', $title);
        $title = str_replace('%', '', $title);
        $title = preg_replace('|---([a-fA-F0-9][a-fA-F0-9])---|', '%$1', $title);

        if ($this->seems_utf8($title)) {
            if (function_exists('mb_strtolower')) {
                $title = mb_strtolower($title, 'UTF-8');
            }
            // Percent-encode non-ASCII characters, capping the result at 200 bytes.
            $title = $this->utf8_uri_encode($title, 200);
        }

        // Lower-case ASCII even when the UTF-8 branch above was skipped (or
        // mbstring is missing); otherwise the whitelist below would delete
        // upper-case letters instead of keeping them.
        $title = strtolower($title);

        // Kill entities.
        $title = preg_replace('/&.+?;/', '', $title);
        $title = str_replace('.', '-', $title);

        // Convert nbsp, ndash and mdash to hyphens.
        $title = str_replace(['%c2%a0', '%e2%80%93', '%e2%80%94'], '-', $title);

        // Strip these characters entirely.
        $stripped = [
            // iexcl and iquest
            '%c2%a1', '%c2%bf',
            // angle quotes
            '%c2%ab', '%c2%bb', '%e2%80%b9', '%e2%80%ba',
            // curly quotes
            '%e2%80%98', '%e2%80%99', '%e2%80%9c', '%e2%80%9d',
            '%e2%80%9a', '%e2%80%9b', '%e2%80%9e', '%e2%80%9f',
            // copy, reg, deg, hellip and trade
            '%c2%a9', '%c2%ae', '%c2%b0', '%e2%80%a6', '%e2%84%a2',
            // acute accents
            '%c2%b4', '%cb%8a', '%cc%81', '%cd%81',
            // grave accent, macron, caron
            '%cc%80', '%cc%84', '%cc%8c',
            // numero sign
            '%e2%84%96',
        ];
        $title = str_replace($stripped, '', $title);

        // Convert the multiplication sign to "x".
        $title = str_replace('%c3%97', 'x', $title);

        // Whitelist the remaining characters, then normalise separators.
        $title = preg_replace('/[^%a-z0-9 _-]/', '', $title);
        $title = preg_replace('/\s+/', '-', $title);
        $title = preg_replace('|-+|', '-', $title);

        return trim($title, '-');
    }

    /**
     * Checks to see if a string is utf8 encoded.
     *
     * This is a structural check only: every lead byte must be followed by
     * the number of 10bbbbbb continuation bytes it announces.
     *
     * NOTE: This function also accepts 5- and 6-byte sequences, although
     * UTF-8 byte sequences have a maximum length of 4.
     *
     * @param string $str The string to be checked
     * @return bool True if $str fits a UTF-8 model, false otherwise.
     */
    protected function seems_utf8($str)
    {
        $length = strlen($str);

        for ($i = 0; $i < $length; $i++) {
            $byte = ord($str[$i]);

            // Work out how many continuation bytes the lead byte announces.
            if ($byte < 0x80) {
                $continuations = 0; // 0bbbbbbb
            } elseif (($byte & 0xE0) === 0xC0) {
                $continuations = 1; // 110bbbbb
            } elseif (($byte & 0xF0) === 0xE0) {
                $continuations = 2; // 1110bbbb
            } elseif (($byte & 0xF8) === 0xF0) {
                $continuations = 3; // 11110bbb
            } elseif (($byte & 0xFC) === 0xF8) {
                $continuations = 4; // 111110bb
            } elseif (($byte & 0xFE) === 0xFC) {
                $continuations = 5; // 1111110b
            } else {
                return false; // Does not match any model.
            }

            // Each announced continuation byte must exist and match 10bbbbbb.
            for ($j = 0; $j < $continuations; $j++) {
                if (++$i === $length || (ord($str[$i]) & 0xC0) !== 0x80) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Encode the Unicode values to be used in the URI.
     *
     * ASCII bytes are copied as-is; every multi-byte character (2, 3 or 4
     * octets) is emitted as lower-case %xx escapes. When $length is non-zero
     * the output never exceeds $length bytes and a multi-byte character is
     * never cut in half.
     *
     * @param string $utf8_string
     * @param int    $length Max length of the string (0 means unlimited)
     * @return string String with Unicode encoded for URI.
     */
    protected function utf8_uri_encode($utf8_string, $length = 0)
    {
        $encoded = '';
        $pending = [];       // Octets collected for the current multi-byte character.
        $num_octets = 1;     // Expected size of the current character.
        $encoded_length = 0; // Byte length of $encoded so far.
        $string_length = strlen($utf8_string);

        for ($i = 0; $i < $string_length; $i++) {
            $value = ord($utf8_string[$i]);

            if ($value < 128) {
                if ($length && $encoded_length >= $length) {
                    break;
                }
                $encoded .= chr($value);
                $encoded_length++;
                continue;
            }

            if (count($pending) === 0) {
                // Lead byte: decide how many octets this character spans.
                // 4-byte sequences (lead >= 0xF0, e.g. emoji) must be kept
                // together, otherwise their last octet is lost.
                if ($value < 224) {
                    $num_octets = 2;
                } elseif ($value < 240) {
                    $num_octets = 3;
                } else {
                    $num_octets = 4;
                }
            }
            $pending[] = $value;

            // Each octet becomes "%xx" (3 bytes); stop before overflowing.
            if ($length && ($encoded_length + ($num_octets * 3)) > $length) {
                break;
            }

            if (count($pending) === $num_octets) {
                foreach ($pending as $octet) {
                    $encoded .= '%' . dechex($octet);
                }
                $encoded_length += $num_octets * 3;
                $pending = [];
                $num_octets = 1;
            }
        }

        return $encoded;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment