Last active
November 17, 2016 12:29
-
-
Save dinhquochan/b8e1b111646c74cfbaceb98d20699520 to your computer and use it in GitHub Desktop.
String Helpers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* @package String Helpers | |
* @version 1.0 | |
* @author Dinh Quoc Han <[email protected]> | |
* @link https://henrydinh.com | |
* @license MIT | |
*/ | |
class Str | |
{ | |
/** | |
* Transform Vietnamese to English | |
* | |
* @param string $str Vietnamese String | |
* @return string English string | |
*/ | |
static function vi2en($str) | |
{ | |
$arrVietnamese = [ | |
"à","á","ạ","ả","ã","â","ầ","ấ","ậ","ẩ","ẫ","ă","ằ","ắ","ặ","ẳ","ẵ", | |
"è","é","ẹ","ẻ","ẽ","ê","ề","ế","ệ","ể","ễ", | |
"ì","í","ị","ỉ","ĩ", | |
"ò","ó","ọ","ỏ","õ","ô","ồ","ố","ộ","ổ","ỗ","ơ","ờ","ớ","ợ","ở","ỡ", | |
"ù","ú","ụ","ủ","ũ","ư","ừ","ứ","ự","ử","ữ", | |
"ỳ","ý","ỵ","ỷ","ỹ", | |
"đ", | |
"À","Á","Ạ","Ả","Ã","Â","Ầ","Ấ","Ậ","Ẩ","Ẫ","Ă","Ằ","Ắ","Ặ","Ẳ","Ẵ", | |
"È","É","Ẹ","Ẻ","Ẽ","Ê","Ề","Ế","Ệ","Ể","Ễ", | |
"Ì","Í","Ị","Ỉ","Ĩ", | |
"Ò","Ó","Ọ","Ỏ","Õ","Ô","Ồ","Ố","Ộ","Ổ","Ỗ","Ơ","Ờ","Ớ","Ợ","Ở","Ỡ", | |
"Ù","Ú","Ụ","Ủ","Ũ","Ư","Ừ","Ứ","Ự","Ử","Ữ", | |
"Ỳ","Ý","Ỵ","Ỷ","Ỹ", | |
"Đ" | |
]; | |
$arrEnglish = [ | |
"a","a","a","a","a","a","a","a","a","a","a","a","a","a","a","a","a", | |
"e","e","e","e","e","e","e","e","e","e","e", | |
"i","i","i","i","i", | |
"o","o","o","o","o","o","o","o","o","o","o","o","o","o","o","o","o", | |
"u","u","u","u","u","u","u","u","u","u","u", | |
"y","y","y","y","y", | |
"d", | |
"A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A", | |
"E","E","E","E","E","E","E","E","E","E","E", | |
"I","I","I","I","I", | |
"O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O", | |
"U","U","U","U","U","U","U","U","U","U","U", | |
"Y","Y","Y","Y","Y", | |
"D" | |
]; | |
return str_replace($arrVietnamese, $arrEnglish, $str); | |
} | |
/** | |
* UTF8 URI Encoder | |
* | |
* @param string $utf8_string | |
* @param int $length | |
* @return string utf-8 URI | |
*/ | |
static function utf8_uri_encode( $utf8_string, $length = 0 ) { | |
$unicode = ''; | |
$values = array(); | |
$num_octets = 1; | |
$unicode_length = 0; | |
$string_length = mb_strlen( $utf8_string ); | |
for ($i = 0; $i < $string_length; $i++ ) { | |
$value = ord( $utf8_string[ $i ] ); | |
if ( $value < 128 ) { | |
if ( $length && ( $unicode_length >= $length ) ) | |
break; | |
$unicode .= chr($value); | |
$unicode_length++; | |
} else { | |
if ( count( $values ) == 0 ) { | |
if ( $value < 224 ) { | |
$num_octets = 2; | |
} elseif ( $value < 240 ) { | |
$num_octets = 3; | |
} else { | |
$num_octets = 4; | |
} | |
} | |
$values[] = $value; | |
if ( $length && ( $unicode_length + ($num_octets * 3) ) > $length ) | |
break; | |
if ( count( $values ) == $num_octets ) { | |
for ( $j = 0; $j < $num_octets; $j++ ) { | |
$unicode .= '%' . dechex( $values[ $j ] ); | |
} | |
$unicode_length += $num_octets * 3; | |
$values = array(); | |
$num_octets = 1; | |
} | |
} | |
} | |
return $unicode; | |
} | |
/** | |
* Remove all accents | |
* | |
* @param $string | |
* @return mixed|string | |
*/ | |
static function remove_accents( $string ) { | |
if ( !preg_match('/[\x80-\xff]/', $string) ) | |
return $string; | |
if (static::seems_utf8($string)) { | |
$chars = array( | |
// Decompositions for Latin-1 Supplement | |
chr(194).chr(170) => 'a', chr(194).chr(186) => 'o', | |
chr(195).chr(128) => 'A', chr(195).chr(129) => 'A', | |
chr(195).chr(130) => 'A', chr(195).chr(131) => 'A', | |
chr(195).chr(132) => 'A', chr(195).chr(133) => 'A', | |
chr(195).chr(134) => 'AE',chr(195).chr(135) => 'C', | |
chr(195).chr(136) => 'E', chr(195).chr(137) => 'E', | |
chr(195).chr(138) => 'E', chr(195).chr(139) => 'E', | |
chr(195).chr(140) => 'I', chr(195).chr(141) => 'I', | |
chr(195).chr(142) => 'I', chr(195).chr(143) => 'I', | |
chr(195).chr(144) => 'D', chr(195).chr(145) => 'N', | |
chr(195).chr(146) => 'O', chr(195).chr(147) => 'O', | |
chr(195).chr(148) => 'O', chr(195).chr(149) => 'O', | |
chr(195).chr(150) => 'O', chr(195).chr(153) => 'U', | |
chr(195).chr(154) => 'U', chr(195).chr(155) => 'U', | |
chr(195).chr(156) => 'U', chr(195).chr(157) => 'Y', | |
chr(195).chr(158) => 'TH',chr(195).chr(159) => 's', | |
chr(195).chr(160) => 'a', chr(195).chr(161) => 'a', | |
chr(195).chr(162) => 'a', chr(195).chr(163) => 'a', | |
chr(195).chr(164) => 'a', chr(195).chr(165) => 'a', | |
chr(195).chr(166) => 'ae',chr(195).chr(167) => 'c', | |
chr(195).chr(168) => 'e', chr(195).chr(169) => 'e', | |
chr(195).chr(170) => 'e', chr(195).chr(171) => 'e', | |
chr(195).chr(172) => 'i', chr(195).chr(173) => 'i', | |
chr(195).chr(174) => 'i', chr(195).chr(175) => 'i', | |
chr(195).chr(176) => 'd', chr(195).chr(177) => 'n', | |
chr(195).chr(178) => 'o', chr(195).chr(179) => 'o', | |
chr(195).chr(180) => 'o', chr(195).chr(181) => 'o', | |
chr(195).chr(182) => 'o', chr(195).chr(184) => 'o', | |
chr(195).chr(185) => 'u', chr(195).chr(186) => 'u', | |
chr(195).chr(187) => 'u', chr(195).chr(188) => 'u', | |
chr(195).chr(189) => 'y', chr(195).chr(190) => 'th', | |
chr(195).chr(191) => 'y', chr(195).chr(152) => 'O', | |
// Decompositions for Latin Extended-A | |
chr(196).chr(128) => 'A', chr(196).chr(129) => 'a', | |
chr(196).chr(130) => 'A', chr(196).chr(131) => 'a', | |
chr(196).chr(132) => 'A', chr(196).chr(133) => 'a', | |
chr(196).chr(134) => 'C', chr(196).chr(135) => 'c', | |
chr(196).chr(136) => 'C', chr(196).chr(137) => 'c', | |
chr(196).chr(138) => 'C', chr(196).chr(139) => 'c', | |
chr(196).chr(140) => 'C', chr(196).chr(141) => 'c', | |
chr(196).chr(142) => 'D', chr(196).chr(143) => 'd', | |
chr(196).chr(144) => 'D', chr(196).chr(145) => 'd', | |
chr(196).chr(146) => 'E', chr(196).chr(147) => 'e', | |
chr(196).chr(148) => 'E', chr(196).chr(149) => 'e', | |
chr(196).chr(150) => 'E', chr(196).chr(151) => 'e', | |
chr(196).chr(152) => 'E', chr(196).chr(153) => 'e', | |
chr(196).chr(154) => 'E', chr(196).chr(155) => 'e', | |
chr(196).chr(156) => 'G', chr(196).chr(157) => 'g', | |
chr(196).chr(158) => 'G', chr(196).chr(159) => 'g', | |
chr(196).chr(160) => 'G', chr(196).chr(161) => 'g', | |
chr(196).chr(162) => 'G', chr(196).chr(163) => 'g', | |
chr(196).chr(164) => 'H', chr(196).chr(165) => 'h', | |
chr(196).chr(166) => 'H', chr(196).chr(167) => 'h', | |
chr(196).chr(168) => 'I', chr(196).chr(169) => 'i', | |
chr(196).chr(170) => 'I', chr(196).chr(171) => 'i', | |
chr(196).chr(172) => 'I', chr(196).chr(173) => 'i', | |
chr(196).chr(174) => 'I', chr(196).chr(175) => 'i', | |
chr(196).chr(176) => 'I', chr(196).chr(177) => 'i', | |
chr(196).chr(178) => 'IJ',chr(196).chr(179) => 'ij', | |
chr(196).chr(180) => 'J', chr(196).chr(181) => 'j', | |
chr(196).chr(182) => 'K', chr(196).chr(183) => 'k', | |
chr(196).chr(184) => 'k', chr(196).chr(185) => 'L', | |
chr(196).chr(186) => 'l', chr(196).chr(187) => 'L', | |
chr(196).chr(188) => 'l', chr(196).chr(189) => 'L', | |
chr(196).chr(190) => 'l', chr(196).chr(191) => 'L', | |
chr(197).chr(128) => 'l', chr(197).chr(129) => 'L', | |
chr(197).chr(130) => 'l', chr(197).chr(131) => 'N', | |
chr(197).chr(132) => 'n', chr(197).chr(133) => 'N', | |
chr(197).chr(134) => 'n', chr(197).chr(135) => 'N', | |
chr(197).chr(136) => 'n', chr(197).chr(137) => 'N', | |
chr(197).chr(138) => 'n', chr(197).chr(139) => 'N', | |
chr(197).chr(140) => 'O', chr(197).chr(141) => 'o', | |
chr(197).chr(142) => 'O', chr(197).chr(143) => 'o', | |
chr(197).chr(144) => 'O', chr(197).chr(145) => 'o', | |
chr(197).chr(146) => 'OE',chr(197).chr(147) => 'oe', | |
chr(197).chr(148) => 'R',chr(197).chr(149) => 'r', | |
chr(197).chr(150) => 'R',chr(197).chr(151) => 'r', | |
chr(197).chr(152) => 'R',chr(197).chr(153) => 'r', | |
chr(197).chr(154) => 'S',chr(197).chr(155) => 's', | |
chr(197).chr(156) => 'S',chr(197).chr(157) => 's', | |
chr(197).chr(158) => 'S',chr(197).chr(159) => 's', | |
chr(197).chr(160) => 'S', chr(197).chr(161) => 's', | |
chr(197).chr(162) => 'T', chr(197).chr(163) => 't', | |
chr(197).chr(164) => 'T', chr(197).chr(165) => 't', | |
chr(197).chr(166) => 'T', chr(197).chr(167) => 't', | |
chr(197).chr(168) => 'U', chr(197).chr(169) => 'u', | |
chr(197).chr(170) => 'U', chr(197).chr(171) => 'u', | |
chr(197).chr(172) => 'U', chr(197).chr(173) => 'u', | |
chr(197).chr(174) => 'U', chr(197).chr(175) => 'u', | |
chr(197).chr(176) => 'U', chr(197).chr(177) => 'u', | |
chr(197).chr(178) => 'U', chr(197).chr(179) => 'u', | |
chr(197).chr(180) => 'W', chr(197).chr(181) => 'w', | |
chr(197).chr(182) => 'Y', chr(197).chr(183) => 'y', | |
chr(197).chr(184) => 'Y', chr(197).chr(185) => 'Z', | |
chr(197).chr(186) => 'z', chr(197).chr(187) => 'Z', | |
chr(197).chr(188) => 'z', chr(197).chr(189) => 'Z', | |
chr(197).chr(190) => 'z', chr(197).chr(191) => 's', | |
// Decompositions for Latin Extended-B | |
chr(200).chr(152) => 'S', chr(200).chr(153) => 's', | |
chr(200).chr(154) => 'T', chr(200).chr(155) => 't', | |
// Euro Sign | |
chr(226).chr(130).chr(172) => 'E', | |
// GBP (Pound) Sign | |
chr(194).chr(163) => '', | |
// Vowels with diacritic (Vietnamese) | |
// unmarked | |
chr(198).chr(160) => 'O', chr(198).chr(161) => 'o', | |
chr(198).chr(175) => 'U', chr(198).chr(176) => 'u', | |
// grave accent | |
chr(225).chr(186).chr(166) => 'A', chr(225).chr(186).chr(167) => 'a', | |
chr(225).chr(186).chr(176) => 'A', chr(225).chr(186).chr(177) => 'a', | |
chr(225).chr(187).chr(128) => 'E', chr(225).chr(187).chr(129) => 'e', | |
chr(225).chr(187).chr(146) => 'O', chr(225).chr(187).chr(147) => 'o', | |
chr(225).chr(187).chr(156) => 'O', chr(225).chr(187).chr(157) => 'o', | |
chr(225).chr(187).chr(170) => 'U', chr(225).chr(187).chr(171) => 'u', | |
chr(225).chr(187).chr(178) => 'Y', chr(225).chr(187).chr(179) => 'y', | |
// hook | |
chr(225).chr(186).chr(162) => 'A', chr(225).chr(186).chr(163) => 'a', | |
chr(225).chr(186).chr(168) => 'A', chr(225).chr(186).chr(169) => 'a', | |
chr(225).chr(186).chr(178) => 'A', chr(225).chr(186).chr(179) => 'a', | |
chr(225).chr(186).chr(186) => 'E', chr(225).chr(186).chr(187) => 'e', | |
chr(225).chr(187).chr(130) => 'E', chr(225).chr(187).chr(131) => 'e', | |
chr(225).chr(187).chr(136) => 'I', chr(225).chr(187).chr(137) => 'i', | |
chr(225).chr(187).chr(142) => 'O', chr(225).chr(187).chr(143) => 'o', | |
chr(225).chr(187).chr(148) => 'O', chr(225).chr(187).chr(149) => 'o', | |
chr(225).chr(187).chr(158) => 'O', chr(225).chr(187).chr(159) => 'o', | |
chr(225).chr(187).chr(166) => 'U', chr(225).chr(187).chr(167) => 'u', | |
chr(225).chr(187).chr(172) => 'U', chr(225).chr(187).chr(173) => 'u', | |
chr(225).chr(187).chr(182) => 'Y', chr(225).chr(187).chr(183) => 'y', | |
// tilde | |
chr(225).chr(186).chr(170) => 'A', chr(225).chr(186).chr(171) => 'a', | |
chr(225).chr(186).chr(180) => 'A', chr(225).chr(186).chr(181) => 'a', | |
chr(225).chr(186).chr(188) => 'E', chr(225).chr(186).chr(189) => 'e', | |
chr(225).chr(187).chr(132) => 'E', chr(225).chr(187).chr(133) => 'e', | |
chr(225).chr(187).chr(150) => 'O', chr(225).chr(187).chr(151) => 'o', | |
chr(225).chr(187).chr(160) => 'O', chr(225).chr(187).chr(161) => 'o', | |
chr(225).chr(187).chr(174) => 'U', chr(225).chr(187).chr(175) => 'u', | |
chr(225).chr(187).chr(184) => 'Y', chr(225).chr(187).chr(185) => 'y', | |
// acute accent | |
chr(225).chr(186).chr(164) => 'A', chr(225).chr(186).chr(165) => 'a', | |
chr(225).chr(186).chr(174) => 'A', chr(225).chr(186).chr(175) => 'a', | |
chr(225).chr(186).chr(190) => 'E', chr(225).chr(186).chr(191) => 'e', | |
chr(225).chr(187).chr(144) => 'O', chr(225).chr(187).chr(145) => 'o', | |
chr(225).chr(187).chr(154) => 'O', chr(225).chr(187).chr(155) => 'o', | |
chr(225).chr(187).chr(168) => 'U', chr(225).chr(187).chr(169) => 'u', | |
// dot below | |
chr(225).chr(186).chr(160) => 'A', chr(225).chr(186).chr(161) => 'a', | |
chr(225).chr(186).chr(172) => 'A', chr(225).chr(186).chr(173) => 'a', | |
chr(225).chr(186).chr(182) => 'A', chr(225).chr(186).chr(183) => 'a', | |
chr(225).chr(186).chr(184) => 'E', chr(225).chr(186).chr(185) => 'e', | |
chr(225).chr(187).chr(134) => 'E', chr(225).chr(187).chr(135) => 'e', | |
chr(225).chr(187).chr(138) => 'I', chr(225).chr(187).chr(139) => 'i', | |
chr(225).chr(187).chr(140) => 'O', chr(225).chr(187).chr(141) => 'o', | |
chr(225).chr(187).chr(152) => 'O', chr(225).chr(187).chr(153) => 'o', | |
chr(225).chr(187).chr(162) => 'O', chr(225).chr(187).chr(163) => 'o', | |
chr(225).chr(187).chr(164) => 'U', chr(225).chr(187).chr(165) => 'u', | |
chr(225).chr(187).chr(176) => 'U', chr(225).chr(187).chr(177) => 'u', | |
chr(225).chr(187).chr(180) => 'Y', chr(225).chr(187).chr(181) => 'y', | |
// Vowels with diacritic (Chinese, Hanyu Pinyin) | |
chr(201).chr(145) => 'a', | |
// macron | |
chr(199).chr(149) => 'U', chr(199).chr(150) => 'u', | |
// acute accent | |
chr(199).chr(151) => 'U', chr(199).chr(152) => 'u', | |
// caron | |
chr(199).chr(141) => 'A', chr(199).chr(142) => 'a', | |
chr(199).chr(143) => 'I', chr(199).chr(144) => 'i', | |
chr(199).chr(145) => 'O', chr(199).chr(146) => 'o', | |
chr(199).chr(147) => 'U', chr(199).chr(148) => 'u', | |
chr(199).chr(153) => 'U', chr(199).chr(154) => 'u', | |
// grave accent | |
chr(199).chr(155) => 'U', chr(199).chr(156) => 'u', | |
); | |
$string = strtr($string, $chars); | |
} else { | |
$chars = array(); | |
// Assume ISO-8859-1 if not UTF-8 | |
$chars['in'] = chr(128).chr(131).chr(138).chr(142).chr(154).chr(158) | |
.chr(159).chr(162).chr(165).chr(181).chr(192).chr(193).chr(194) | |
.chr(195).chr(196).chr(197).chr(199).chr(200).chr(201).chr(202) | |
.chr(203).chr(204).chr(205).chr(206).chr(207).chr(209).chr(210) | |
.chr(211).chr(212).chr(213).chr(214).chr(216).chr(217).chr(218) | |
.chr(219).chr(220).chr(221).chr(224).chr(225).chr(226).chr(227) | |
.chr(228).chr(229).chr(231).chr(232).chr(233).chr(234).chr(235) | |
.chr(236).chr(237).chr(238).chr(239).chr(241).chr(242).chr(243) | |
.chr(244).chr(245).chr(246).chr(248).chr(249).chr(250).chr(251) | |
.chr(252).chr(253).chr(255); | |
$chars['out'] = "EfSZszYcYuAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy"; | |
$string = strtr($string, $chars['in'], $chars['out']); | |
$double_chars = array(); | |
$double_chars['in'] = array(chr(140), chr(156), chr(198), chr(208), chr(222), chr(223), chr(230), chr(240), chr(254)); | |
$double_chars['out'] = array('OE', 'oe', 'AE', 'DH', 'TH', 'ss', 'ae', 'dh', 'th'); | |
$string = str_replace($double_chars['in'], $double_chars['out'], $string); | |
} | |
return $string; | |
} | |
/** | |
* Seems urf-8 | |
* @param $str | |
* @return bool | |
*/ | |
static function seems_utf8( $str ) { | |
$length = mb_strlen($str); | |
for ($i=0; $i < $length; $i++) { | |
$c = ord($str[$i]); | |
if ($c < 0x80) $n = 0; // 0bbbbbbb | |
elseif (($c & 0xE0) == 0xC0) $n=1; // 110bbbbb | |
elseif (($c & 0xF0) == 0xE0) $n=2; // 1110bbbb | |
elseif (($c & 0xF8) == 0xF0) $n=3; // 11110bbb | |
elseif (($c & 0xFC) == 0xF8) $n=4; // 111110bb | |
elseif (($c & 0xFE) == 0xFC) $n=5; // 1111110b | |
else return false; // Does not match any model | |
for ($j=0; $j<$n; $j++) { // n bytes matching 10bbbbbb follow ? | |
if ((++$i == $length) || ((ord($str[$i]) & 0xC0) != 0x80)) | |
return false; | |
} | |
} | |
return true; | |
} | |
/** | |
* Create friendly slug | |
* | |
* @param string $str title or a string want to create friendly slug | |
* @return string friendly slug | |
*/ | |
static function friendlySlug($str) { | |
$str = static::vi2en($str); | |
$str = strip_tags($str); | |
// Preserve escaped octets. | |
$str = preg_replace('|%([a-fA-F0-9][a-fA-F0-9])|', '---$1---', $str); | |
// Remove percent signs that are not part of an octet. | |
$str = str_replace('%', '', $str); | |
// Restore octets. | |
$str = preg_replace('|---([a-fA-F0-9][a-fA-F0-9])---|', '%$1', $str); | |
$str = static::remove_accents($str); | |
if (static::seems_utf8($str)) { | |
if (function_exists('mb_strtolower')) { | |
$str = mb_strtolower($str, 'UTF-8'); | |
} | |
$str = static::utf8_uri_encode($str, 200); | |
} | |
$str = strtolower($str); | |
$str = preg_replace('/&.+?;/', '', $str); // kill entities | |
$str = str_replace('.', '-', $str); | |
$str = preg_replace('/[^%a-z0-9 _-]/', '', $str); | |
$str = preg_replace('/\s+/', '-', $str); | |
$str = preg_replace('|-+|', '-', $str); | |
$str = trim($str, '-'); | |
$str = static::vi2en($str); | |
return $str; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment