-
-
Save xus/c796612553223ab9fa66a5420546637f to your computer and use it in GitHub Desktop.
A PHP class to cleanup strings to be UTF8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/* Standardized data cleanup helper class */
/**
 * Static helpers that scrub "funky" bytes out of strings.
 *
 * NOTE(review): this copy of the file appears to have passed through an HTML
 * renderer. The section headed "HTML entities" in byteTranslations() and the
 * table in accentFoldTable() show literal characters where named entities
 * (e.g. "&Aacute;") were presumably written; the values are kept exactly as
 * displayed here - confirm against the upstream file before relying on them.
 */
class Cleanup {
    /**
     * Clean a string (or every string inside a nested array) of odd input bytes.
     *
     * Steps, in order:
     *  1. byte sequences listed in byteTranslations() are replaced;
     *  2. single bytes used as "smart" quotes / ellipsis are replaced by
     *     ' " and ..., and control bytes (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F,
     *     0x7F-0x9F) are removed. Tab, LF and CR are kept;
     *  3. if a multibyte UTF-8 sequence is still present, the string is
     *     transliterated to ISO-8859-1 and every byte other than tab, LF, CR
     *     and 0x20-0x7F is dropped.
     *
     * Values that are neither strings nor arrays are returned unchanged.
     *
     * @param mixed $str string, array of strings (any depth), or anything else
     * @return mixed same shape as the input
     */
    public static function makeUTF8($str) {
        if (is_array($str)) {
            $cleaned = array();
            foreach ($str as $key => $value) {
                $cleaned[$key] = self::makeUTF8($value);
            }
            return $cleaned;
        }
        if (!is_string($str)) {
            return $str;
        }

        $str = strtr($str, self::byteTranslations());

        // These classes work on single bytes (no /u modifier).
        $str = preg_replace(
            array(
                '/[\x60\x82\x91\x92\xb4\xb8]/', // single quotes
                '/[\x84\x93\x94]/',             // double quotes
                '/\x85/',                       // ellipsis ...
                // Control bytes. The range used to start "\x00-\x0d", which also
                // removed tab/LF/CR even though the final filter below keeps them.
                '/[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]/',
            ),
            array("'", '"', '...', ''),
            $str
        );

        if (is_string($str) && self::detectUTF8($str)) {
            // NOTE(review): as displayed, search and replacement are the same
            // bytes; the replacement was presumably an entity - confirm.
            $str = str_replace("\xE2\x82\xAC", '€', $str);

            // iconv() raises a notice and returns false on an illegal sequence.
            // That is expected for dirty input, so the notice is suppressed and
            // the unconverted string is kept instead of being wiped to "".
            $converted = @iconv('UTF-8', 'ISO-8859-1//TRANSLIT', $str);
            if ($converted !== false) {
                $str = $converted;
            }

            // Keep only tab, LF, CR and 0x20-0x7F.
            $str = preg_replace('/[^\x09\x0a\x0d\x20-\x7f]/', '', $str);
        }
        return $str;
    }

    /**
     * Run makeUTF8() and then replace the accented letters listed in
     * accentFoldTable() by their unaccented base letter.
     *
     * Arrays are processed recursively; inside arrays only string elements
     * are folded, other values are left as makeUTF8() returned them.
     *
     * @link http://www.w3schools.com/tags/ref_entities.asp
     * @param mixed $str
     * @return mixed
     */
    public static function makeUTF8plain($str) {
        $str = self::makeUTF8($str);
        if (is_array($str)) {
            return self::foldAccents($str);
        }
        return strtr($str, self::accentFoldTable());
    }

    /**
     * Report whether a string contains at least one well-formed multibyte
     * UTF-8 sequence. Pure ASCII and non-strings yield false; the rest of the
     * string is not validated (use isUTF8() for that).
     *
     * @param string $str
     * @return bool
     */
    public static function detectUTF8($str) {
        if (!is_string($str)) {
            return false;
        }
        $multibyte = '%(?:'
            . '[\xC2-\xDF][\x80-\xBF]'              // non-overlong 2-byte
            . '|\xE0[\xA0-\xBF][\x80-\xBF]'         // excluding overlongs
            . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'  // straight 3-byte
            . '|\xED[\x80-\x9F][\x80-\xBF]'         // excluding surrogates
            . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'      // planes 1-3
            . '|[\xF1-\xF3][\x80-\xBF]{3}'          // planes 4-15
            . '|\xF4[\x80-\x8F][\x80-\xBF]{2}'      // plane 16
            . ')+%xs';
        return preg_match($multibyte, $str) === 1;
    }

    /**
     * Report whether the whole string is valid UTF-8.
     *
     * @param string $str
     * @return bool false for invalid byte sequences and for non-strings
     */
    public static function isUTF8($str) {
        return is_string($str) && mb_check_encoding($str, 'UTF-8');
    }

    /**
     * Debugging aid: walk a string byte by byte and list every byte that is
     * not a letter, digit, whitespace or common punctuation, together with
     * its ordinal, so unknown characters can be identified.
     *
     * CR and LF are skipped. Runs of ordinary bytes are collected into one
     * entry; every other byte becomes an entry of the form "[x] => 233".
     * Entries equal to '', ' ' or "\n" are removed afterwards, so the keys of
     * the returned array may have gaps.
     *
     * @param string $str
     * @return array list of text runs and "[byte] => ordinal" entries
     */
    public static function dumpChr($str) {
        // str_split('') yields [''] before PHP 8.2 and [] from 8.2 on; answer
        // the empty case directly so the result does not depend on the version.
        if ((string) $str === '') {
            return array();
        }

        $nodes = array();
        $term = '';
        foreach (str_split($str) as $byte) {
            if ($byte === "\r" || $byte === "\n") {
                continue;
            }
            $ordinary = $byte === '\\'
                || $byte === '/'
                || preg_match('/[a-zA-Z0-9\s#&;<>.,:\'"\-_?!]/', $byte);
            if ($ordinary) {
                $term .= $byte;
                continue;
            }
            $nodes[] = $term;
            $nodes[] = "[$byte] => " . ord($byte);
            $term = '';
        }
        $nodes[] = $term;

        return array_diff($nodes, array('', ' ', "\n"));
    }

    /**
     * Recursively apply accentFoldTable() to every string inside an array.
     */
    private static function foldAccents($value) {
        if (is_array($value)) {
            foreach ($value as $key => $item) {
                $value[$key] = self::foldAccents($item);
            }
            return $value;
        }
        if (is_string($value)) {
            return strtr($value, self::accentFoldTable());
        }
        return $value;
    }

    /**
     * Accented letter => base letter pairs used by makeUTF8plain().
     */
    private static function accentFoldTable() {
        return array(
            'À' => 'A',
            'Á' => 'A',
            'Â' => 'A',
            'Ã' => 'A',
            'Ä' => 'A',
            'Å' => 'A',
            'Æ' => 'A',
            'à' => 'a',
            'á' => 'a',
            'â' => 'a',
            'ã' => 'a',
            'ä' => 'a',
            'å' => 'a',
            'æ' => 'a',
            'É' => 'E',
            'È' => 'E',
            'Ê' => 'E',
            'Ë' => 'E',
            'é' => 'e',
            'è' => 'e',
            'ê' => 'e',
            'ë' => 'e',
            // The lower-case i group used to be listed twice and the
            // upper-case group was missing.
            'Í' => 'I',
            'Ì' => 'I',
            'Î' => 'I',
            'Ï' => 'I',
            'í' => 'i',
            'ì' => 'i',
            'î' => 'i',
            'ï' => 'i',
            'Ò' => 'O',
            'Ó' => 'O',
            'Ô' => 'O',
            'Õ' => 'O',
            'Ö' => 'O',
            'Ø' => 'O',
            'ò' => 'o',
            'ó' => 'o',
            'ô' => 'o',
            'õ' => 'o',
            'ö' => 'o',
            'ø' => 'o',
            'Ù' => 'U',
            'Ú' => 'U',
            'Û' => 'U',
            'Ũ' => 'U',
            'Ü' => 'U',
            'ù' => 'u',
            'ú' => 'u',
            'û' => 'u',
            'ũ' => 'u',
            'ü' => 'u',
            'Ñ' => 'N',
            'ñ' => 'n',
        );
    }

    /**
     * Byte sequence => replacement pairs applied first by makeUTF8().
     *
     * strtr() tries the longest key first, so the multibyte keys take
     * precedence over the single-byte "legacy" keys that share a lead byte.
     * Invisible characters are written as byte escapes so they stay visible
     * in an editor.
     */
    private static function byteTranslations() {
        return array(
            // HTML entities
            chr(195).chr(129) => 'Á',
            chr(195).chr(161) => 'á',
            chr(195).chr(130) => 'Â',
            chr(195).chr(162) => 'â',
            chr(194).chr(180) => '´',
            chr(195).chr(134) => 'Æ',
            chr(195).chr(166) => 'æ',
            chr(195).chr(128) => 'À',
            chr(195).chr(160) => 'à',
            chr(226).chr(132).chr(181) => 'ℵ',
            chr(206).chr(145) => 'Α',
            chr(206).chr(177) => 'α',
            chr(226).chr(136).chr(167) => '∧',
            chr(226).chr(136).chr(160) => '∠',
            chr(195).chr(133) => 'Å',
            chr(195).chr(165) => 'å',
            chr(226).chr(137).chr(136) => '≈',
            chr(195).chr(131) => 'Ã',
            chr(195).chr(163) => 'ã',
            chr(195).chr(132) => 'Ä',
            chr(195).chr(164) => 'ä',
            chr(226).chr(128).chr(158) => '„',
            chr(206).chr(146) => 'Β',
            chr(206).chr(178) => 'β',
            chr(194).chr(166) => '¦',
            chr(226).chr(128).chr(162) => '•',
            chr(226).chr(136).chr(169) => '∩',
            chr(195).chr(135) => 'Ç',
            chr(195).chr(167) => 'ç',
            chr(194).chr(184) => '¸',
            chr(194).chr(162) => '¢',
            chr(206).chr(167) => 'Χ',
            chr(207).chr(135) => 'χ',
            chr(203).chr(134) => 'ˆ',
            chr(226).chr(153).chr(163) => '♣',
            chr(226).chr(137).chr(133) => '≅',
            chr(194).chr(169) => '©',
            chr(226).chr(134).chr(181) => '↵',
            chr(226).chr(136).chr(170) => '∪',
            chr(194).chr(164) => '¤',
            chr(226).chr(128).chr(160) => '†',
            chr(226).chr(128).chr(161) => '‡',
            chr(226).chr(134).chr(147) => '↓',
            chr(226).chr(135).chr(147) => '⇓',
            chr(194).chr(176) => '°',
            chr(206).chr(148) => 'Δ',
            chr(206).chr(180) => 'δ',
            chr(226).chr(153).chr(166) => '♦',
            chr(195).chr(183) => '÷',
            chr(195).chr(137) => 'É',
            chr(195).chr(169) => 'é',
            chr(195).chr(138) => 'Ê',
            chr(195).chr(170) => 'ê',
            chr(195).chr(136) => 'È',
            chr(195).chr(168) => 'è',
            chr(226).chr(136).chr(133) => '∅',
            chr(226).chr(128).chr(131) => "\xE2\x80\x83", // em space
            chr(226).chr(128).chr(130) => "\xE2\x80\x82", // en space
            chr(206).chr(149) => 'Ε',
            chr(206).chr(181) => 'ε',
            chr(226).chr(137).chr(161) => '≡',
            chr(206).chr(151) => 'Η',
            chr(206).chr(183) => 'η',
            chr(195).chr(144) => 'Ð',
            chr(195).chr(176) => 'ð',
            chr(195).chr(139) => 'Ë',
            chr(195).chr(171) => 'ë',
            chr(226).chr(130).chr(172) => '€',
            chr(226).chr(136).chr(131) => '∃',
            chr(198).chr(146) => 'ƒ',
            chr(226).chr(136).chr(128) => '∀',
            chr(194).chr(189) => '½',
            chr(194).chr(188) => '¼',
            chr(194).chr(190) => '¾',
            chr(226).chr(129).chr(132) => '⁄',
            chr(206).chr(147) => 'Γ',
            chr(206).chr(179) => 'γ',
            chr(226).chr(137).chr(165) => '≥',
            chr(226).chr(134).chr(148) => '↔',
            chr(226).chr(135).chr(148) => '⇔',
            chr(226).chr(153).chr(165) => '♥',
            chr(226).chr(128).chr(166) => '…',
            chr(195).chr(141) => 'Í',
            chr(195).chr(173) => 'í',
            chr(195).chr(142) => 'Î',
            chr(195).chr(174) => 'î',
            chr(194).chr(161) => '¡',
            chr(195).chr(140) => 'Ì',
            chr(195).chr(172) => 'ì',
            chr(226).chr(132).chr(145) => 'ℑ',
            chr(226).chr(136).chr(158) => '∞',
            chr(226).chr(136).chr(171) => '∫',
            chr(206).chr(153) => 'Ι',
            chr(206).chr(185) => 'ι',
            chr(194).chr(191) => '¿',
            chr(226).chr(136).chr(136) => '∈',
            chr(195).chr(143) => 'Ï',
            chr(195).chr(175) => 'ï',
            chr(206).chr(154) => 'Κ',
            chr(206).chr(186) => 'κ',
            chr(206).chr(155) => 'Λ',
            chr(206).chr(187) => 'λ',
            chr(226).chr(140).chr(169) => '⟨', // key is U+2329, value is U+27E8
            chr(194).chr(171) => '«',
            chr(226).chr(134).chr(144) => '←',
            chr(226).chr(135).chr(144) => '⇐',
            chr(226).chr(140).chr(136) => '⌈',
            chr(226).chr(128).chr(156) => '“',
            chr(226).chr(137).chr(164) => '≤',
            chr(226).chr(140).chr(138) => '⌊',
            chr(226).chr(136).chr(151) => '∗',
            chr(226).chr(151).chr(138) => '◊',
            chr(226).chr(128).chr(142) => "\xE2\x80\x8E", // left-to-right mark
            chr(226).chr(128).chr(185) => '‹',
            chr(226).chr(128).chr(152) => '‘',
            chr(194).chr(175) => '¯',
            chr(226).chr(128).chr(148) => '—',
            chr(194).chr(181) => 'µ',
            chr(194).chr(183) => '·',
            chr(226).chr(136).chr(146) => '−',
            chr(206).chr(156) => 'Μ',
            chr(206).chr(188) => 'μ',
            chr(226).chr(136).chr(135) => '∇',
            chr(194).chr(160) => "\xC2\xA0", // no-break space
            chr(226).chr(128).chr(147) => '–',
            chr(226).chr(137).chr(160) => '≠',
            chr(226).chr(136).chr(139) => '∋',
            chr(194).chr(172) => '¬',
            chr(226).chr(136).chr(137) => '∉',
            chr(226).chr(138).chr(132) => '⊄',
            chr(195).chr(145) => 'Ñ',
            chr(195).chr(177) => 'ñ',
            chr(206).chr(157) => 'Ν',
            chr(206).chr(189) => 'ν',
            chr(195).chr(147) => 'Ó',
            chr(195).chr(179) => 'ó',
            chr(195).chr(148) => 'Ô',
            chr(195).chr(180) => 'ô',
            chr(197).chr(146) => 'Œ',
            chr(197).chr(147) => 'œ',
            chr(195).chr(146) => 'Ò',
            chr(195).chr(178) => 'ò',
            chr(226).chr(128).chr(190) => '‾',
            chr(206).chr(169) => 'Ω',
            chr(207).chr(137) => 'ω',
            chr(206).chr(159) => 'Ο',
            chr(206).chr(191) => 'ο',
            chr(226).chr(138).chr(149) => '⊕',
            chr(226).chr(136).chr(168) => '∨',
            chr(194).chr(170) => 'ª',
            chr(194).chr(186) => 'º',
            chr(195).chr(152) => 'Ø',
            chr(195).chr(184) => 'ø',
            chr(195).chr(149) => 'Õ',
            chr(195).chr(181) => 'õ',
            chr(226).chr(138).chr(151) => '⊗',
            chr(195).chr(150) => 'Ö',
            chr(195).chr(182) => 'ö',
            chr(194).chr(182) => '¶',
            chr(226).chr(136).chr(130) => '∂',
            chr(226).chr(128).chr(176) => '‰',
            chr(226).chr(138).chr(165) => '⊥',
            chr(206).chr(166) => 'Φ',
            chr(207).chr(134) => 'φ',
            chr(206).chr(160) => 'Π',
            chr(207).chr(128) => 'π',
            chr(207).chr(150) => 'ϖ',
            chr(194).chr(177) => '±',
            chr(194).chr(163) => '£',
            chr(226).chr(128).chr(178) => '′',
            chr(226).chr(128).chr(179) => '″',
            chr(226).chr(136).chr(143) => '∏',
            chr(226).chr(136).chr(157) => '∝',
            chr(206).chr(168) => 'Ψ',
            chr(207).chr(136) => 'ψ',
            chr(226).chr(136).chr(154) => '√',
            chr(226).chr(140).chr(170) => '⟩', // key is U+232A, value is U+27E9
            chr(194).chr(187) => '»',
            chr(226).chr(134).chr(146) => '→',
            chr(226).chr(135).chr(146) => '⇒',
            chr(226).chr(140).chr(137) => '⌉',
            chr(226).chr(128).chr(157) => '”',
            chr(226).chr(132).chr(156) => 'ℜ',
            chr(194).chr(174) => '®',
            chr(226).chr(140).chr(139) => '⌋',
            chr(206).chr(161) => 'Ρ',
            chr(207).chr(129) => 'ρ',
            chr(226).chr(128).chr(143) => "\xE2\x80\x8F", // right-to-left mark
            chr(226).chr(128).chr(186) => '›',
            chr(226).chr(128).chr(153) => '’',
            chr(226).chr(128).chr(154) => '‚',
            chr(197).chr(160) => 'Š',
            chr(197).chr(161) => 'š',
            chr(226).chr(139).chr(133) => '⋅',
            chr(194).chr(167) => '§',
            chr(194).chr(173) => "\xC2\xAD", // soft hyphen
            chr(206).chr(163) => 'Σ',
            chr(207).chr(131) => 'σ',
            chr(207).chr(130) => 'ς',
            chr(226).chr(136).chr(188) => '∼',
            chr(226).chr(153).chr(160) => '♠',
            chr(226).chr(138).chr(130) => '⊂',
            chr(226).chr(138).chr(134) => '⊆',
            chr(226).chr(136).chr(145) => '∑',
            chr(194).chr(185) => '¹',
            chr(194).chr(178) => '²',
            chr(194).chr(179) => '³',
            chr(226).chr(138).chr(131) => '⊃',
            chr(226).chr(138).chr(135) => '⊇',
            chr(195).chr(159) => 'ß',
            chr(206).chr(164) => 'Τ',
            chr(207).chr(132) => 'τ',
            chr(226).chr(136).chr(180) => '∴',
            chr(206).chr(152) => 'Θ',
            chr(206).chr(184) => 'θ',
            chr(207).chr(145) => 'ϑ',
            chr(226).chr(128).chr(137) => "\xE2\x80\x89", // thin space
            chr(195).chr(158) => 'Þ',
            chr(195).chr(190) => 'þ',
            chr(203).chr(156) => '˜',
            chr(195).chr(151) => '×',
            chr(226).chr(132).chr(162) => '™',
            chr(195).chr(154) => 'Ú',
            chr(195).chr(186) => 'ú',
            chr(226).chr(134).chr(145) => '↑',
            chr(226).chr(135).chr(145) => '⇑',
            chr(195).chr(155) => 'Û',
            chr(195).chr(187) => 'û',
            chr(195).chr(153) => 'Ù',
            chr(195).chr(185) => 'ù',
            chr(194).chr(168) => '¨',
            chr(207).chr(146) => 'ϒ',
            chr(206).chr(165) => 'Υ',
            chr(207).chr(133) => 'υ',
            chr(195).chr(156) => 'Ü',
            chr(195).chr(188) => 'ü',
            chr(226).chr(132).chr(152) => '℘',
            chr(206).chr(158) => 'Ξ',
            chr(206).chr(190) => 'ξ',
            chr(195).chr(157) => 'Ý',
            chr(195).chr(189) => 'ý',
            chr(194).chr(165) => '¥',
            chr(195).chr(191) => 'ÿ',
            chr(197).chr(184) => 'Ÿ',
            chr(206).chr(150) => 'Ζ',
            chr(206).chr(182) => 'ζ',
            chr(226).chr(128).chr(141) => "\xE2\x80\x8D", // zero-width joiner
            chr(226).chr(128).chr(140) => "\xE2\x80\x8C", // zero-width non-joiner

            // standard translations (legacy)
            // NOTE(review): the byte values look like Windows-1250 - confirm.
            chr(225) => 'á', chr(228) => 'ä', chr(232) => 'č', chr(239) => 'ď',
            chr(233) => 'é',
            // NOTE(review): chr(204) below maps to 'Ě', yet chr(236) maps to
            // 'ê' rather than 'ě' - confirm which is intended.
            chr(236) => 'ê',
            // 'ľ' used to be keyed chr(229) as well, which silently replaced
            // the 'ĺ' entry; 190 is its Windows-1250 code, the lower-case
            // partner of chr(188) => 'Ľ'.
            chr(237) => 'í', chr(229) => 'ĺ', chr(190) => 'ľ',
            chr(242) => 'ň', chr(244) => 'ô', chr(243) => 'ó', chr(154) => 'š', chr(248) => 'ř',
            chr(250) => 'ú', chr(249) => 'ů', chr(157) => 'ť', chr(253) => 'ý', chr(158) => 'ž',
            chr(193) => 'Á', chr(196) => 'Ä', chr(200) => 'Č', chr(207) => 'Ď', chr(201) => 'É',
            chr(204) => 'Ě', chr(205) => 'Í', chr(197) => 'Ĺ', chr(188) => 'Ľ', chr(210) => 'Ň',
            chr(212) => 'Ô', chr(211) => 'Ó', chr(138) => 'Š', chr(216) => 'Ř', chr(218) => 'Ú',
            chr(217) => 'Ů', chr(141) => 'Ť', chr(221) => 'Ý', chr(142) => 'Ž',

            // phonetic alphabet
            chr(240) => 'ð',
            chr(230) => 'æ',

            // other funky translations
            chr(160) => ' ', // NOTE(review): shown as a blank; confirm plain space vs. NBSP
            chr(150) => '-',
        );
    }
}
?>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment