Created
August 25, 2009 10:05
-
-
Save reservationlive/174613 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* ------------------------------------------------------------------------- */
// Convert str to UTF-8 (if not already), then convert that to HTML named entities
// and numbered references. Compare to native htmlentities() function.
// Unlike that function, this will skip any already existing entities in the string.
// mb_convert_encoding() doesn't encode ampersands, so use makeAmpersandEntities to convert those.
// mb_convert_encoding() won't usually convert to illegal numbered entities (128-159) unless
// there's a charset discrepancy, but just in case, correct them with correctIllegalEntities.
//
// $str         - a string, or an array of strings (converted recursively, keys preserved).
// $convertTags - when truthy, ' " < > are also converted via makeTagEntities().
// $encoding    - source charset of $str; "" lets makeUTF8() work it out.
// Returns the converted string (or array). An empty string is returned unchanged.
function makeSafeEntities($str, $convertTags = 0, $encoding = "") {
    if (is_array($str)) {
        $arrOutput = array();
        foreach ($str as $key => $value) {
            // Forward BOTH options. Passing only $encoding here would drop it into
            // the $convertTags slot and lose the caller's settings for every element.
            $arrOutput[$key] = makeSafeEntities($value, $convertTags, $encoding);
        }
        return $arrOutput;
    }

    // Nothing to convert; return the empty string rather than falling off the end (null).
    if ($str === "") {
        return $str;
    }

    $str = makeUTF8($str, $encoding);
    // NOTE(review): the "HTML-ENTITIES" pseudo-encoding is deprecated as of PHP 8.2.
    // It is kept because it leaves & < > " ' alone, which the steps below rely on.
    $str = mb_convert_encoding($str, "HTML-ENTITIES", "UTF-8");
    $str = makeAmpersandEntities($str);
    if ($convertTags) {
        $str = makeTagEntities($str);
    }
    return correctIllegalEntities($str);
}
// Convert str to UTF-8 (if not already), then convert to HTML numbered decimal entities.
// If selected, it first converts any illegal chars to safe named (and numbered) entities
// as in makeSafeEntities(). Unlike mb_convert_encoding(), mb_encode_numericentity() will
// NOT skip any already existing entities in the string, so use a regex to skip them.
//
// $str              - a string, or an array of strings (converted recursively, keys preserved).
// $useNamedEntities - when truthy, characters that have a named entity get the named form
//                     first; everything else becomes a decimal reference.
// $encoding         - source charset of $str; "" lets makeUTF8() work it out.
// Returns the converted string (or array). An empty string / empty array is returned unchanged.
// Whitespace is never encoded.
function makeAllEntities($str, $useNamedEntities = 0, $encoding = "") {
    if (is_array($str)) {
        $arrOutput = array();
        foreach ($str as $key => $value) {
            // Forward BOTH options; passing only $encoding would land it in the
            // $useNamedEntities slot.
            $arrOutput[$key] = makeAllEntities($value, $useNamedEntities, $encoding);
        }
        return $arrOutput;
    }

    if ($str === "") {
        return $str;
    }

    $str = makeUTF8($str, $encoding);
    if ($useNamedEntities) {
        // NOTE(review): "HTML-ENTITIES" is deprecated as of PHP 8.2.
        $str = mb_convert_encoding($str, "HTML-ENTITIES", "UTF-8");
    }
    $str = makeTagEntities($str, $useNamedEntities);

    // Cover every Unicode code point. A 0xFFFF mask would truncate characters
    // above U+FFFF (e.g. emoji) to the wrong reference.
    $convmap = array(0x0, 0x10FFFF, 0, 0x1FFFFF);

    // Group 1 = an entity that is already present (kept verbatim); otherwise a single
    // non-whitespace character (encoded). A callback is used instead of the eval ("e")
    // regex option, so the matched text is passed as data and backslashes or quotes in
    // it need no pre-escaping.
    $encoded = preg_replace_callback(
        '/(&(?:[a-z]{0,5}\w{2,3};|#[0-9]{1,7};|#x[0-9a-f]{1,6};))|\S/iu',
        function ($m) use ($convmap) {
            if (isset($m[1]) && $m[1] !== '') {
                return $m[1];
            }
            return mb_encode_numericentity($m[0], $convmap, 'UTF-8');
        },
        $str
    );
    // preg_replace_callback() yields null on failure (e.g. malformed UTF-8 with /u);
    // in that case keep the partially converted string instead of returning null.
    if ($encoded !== null) {
        $str = $encoded;
    }

    return correctIllegalEntities($str);
}
// Convert the HTML-significant characters ' " < > to named or numbered entities.
//
// $str              - text to convert.
// $useNamedEntities - truthy: &quot; &lt; &gt; (default); falsy: &#34; &#60; &#62;.
// Returns $str with every occurrence of the four characters replaced.
function makeTagEntities($str, $useNamedEntities = 1) {
    // Note that we should use &apos; for the single quote, but IE doesn't like it,
    // so the single quote is always emitted as the numbered reference &#39;.
    $arrReplace = $useNamedEntities
        ? array('&#39;', '&quot;', '&lt;', '&gt;')
        : array('&#39;', '&#34;', '&#60;', '&#62;');
    return str_replace(array("'", '"', '<', '>'), $arrReplace, $str);
}
// Convert ampersands to named or numbered entities.
// Use regex to skip any that might be part of existing entities.
//
// $str              - text to convert.
// $useNamedEntities - truthy: emit &amp; (default); falsy: emit &#38;.
// Returns $str with every bare ampersand replaced.
//
// An ampersand is left alone when it is followed by something shaped like an entity:
//   - a name of 2 to 8 word characters and a semicolon (8 covers &thetasym;),
//   - a decimal reference of 1 to 7 digits (&#9; ... &#1114111;),
//   - a hexadecimal reference (&#x20AC;).
function makeAmpersandEntities($str, $useNamedEntities = 1) {
    $replacement = $useNamedEntities ? "&amp;" : "&#38;";
    return preg_replace(
        "/&(?![A-Za-z]{0,5}\\w{2,3};|#[0-9]{1,7};|#[xX][0-9A-Fa-f]{1,6};)/",
        $replacement,
        $str
    );
}
// Convert illegal HTML numbered entities in the range 128 - 159 to legal counterparts.
// References such as &#147; carry Windows-1252 byte values rather than real code points;
// each one is swapped for the entity of the character it was meant to be.
// Numbers with no entry in the table (129, 141, 143, 144, 157) are left untouched.
//
// $str - text that may contain &#128; ... &#159; references.
// Returns $str with those references replaced.
function correctIllegalEntities($str) {
    $chars = array(
        128 => '&euro;',
        130 => '&sbquo;',
        131 => '&fnof;',
        132 => '&bdquo;',
        133 => '&hellip;',
        134 => '&dagger;',
        135 => '&Dagger;',
        136 => '&circ;',
        137 => '&permil;',
        138 => '&Scaron;',
        139 => '&lsaquo;',
        140 => '&OElig;',
        142 => '&#381;',   // Z with caron has no HTML 4 named entity
        145 => '&lsquo;',
        146 => '&rsquo;',
        147 => '&ldquo;',
        148 => '&rdquo;',
        149 => '&bull;',
        150 => '&ndash;',
        151 => '&mdash;',
        152 => '&tilde;',
        153 => '&trade;',
        154 => '&scaron;',
        155 => '&rsaquo;',
        156 => '&oelig;',
        158 => '&#382;',   // z with caron has no HTML 4 named entity
        159 => '&Yuml;');

    $search = array();
    foreach (array_keys($chars) as $num) {
        $search[] = "&#" . $num . ";";
    }
    // $search and the values of $chars are in the same order, so they pair up one-to-one.
    return str_replace($search, array_values($chars), $str);
}
// Return $str as UTF-8.
// Compare to native utf8_encode function, which will re-encode text that is already UTF-8.
//
// $str      - text to convert.
// $encoding - source charset of $str. When empty, the charset is worked out:
//             isUTF8() first, then mb_detect_encoding() over UTF-8 / ISO-8859-1,
//             and finally ISO-8859-1 as the default.
// Returns $str unchanged when it is empty or already UTF-8, otherwise the converted text.
function makeUTF8($str, $encoding = "") {
    // Nothing to convert; return the empty string rather than falling off the end (null).
    if ($str === "") {
        return $str;
    }

    if (empty($encoding)) {
        if (isUTF8($str)) {
            return $str;
        }
        // Strict mode: only report UTF-8 when the bytes really are valid UTF-8.
        $encoding = mb_detect_encoding($str, 'UTF-8, ISO-8859-1', true);
        if (empty($encoding)) {
            $encoding = "ISO-8859-1"; // if charset can't be detected, default to ISO-8859-1
        }
    }

    if ($encoding === "UTF-8") {
        return $str;
    }
    // @ keeps this best-effort: a bad charset name from the caller should not emit warnings here.
    return @mb_convert_encoding($str, "UTF-8", $encoding);
}
// Much simpler UTF-8-ness checker using a regular expression created by the W3C:
// Returns true if $str is valid UTF-8 and false otherwise.
// From http://w3.org/International/questions/qa-forms-utf-8.html
//
// The pattern is matched byte-wise (no /u flag). Besides malformed sequences it also
// rejects ASCII control characters other than tab, LF and CR, and the DEL byte (0x7F).
// An empty string counts as valid. If the regex engine itself fails, false is returned.
function isUTF8($str) {
    $pattern = '%^(?:
          [\x09\x0A\x0D\x20-\x7E]            # ASCII
        | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
        | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
        | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
        | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
        | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
        | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
        | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
    )*$%xs';

    // preg_match() gives 1, 0 or false; collapse that to the documented boolean.
    return preg_match($pattern, $str) === 1;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment