Created
September 20, 2015 07:56
-
-
Save ivuorinen/bce25b37d3d3aadd634e to your computer and use it in GitHub Desktop.
psr2-fixed, docblocked and 80-char width safe version
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * makeSafeEntities()
 *
 * Convert str to UTF-8 (if not already), then convert that to HTML named
 * entities and numbered references.
 * Compare to native htmlentities() function. Unlike that function,
 * this will skip any already existing entities in the string.
 *
 * - mb_convert_encoding() doesn't encode ampersands, so use
 *   makeAmpersandEntities to convert those.
 * - mb_convert_encoding() won't usually convert to illegal
 *   numbered entities (128-159) unless there's a charset discrepancy,
 *   but just in case, correct them with correctIllegalEntities.
 *
 * Arrays are converted element by element (recursively) and their keys
 * are preserved. Empty input (null, '', false, empty array) yields ''.
 *
 * @author Cameron Clark
 * @see    http://www.prolifique.com/entities.php.txt
 *
 * @param string|array $str         String, or array of strings, to convert
 * @param bool         $convertTags True to also convert quotes and < >
 * @param string       $encoding    Source encoding, empty to auto-detect
 *
 * @return string|array
 */
function makeSafeEntities($str, $convertTags = 0, $encoding = "")
{
    // empty() alone would also swallow numeric zero ("0", 0), so those
    // are let through to the conversion steps below; the helper
    // functions apply their own emptiness checks.
    if (empty($str) && !is_numeric($str)) {
        return '';
    }

    if (is_array($str)) {
        $arrOutput = array();
        foreach ($str as $key => $value) {
            // Forward BOTH options; $encoding must stay the third argument
            $arrOutput[$key] = makeSafeEntities(
                $value,
                $convertTags,
                $encoding
            );
        }
        return $arrOutput;
    }

    $str = makeUTF8($str, $encoding);

    // NOTE: the "HTML-ENTITIES" pseudo-encoding is deprecated as of
    // PHP 8.2; it is kept because it leaves existing entities untouched.
    $str = mb_convert_encoding(
        $str,
        "HTML-ENTITIES",
        "UTF-8"
    );

    $str = makeAmpersandEntities($str);

    if ($convertTags) {
        $str = makeTagEntities($str);
    }

    return correctIllegalEntities($str);
}
/**
 * makeAllEntities()
 *
 * Convert str to UTF-8 (if not already), then convert to HTML numbered
 * decimal entities. If selected, it first converts any illegal chars to
 * safe named (and numbered) entities as in makeSafeEntities().
 * Unlike mb_convert_encoding(), mb_encode_numericentity() will NOT skip
 * any already existing entities in the string, so use a regex to skip them.
 *
 * Every non-whitespace character that is not part of an existing entity
 * is encoded; whitespace is left as is. Arrays are converted element by
 * element and returned as a re-indexed list.
 *
 * @author Cameron Clark
 * @see    http://www.prolifique.com/entities.php.txt
 *
 * @param string|array $str              String (or array) to convert
 * @param bool         $useNamedEntities True to use named entities
 * @param string       $encoding         Source encoding, empty to detect
 *
 * @return string|array
 */
function makeAllEntities($str, $useNamedEntities = 0, $encoding = "")
{
    // empty() alone would also swallow numeric zero ("0", 0), so those
    // are let through; the helper functions apply their own checks.
    if (empty($str) && !is_numeric($str)) {
        return '';
    }

    if (is_array($str)) {
        $arrOutput = array();
        foreach ($str as $row) {
            // Forward BOTH options; $encoding must stay the third argument
            $arrOutput[] = makeAllEntities(
                $row,
                $useNamedEntities,
                $encoding
            );
        }
        return $arrOutput;
    }

    $str = makeUTF8($str, $encoding);

    if ($useNamedEntities) {
        // NOTE: "HTML-ENTITIES" is deprecated as of PHP 8.2
        $str = mb_convert_encoding(
            $str,
            "HTML-ENTITIES",
            "UTF-8"
        );
    }

    $str = makeTagEntities($str, $useNamedEntities);

    // Set the regex encoding BEFORE any multibyte regex call is made
    mb_regex_encoding("UTF-8");

    // A callback replaces the former "e" (eval) option, which PHP 8 no
    // longer supports and which evaluated input as code. Backslashes need
    // no pre-escaping any more: they are encoded (&#92;) like any other
    // character by the callback.
    $str = mb_ereg_replace_callback(
        "(?>(&(?:[a-z]{0,4}\w{2,3};|#\d{2,5};)))|(\S+?)",
        function ($match) {
            // Group 1: an already existing entity, keep it untouched
            if (isset($match[1])
                && $match[1] !== false
                && $match[1] !== ''
            ) {
                return $match[1];
            }
            // Group 2: a plain character. The map covers all of Unicode
            // and the mask keeps every bit of the code point, so
            // characters above U+FFFF are not truncated.
            return mb_encode_numericentity(
                (string) $match[2],
                array(0x0, 0x10FFFF, 0, 0x1FFFFF),
                'UTF-8'
            );
        },
        $str,
        "im"
    );

    return correctIllegalEntities($str);
}
/**
 * makeTagEntities()
 *
 * Convert the characters ' " < > to named or numbered entities.
 * The single quote is always written as the numbered entity &#39;.
 *
 * @author Cameron Clark
 * @see    http://www.prolifique.com/entities.php.txt
 *
 * @param string  $str              String to convert
 * @param boolean $useNamedEntities Use "&quot;" (true) or "&#34;" (false)
 * @return string
 */
function makeTagEntities($str = '', $useNamedEntities = 1)
{
    // Note that we should use &apos; for the single quote,
    // but IE doesn't like it
    $arrReplace = $useNamedEntities
        ? array('&#39;', '&quot;', '&lt;', '&gt;')
        : array('&#39;', '&#34;', '&#60;', '&#62;');

    // Search and replacement arrays are matched up by position
    return str_replace(
        array("'", '"', '<', '>'),
        $arrReplace,
        $str
    );
}
/**
 * makeAmpersandEntities()
 *
 * Convert ampersands to named or numbered entities.
 * Use regex to skip any that might be part of existing entities.
 *
 * An ampersand counts as part of an existing entity when it is followed
 * by 2-7 word characters and a semicolon (e.g. &amp;), or by "#", 2-5
 * decimal digits and a semicolon (e.g. &#169;).
 *
 * @author Cameron Clark
 * @see    http://www.prolifique.com/entities.php.txt
 *
 * @param string  $str              String to convert
 * @param boolean $useNamedEntities Use "&amp;" (true) or "&#38;" (false)
 * @return string
 */
function makeAmpersandEntities($str = '', $useNamedEntities = 1)
{
    $replacement = $useNamedEntities ? "&amp;" : "&#38;";

    // The negative lookahead leaves "&" alone when an entity follows
    return preg_replace(
        "/&(?![A-Za-z]{0,4}\w{2,3};|#[0-9]{2,5};)/m",
        $replacement,
        $str
    );
}
/**
 * correctIllegalEntities()
 *
 * Convert illegal HTML numbered entities in the
 * range 128 - 159 to legal counterparts.
 *
 * The replacements are numbered entities as well, so the result does not
 * depend on the character encoding this file is saved in. Numbers in the
 * range that have no entry in the table (129, 141, 143, 144, 157) are
 * left unchanged.
 *
 * @author Cameron Clark
 * @see    http://www.prolifique.com/entities.php.txt
 *
 * @param string $str String to correct
 * @return string
 */
function correctIllegalEntities($str = '')
{
    // empty() alone would also discard the valid string "0"
    if (empty($str) && !is_numeric($str)) {
        return '';
    }

    $chars = array(
        '&#128;' => '&#8364;',
        '&#130;' => '&#8218;',
        '&#131;' => '&#402;',
        '&#132;' => '&#8222;',
        '&#133;' => '&#8230;',
        '&#134;' => '&#8224;',
        '&#135;' => '&#8225;',
        '&#136;' => '&#710;',
        '&#137;' => '&#8240;',
        '&#138;' => '&#352;',
        '&#139;' => '&#8249;',
        '&#140;' => '&#338;',
        '&#142;' => '&#381;',
        '&#145;' => '&#8216;',
        '&#146;' => '&#8217;',
        '&#147;' => '&#8220;',
        '&#148;' => '&#8221;',
        '&#149;' => '&#8226;',
        '&#150;' => '&#8211;',
        '&#151;' => '&#8212;',
        '&#152;' => '&#732;',
        '&#153;' => '&#8482;',
        '&#154;' => '&#353;',
        '&#155;' => '&#8250;',
        '&#156;' => '&#339;',
        '&#158;' => '&#382;',
        '&#159;' => '&#376;'
    );

    // One pass over the string instead of one str_replace() per entry
    return strtr((string) $str, $chars);
}
/**
 * makeUTF8()
 *
 * Compare to native utf8_encode function, which will re-encode text that
 * is already UTF-8. Some people have reported problems with this.
 * You might consider rearranging the order here to try mb_detect_encoding
 * first, then fall back to using isUTF8 if that doesn't work.
 *
 * The source encoding is resolved in this order: the $encoding argument,
 * the isUTF8() check, mb_detect_encoding() (UTF-8, then ISO-8859-1), and
 * finally ISO-8859-1 as the default.
 *
 * @author Cameron Clark
 * @see    http://www.prolifique.com/entities.php.txt
 *
 * @param string $str      String to convert
 * @param string $encoding Encoding $str is currently in (the encoding to
 *                         convert FROM); empty to auto-detect
 *
 * @return string
 */
function makeUTF8($str = '', $encoding = "")
{
    // empty() alone would also discard the valid string "0"
    if (empty($str) && !is_numeric($str)) {
        return '';
    }

    if (empty($encoding) && isUTF8($str)) {
        $encoding = "UTF-8";
    }

    if (empty($encoding)) {
        // Strict mode: only report an encoding the string is valid in,
        // so a byte string that failed the UTF-8 check above is not
        // reported as UTF-8 anyway
        $encoding = mb_detect_encoding(
            $str,
            'UTF-8, ISO-8859-1',
            true
        );
    }

    if (empty($encoding)) {
        // If charset can't be detected, default to ISO-8859-1
        $encoding = "ISO-8859-1";
    }

    if ($encoding === "UTF-8") {
        // Already UTF-8, nothing to convert
        return $str;
    }

    // Error suppression is kept from the original implementation
    return @mb_convert_encoding(
        $str,
        "UTF-8",
        $encoding
    );
}
/**
 * isUTF8()
 *
 * Much simpler UTF-8-ness checker using a regular expression created
 * by the W3C: Returns true if $str is valid UTF-8 and false otherwise.
 * From http://w3.org/International/questions/qa-forms-utf-8.html
 *
 * Of the ASCII control characters only tab, line feed and carriage
 * return are accepted. An empty string is reported as valid. false is
 * also returned when the regular expression engine reports an error.
 *
 * @author Cameron Clark
 * @see    http://www.prolifique.com/entities.php.txt
 *
 * @param string $str String to test
 * @return boolean
 */
function isUTF8($str = '')
{
    // preg_match() yields 1, 0 or false; compare against 1 so that the
    // documented boolean is what callers actually receive
    return preg_match(
        '%^(?:
            [\x09\x0A\x0D\x20-\x7E]            # ASCII
          | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
          | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
          | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
          | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
          | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
          | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
          | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
        )*$%xs',
        $str
    ) === 1;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment