Last active
March 1, 2022 06:16
-
-
Save zethzeth/32f927009af47ce2b69bda7cf7e4a147 to your computer and use it in GitHub Desktop.
Decontaminate text in PHP, preparing for json_decode
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Text cleaner: normalises a string (typically a scraped or copy-pasted JSON
 * payload) so that it is more likely to survive json_decode().
 *
 * The steps run in this fixed order, each guarded by its own flag:
 *   1. <script>/<style> elements are always removed together with their contents,
 *      and a stray CDATA terminator "]]>" is escaped to "]]&gt;".
 *   2. Remaining tags are stripped.
 *   3. Runs of CR / LF / tab / space collapse to one space; the result is trimmed.
 *   4. A leading UTF-8 byte order mark is removed.
 *   5. Invalid UTF-8 byte sequences are replaced by the mbstring substitute
 *      character ("?" unless reconfigured).
 *   6. "&quot;" is turned back into a literal double quote.
 *   7. All remaining HTML entities are decoded.
 *
 * Note that entity decoding (step 7) happens after tag stripping, so an encoded
 * tag such as "&lt;b&gt;" ends up as literal "<b>" in the output.
 *
 * Primary sources:
 * - https://stackoverflow.com/questions/17219916/json-decode-returns-json-error-syntax-but-online-formatter-says-the-json-is-ok
 * - https://stackoverflow.com/questions/2348152/detect-bad-json-data-in-php-json-decode
 *
 * @param mixed $text                                 Text to clean. Non-string values and '' are returned unchanged.
 * @param bool  $remove_tags                          Strip every HTML/PHP tag.
 * @param bool  $remove_line_breaks                   Collapse whitespace runs to single spaces and trim both ends.
 * @param bool  $remove_BOM                           Remove a leading UTF-8 BOM (bytes EF BB BF).
 * @param bool  $ensure_utf8_encoding                 Replace byte sequences that are not valid UTF-8.
 * @param bool  $ensure_quotes_are_properly_displayed Convert "&quot;" into a literal double quote.
 * @param bool  $decode_html_entities                 Decode HTML entities (including single quotes) as UTF-8.
 *
 * @return mixed The cleaned string, or the untouched input when it is not a non-empty string.
 */
function decontaminate_text(
    $text,
    $remove_tags = true,
    $remove_line_breaks = true,
    $remove_BOM = true,
    $ensure_utf8_encoding = true,
    $ensure_quotes_are_properly_displayed = true,
    $decode_html_entities = true
) {
    // Nothing to clean: hand back non-strings and empty strings as they came in.
    if ( ! is_string( $text ) || '' === $text ) {
        return $text;
    }

    // strip_tags() alone would keep the code inside <script>/<style> as plain
    // text, so those elements are removed wholesale first.
    // preg_replace() yields null on a PCRE failure; keep the previous text then.
    $without_blocks = preg_replace( '@<(script|style)[^>]*?>.*?</\\1>@si', '', $text );
    if ( null !== $without_blocks ) {
        $text = $without_blocks;
    }

    // Escape a stray CDATA terminator.
    $text = str_replace( ']]>', ']]&gt;', $text );

    if ( $remove_tags ) {
        // No allow-list: every tag goes. Pass allowed tags as a second
        // argument to strip_tags() if some markup should survive.
        $text = strip_tags( $text );
    }

    if ( $remove_line_breaks ) {
        $collapsed = preg_replace( '/[\r\n\t ]+/', ' ', $text );
        if ( null !== $collapsed ) {
            $text = $collapsed;
        }
        $text = trim( $text );
    }

    // Source: https://stackoverflow.com/a/31594983/1766219
    // Compare only the first three bytes instead of hex-encoding the whole string.
    if ( $remove_BOM && 0 === strncmp( $text, "\xEF\xBB\xBF", 3 ) ) {
        $text = (string) substr( $text, 3 );

        // trim() above could not reach whitespace hidden behind the BOM.
        if ( $remove_line_breaks ) {
            $text = ltrim( $text );
        }
    }

    // mb_check_encoding() replaces the utf8_encode( utf8_decode() ) round trip,
    // which is deprecated as of PHP 8.2. Converting UTF-8 to UTF-8 swaps every
    // invalid byte sequence for the mbstring substitute character.
    if ( $ensure_utf8_encoding && ! mb_check_encoding( $text, 'UTF-8' ) ) {
        $text = mb_convert_encoding( $text, 'UTF-8', 'UTF-8' );
    }

    if ( $ensure_quotes_are_properly_displayed ) {
        $text = str_replace( '&quot;', '"', $text );
    }

    if ( $decode_html_entities ) {
        // Explicit flags/charset so the result does not depend on the PHP
        // version's defaults (they changed in PHP 8.1).
        $text = html_entity_decode( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401, 'UTF-8' );
    }

    /**
     * Other things to try
     * - the chr-function: https://stackoverflow.com/a/20845642/1766219
     * - stripslashes (THIS ONE BROKE MY JSON DECODING, AFTER IT STARTED WORKING, THOUGH): https://stackoverflow.com/a/28540745/1766219
     * - This (improved?) JSON-decoder didn't help me, but it sure looks fancy: https://stackoverflow.com/a/43694325/1766219
     */

    return $text;
}
// Example use: run a sample paragraph through the cleaner with every option at its default.
$some_text = decontaminate_text(
    'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed in turpis dui. Maecenas venenatis venenatis facilisis. Quisque dictum, diam consequat mollis hendraerit, orci tellus aliquet nisl, ut molestie leo augue at est. In vitae vehicula lectus. Curabitur ac varius ligula. Pellentesque orci urdna.'
);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment