Last active
February 7, 2024 11:47
-
-
Save ed-fruty/0a96031a58b6a6596b03d5706a127f56 to your computer and use it in GitHub Desktop.
PHP DESERIALIZE (UNSERIALIZE) STRING WITH NON UTF8 CHARACTERS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * If you have serialized data that contains non-UTF-8 characters, you will probably run into problems when deserializing (unserializing) it.
 * So, what do we need to do to unserialize our serialized data properly?
 * 1. Replace all non-UTF-8 characters with '?'
 * 2. Update the lengths stored in the serialized data
 * 3. Deserialize
 *
 * Let's go!
 */
class Unserializer
{
    /**
     * Pattern matching every byte that should be replaced before the repair.
     * The default matches bytes 0-31 and 128-255.
     *
     * @var string
     * @link https://magp.ie/2014/08/13/php-unserialize-string-after-non-utf8-characters-stripped-out/
     * @link https://magp.ie/2011/01/06/remove-non-utf8-characters-from-string-with-php/
     *
     * To reject overly long 2 byte sequences, as well as characters above U+10000, and replace them with ?
     * you can use the regex below:
     * '/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]'.
     * '|[\x00-\x7F][\x80-\xBF]+'.
     * '|([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*'.
     * '|[\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})'.
     * '|[\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))|(?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/S'
     *
     * To reject overly long 3 byte sequences and UTF-16 surrogates and replace them with ?
     * you can use the regex below:
     * '/\xE0[\x80-\x9F][\x80-\xBF]'.
     * '|\xED[\xA0-\xBF][\x80-\xBF]/S'
     *-----------------------------------------------------------------
     *
     * On Stack Overflow (http://stackoverflow.com/questions/1176904) people advise using other regex variants, like:
     * '/[[:cntrl:]]+/'
     * or
     * '/[^[:print:]]/'
     *
     * You may use whichever one you want.
     */
    protected $nonCharsRegex = '/[\x00-\x1F\x80-\xFF]/';

    /**
     * Replacement for every byte matched by $nonCharsRegex.
     * It can be an empty string or any other string.
     *
     * @var string
     */
    protected $replaceTo = '?';

    /**
     * Unserialize a serialized string.
     *
     * First tries PHP's native unserialize(). Only when that fails is the
     * payload repaired (special bytes replaced, string lengths recomputed)
     * and unserialized a second time.
     *
     * NOTE(review): unserialize() can instantiate arbitrary classes; do not
     * feed this method data from an untrusted source.
     *
     * @link http://php.net/manual/en/function.unserialize.php#refsect1-function.unserialize-notes
     *
     * @param string $message Serialized payload.
     * @return mixed The unserialized value, or false when the payload cannot be recovered.
     */
    public function unserialize($message)
    {
        $record = @unserialize($message);

        // unserialize() returns false both on failure and for a serialized
        // boolean false, so 'b:0;' (the output of serialize(false)) has to be
        // recognised explicitly. Non-string input is never repaired.
        if ($record !== false || !is_string($message) || $message === 'b:0;') {
            return $record;
        }

        $repaired = $this->updateSerializedMessage(
            $this->replaceSpecialChars($message)
        );

        return @unserialize($repaired);
    }

    /**
     * Replace every byte matched by $nonCharsRegex with $replaceTo.
     * With the defaults, anything in the ranges 0-31 and 128-255 becomes '?'.
     *
     * @param string $message
     * @return string The cleaned message, or the input unchanged if the regex engine reports an error.
     */
    protected function replaceSpecialChars($message)
    {
        $cleaned = preg_replace($this->nonCharsRegex, $this->replaceTo, $message);

        // preg_replace() yields null on a PCRE error (for example a subclass
        // pattern using the "u" modifier on invalid UTF-8); keep the input then.
        return $cleaned === null ? $message : $cleaned;
    }

    /**
     * Recompute the byte length of every serialized string token (s:N:"...";)
     * so that the declared length matches the actual content again.
     *
     * A closing '";' is only accepted as the end of a string when it is
     * followed by the start of another serialized token, a closing brace or
     * the end of the input, which lets values that themselves contain '";'
     * survive. This is still a heuristic: a value that embeds text looking
     * exactly like a token boundary (such as '";s:') cannot be told apart.
     *
     * @param string $message
     * @return string The message with corrected lengths, or the input unchanged if the regex engine reports an error.
     */
    protected function updateSerializedMessage($message)
    {
        $fixed = preg_replace_callback(
            '/s:(\d+):"(.*?)";(?=[abdiOCEsrR]:|N;|\}|\z)/s',
            function ($matches) {
                // strlen() counts bytes, which is the unit unserialize() expects.
                return 's:' . strlen($matches[2]) . ':"' . $matches[2] . '";';
            },
            $message
        );

        return $fixed === null ? $message : $fixed;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The preg_replacement will fail if there's a "; (double quote, semicolon) in the string.