Last active
January 31, 2018 06:51
-
-
Save TimSC/df8ccab10400c5a9fc9ea0df1c1db7aa to your computer and use it in GitHub Desktop.
Encode to UCS-2 by using utf_16_be, then checking for invalid results
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from __future__ import unicode_literals | |
from __future__ import print_function | |
import struct | |
def CheckUcs2BEIsValid(e, big_endian=True):
    """Verify that UTF-16 encoded bytes are also valid UCS-2.

    UCS-2 is a fixed-width encoding: every character occupies exactly one
    16-bit code unit. UTF-16 extends it with surrogate pairs, so the data
    is only valid UCS-2 if no code unit falls in the surrogate range
    0xD800-0xDFFF.

    Args:
        e: Byte string made of 16-bit code units.
        big_endian: True (the default) if the code units are big-endian,
            False if they are little-endian.

    Returns:
        None when no code unit is a surrogate.

    Raises:
        ValueError: If the data is not a whole number of 16-bit code
            units, or if any code unit is a UTF-16 surrogate.
    """
    if len(e) % 2:
        # A dangling odd byte would otherwise surface as struct.error,
        # which callers that only catch ValueError would not handle.
        raise ValueError("Truncated 16-bit code unit")

    # Compile the format once instead of re-parsing it for every unit.
    unit = struct.Struct(">H" if big_endian else "<H")
    for offset in range(0, len(e), 2):
        val = unit.unpack_from(e, offset)[0]
        # High surrogates are 0xD800-0xDBFF and low surrogates are
        # 0xDC00-0xDFFF; both share the top five bits 11011.
        if (val & 0xf800) == 0xd800:
            raise ValueError("UTF-16 detected")
def CheckUcs2BEIsValid_Method2(e, s_len):
    """Check for UCS-2 validity by comparing encoded and source lengths.

    UCS-2 spends exactly two bytes on every character, so an encoded
    length other than twice the character count means the variable-width
    part of UTF-16 was used.

    Args:
        e: The encoded byte string.
        s_len: Number of characters in the string that was encoded.

    Raises:
        ValueError: If len(e) is not exactly two bytes per character.
    """
    fixed_width_size = s_len * 2
    if len(e) == fixed_width_size:
        return
    raise ValueError("Variable width detected")
if __name__ == "__main__":
    tests = ["abc$", "день€", "𐐷", "😀"]

    # Each entry pairs a codec with the validity check to run on its output.
    # UTF-16 is an extension of UCS-2, so every variant starts from a UTF-16
    # encoding and then checks that the result is valid UCS-2.
    variants = [
        ("utf_16_be", lambda data, text: CheckUcs2BEIsValid(data)),
        ("utf_16_le", lambda data, text: CheckUcs2BEIsValid(data, False)),
        ("utf_16_be",
         lambda data, text: CheckUcs2BEIsValid_Method2(data, len(text))),
    ]

    for codec, check in variants:
        for text in tests:
            try:
                encoded = text.encode(codec)
                check(encoded, text)
                print ("{} is ok".format(text))
            except ValueError as problem:
                print ("{} in {}, this isn't valid UCS-2".format(problem, text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment