Skip to content

Instantly share code, notes, and snippets.

@TimSC
Last active January 31, 2018 06:51
Show Gist options
  • Save TimSC/df8ccab10400c5a9fc9ea0df1c1db7aa to your computer and use it in GitHub Desktop.
Save TimSC/df8ccab10400c5a9fc9ea0df1c1db7aa to your computer and use it in GitHub Desktop.
Encode to UCS-2 by encoding with utf_16_be, then checking the result for invalid (variable-width) sequences
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import print_function
import struct
def CheckUcs2BEIsValid(e, big_endian=True):
    """Verify that the byte string *e* only contains fixed-width UCS-2 units.

    UCS-2 is a fixed width encoding. Therefore, check that the variable
    width aspect of UTF-16 (surrogate code units) is not used.

    Args:
        e: Encoded bytes, e.g. the result of ``s.encode("utf_16_be")``.
        big_endian: Truthy to read each 16-bit unit as big-endian (the
            default), falsy to read it as little-endian.

    Returns:
        None when every 16-bit unit is acceptable.

    Raises:
        ValueError: If *e* has an odd number of bytes, or if any 16-bit
            unit is a UTF-16 surrogate.
    """
    if len(e) % 2:
        # A trailing lone byte cannot form a 16-bit unit. Report it as a
        # ValueError so callers catching ValueError see it, instead of
        # letting struct.error escape from the unpack call.
        raise ValueError("Odd number of bytes")
    unit = struct.Struct(">H" if big_endian else "<H")
    for offset in range(0, len(e), 2):
        val = unit.unpack_from(e, offset)[0]
        # The top six bits are 0xd800 for high surrogates and 0xdc00 for
        # low surrogates; either one means variable-width UTF-16.
        if (val & 0xfc00) in (0xd800, 0xdc00):
            raise ValueError("UTF-16 detected")
def CheckUcs2BEIsValid_Method2(e, s_len):
    """Length-based UCS-2 check.

    UCS-2 is a fixed width encoding, so the encoded data must be exactly
    two bytes per character. Any other length means the variable width
    aspect of UTF-16 was used.

    Args:
        e: The encoded byte string.
        s_len: Number of characters in the string that was encoded.

    Raises:
        ValueError: If ``len(e)`` is not exactly ``s_len * 2``.
    """
    fixed_width_size = s_len * 2
    if len(e) == fixed_width_size:
        return
    raise ValueError("Variable width detected")
if __name__ == "__main__":
    samples = ["abc$", "день€", "𐐷", "😀"]

    # Each pass pairs a UTF-16 codec with the validator applied to its output.
    # UTF-16 is an extension of UCS-2, so every pass starts by encoding with
    # UTF-16 and then checks that the result is valid UCS-2.
    passes = [
        ("utf_16_be", lambda raw, text: CheckUcs2BEIsValid(raw)),
        ("utf_16_le", lambda raw, text: CheckUcs2BEIsValid(raw, False)),
        ("utf_16_be", lambda raw, text: CheckUcs2BEIsValid_Method2(raw, len(text))),
    ]

    for codec, validate in passes:
        for text in samples:
            try:
                validate(text.encode(codec), text)
                print("{} is ok".format(text))
            except ValueError as problem:
                print("{} in {}, this isn't valid UCS-2".format(problem, text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment