Last active
July 14, 2022 19:44
-
-
Save theredpea/9772384 to your computer and use it in GitHub Desktop.
Understanding UTF-8 Encoding algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# - - coding: utf-8 - - | |
# This class http://www.w3.org/International/URLUTF8Encoder.java | |
# In Python | |
# To understand how | |
# 新 ("xin", 1st tone, means "new" in Mandarin) | |
# U+65B0 (Unicode codepoint http://unicode-table.com/en/search/?q=%E6%96%B0) | |
# %E6%96%B0 (URL encoding in UTF-8 http://www.url-encode-decode.com/) | |
# Interesting: Not fixed-width! "新" not same size as "T" | |
# Also remember Unicode != Encodings http://www.joelonsoftware.com/articles/Unicode.html | |
relevant_functions ={f.__name__:f.__doc__ for f in (ord, chr, unichr)} | |
relevant_links = ('http://stackoverflow.com/a/227472/1175496',) | |
relevant_knowledge =('0x7F == 0x007F', | |
'0x7f == 0x7F', | |
'0x7F == 127', | |
""">>>#Right-shifting divides by two then floors | |
>>> int(bin(10),2) | |
10 | |
>>> int(bin(10>>1),2) | |
5 | |
>>> int(bin(10>>2),2) | |
2 | |
>>> int(bin(10>>3),2) | |
1 | |
>>> int(bin(10>>4),2) | |
0""") | |
'=='.join(relevant_knowledge) | |
hex_range = '0123456789abcdef' | |
hex_gen = ('%'+f+s for f in hex_range for s in hex_range) | |
hex_tuple = tuple(hex_gen) | |
assert len(hex_tuple)==256 | |
xin = u'新' | |
xin_unicode_point = '65B0' | |
#In Python, can lign up equalities and transitive (?) property means all are the same | |
assert ord(xin)==int(xin_unicode_point, 16) == 26032 | |
import string | |
digs = string.digits + string.lowercase | |
def int2base(x, base): | |
"""http://stackoverflow.com/a/2267446/1175496""" | |
if x < 0: sign = -1 | |
elif x==0: return '0' | |
else: sign = 1 | |
x *= sign | |
digits = [] | |
while x: | |
digits.append(digs[x % base]) | |
x /= base | |
if sign < 0: | |
digits.append('-') | |
digits.reverse() | |
return ''.join(digits) | |
def encode(string, dr_yang=False): | |
""" | |
Encode a string to the "x-www-form-urlencoded" form, enhanced | |
with the UTF-8-in-URL proposal. This is what happens: | |
<ul> | |
<li><p>The ASCII characters 'a' through 'z', 'A' through 'Z', | |
and '0' through '9' remain the same. | |
<li><p>The unreserved characters - _ . ! ~ ' ( ) remain the same. | |
<li><p>The space character ' ' is converted into a plus sign '+'. | |
<li><p>All other ASCII characters are converted into the | |
3-character string "%xy", where xy is | |
the two-digit hexadecimal representation of the character | |
code | |
<li><p>All non-ASCII characters are encoded in two steps: first | |
to a sequence of 2 or 3 bytes, using the UTF-8 algorithm; | |
secondly each of these bytes is encoded as "%xx". | |
</ul> | |
@param s The string to be encoded | |
@return The encoded string""" | |
sbuf = [] | |
for char in string: | |
#In Java, chr can be stored in int http://docs.oracle.com/javase/specs/jls/se7/html/jls-5.html jls-5.1.3 | |
char_int = ord(char) | |
if ( ord('A') <= char_int <= ord('Z') | |
or ord('a') <= char_int <= ord('z') | |
or ord('0') <= char_int <= ord('0')): | |
sbuf+= char | |
elif char == ' ': | |
sbuf+='+' | |
elif char in '-_.!~ \()': | |
sbuf+= char | |
"""All non-ASCII characters are encoded in two steps: first | |
to a sequence of 2 or 3 bytes, using the UTF-8 algorithm; | |
secondly each of these bytes is encoded as "%xx".""" | |
#http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding-Algorithm.html | |
#http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding.html | |
utf8_algorithm = """ | |
1) | |
If a code point is the U+0000...U+007F range, it can be viewed as a 7-bit integer, 0bxxxxxxx. | |
Map the code point into 1 byte with the first high order bit set to 0 as: B1 = 0b0xxxxxx. | |
2) | |
If a code point is the U+0080...U+07FF range, it can be viewed as a 11-bit integer, 0byyyyyxxxxxx. | |
Map the code point into 2 bytes with first 5 bits stored in the first byte | |
and last 6 bits in the second byte: as: B1 = 0b110yyyyy, B2 = 0b10xxxxxx. | |
3) | |
If a code point is the U+0800...U+FFFF range, it can be viewed as a 16-bit integer, 0bzzzzyyyyyyxxxxxx. | |
Map the code point into 3 bytes with first 4 bits stored in the first byte, next 6 bits in the second byte, | |
and last 6 bits in the third byte: as: B1 = 0b1110zzzz, B2 = 0b10yyyyyy, B3 = 0b10xxxxxx. | |
4) | |
If a code point is the U+10000...U+10FFFF range, it can be viewed as a 21-bit integer, 0bvvvzzzzzzyyyyyyxxxxxx. | |
Map the code point into 4 bytes with first 3 bits stored in the first byte, next 6 bits in the second byte, | |
another 6 bits in the third byte, | |
and last 6 bits in the fourth byte: as: B1 = 0b11110xxx, B2 = 0b10zzzzzz, B3 = 0b10yyyyyy, B4 = 0b10xxxxxx. | |
Copyright © 2014 by Dr. Herong Yang. All rights reserved.""" | |
utf8_1 = len(int2base(0x0000,2)) == 1 and len(int2base(0x007F, 2)) == 7 | |
utf8_2 = len(int2base(0x0080,2)) == 8 and len(int2base(0x07FF, 2)) == 11 | |
utf8_3 = len(int2base(0x0800,2)) == 12 and len(int2base(0xFFFF, 2)) == 16 | |
utf8_4 = len(int2base(0x10000,2)) == 17 and len(int2base(0x10FFFF, 2)) == 21 | |
#The four buckets described above; that their bit representations are possible | |
assert utf8_1 and utf8_2 and utf8_3 and utf8_4 | |
utf8_2_1_filler = int(int2base(6,2)) == 110 | |
utf8_2_1_filler = int(int2base(0xc0,2)) == 110 * 10**5 | |
utf8_2_2_filler = int(int2base(0x80,2)) == 10 * 10**6 #Also the fillter for 3_2, 3_3, 3_2, _4_2, 4_3, and 4_4 | |
elif char_int <= 0x007f: | |
#TODO: Understand Dr. Yang's algorithm (not working, uncommented) | |
#vs Java algorithm (failing, commented) | |
if dr_yang: | |
sbuf+= hex_tuple[char_int>>6 & 0x1F | 0xC0] | |
sbuf+= hex_tuple[char_int>>0 & 0x3F | 0x80] | |
else: | |
sbuf+=hex_tuple[0xc0 | (char_int >> 6)] | |
sbuf+=hex_tuple[0x80 | (char_int & 0x3F)] | |
else: | |
if dr_yang: | |
sbuf+=hex_tuple[char_int>>12 & 0x0F | 0xE0] | |
sbuf+=hex_tuple[char_int>>6 & 0x3F | 0x80] | |
sbuf+=hex_tuple[char_int>>0 & 0x3F | 0x80] | |
else: | |
sbuf+=hex_tuple[0xe0 | (char_int >> 12)] | |
sbuf+=hex_tuple[0x80 | ((char_int >> 6) & 0x3F)] | |
sbuf+=hex_tuple[0x80 | (char_int & 0x3F)] | |
return ''.join(sbuf) | |
exp_ints = ('%E6','%96','%B0') | |
exp_enc_xin = ''.join(exp_ints) | |
enc_xin = encode(xin) | |
assert enc_xin.lower() == exp_enc_xin.lower(), '{0} is not {1}'.format(enc_xin, exp_enc_xin) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment