authors | tags | description | keywords | image | date | draft | enableComments | ||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
Python Unicode |
|
2023-12-13 |
false |
true |
Python string use unicodeobject
which is implemented in C
in CPython:
Python chooses one of these three kinds of data type to internally represent for a Unicode-characters string, so every Unicode character of the string has the same fixed-length: 1, 2 or 4,
- UCS-1(1 byte), for ASCII characters between U+0000 and U+00FF
- UCS-2(2 bytes), for Unicode characters between U+00FF and U+FFFF
- UCS-4(4 bytes), for Unicode characters between U+00FFFF and U+10FFFF
There are 4 forms of Unicode strings:
- compact ascii
- compact
- legacy string
- legacy string ready
The legacy string
as I see is already deprecated after Python3 and the official says: "it will be removed after Python 4".
The compact
make the characters
data start just after the PyASCIIObject
or PyCompactUnicodeObject
structure, using just one block, whereas legacy strings
use one block for the structure and one block for characters
.
The characters
data are 1-byte, 2-byte or 4-byte code point
. The UnicodeObject will use the maximum length in fixed-size for each.
- Indexing into strings in Python is operated in a constant time, as it's based on the fixed-length encodings.
- Go
- Iterating yields Unicode code point
- Indexing yields a byte
- Rust
- Iteration yields Unicode code point(
method.chars()
) or byte(method.bytes
) - Indexing not supported
- Iteration yields Unicode code point(
- Python
- Iterating yields Unicode code point
- Indexing yields Unicode code point
References are mainly from:
A brief PyUnicodeObject
structure defined from unicodeobject.h:
typedef struct {
Py_ssize_t ob_refcnt;
PyTypeObject *ob_type;
} PyObject
/* --- Unicode Type ------------------------------------------------------- */
typedef struct {
/* There are 4 forms of Unicode strings:
- compact ascii:
* structure = PyASCIIObject
* test: PyUnicode_IS_COMPACT_ASCII(op)
* kind = PyUnicode_1BYTE_KIND
* compact = 1
* ascii = 1
* ready = 1
* (length is the length of the utf8 and wstr strings)
* (data starts just after the structure)
* (since ASCII is decoded from UTF-8, the utf8 string are the data)
- compact:
* structure = PyCompactUnicodeObject
* test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
PyUnicode_4BYTE_KIND
* compact = 1
* ready = 1
* ascii = 0
* utf8 is not shared with data
* utf8_length = 0 if utf8 is NULL
* wstr is shared with data and wstr_length=length
if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
* wstr_length = 0 if wstr is NULL
* (data starts just after the structure)
- legacy string, not ready:
* structure = PyUnicodeObject
* test: kind == PyUnicode_WCHAR_KIND
* length = 0 (use wstr_length)
* hash = -1
* kind = PyUnicode_WCHAR_KIND
* compact = 0
* ascii = 0
* ready = 0
* interned = SSTATE_NOT_INTERNED
* wstr is not NULL
* data.any is NULL
* utf8 is NULL
* utf8_length = 0
- legacy string, ready:
* structure = PyUnicodeObject structure
* test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
PyUnicode_4BYTE_KIND
* compact = 0
* ready = 1
* data.any is not NULL
* utf8 is shared and utf8_length = length with data.any if ascii = 1
* utf8_length = 0 if utf8 is NULL
* wstr is shared with data.any and wstr_length = length
if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
* wstr_length = 0 if wstr is NULL
Compact strings use only one memory block (structure + characters),
whereas legacy strings use one block for the structure and one block
for characters.
Legacy strings are created by PyUnicode_FromUnicode() and
PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
when PyUnicode_READY() is called.
See also _PyUnicode_CheckConsistency().
*/
PyObject_HEAD
Py_ssize_t length; /* Number of code points in the string */
Py_hash_t hash; /* Hash value; -1 if not set */
struct {
/*
SSTATE_NOT_INTERNED (0)
SSTATE_INTERNED_MORTAL (1)
SSTATE_INTERNED_IMMORTAL (2)
If interned != SSTATE_NOT_INTERNED, the two references from the
dictionary to this object are *not* counted in ob_refcnt.
*/
unsigned int interned:2;
/* Character size:
- PyUnicode_WCHAR_KIND (0):
* character type = wchar_t (16 or 32 bits, depending on the
platform)
- PyUnicode_1BYTE_KIND (1):
* character type = Py_UCS1 (8 bits, unsigned)
* all characters are in the range U+0000-U+00FF (latin1)
* if ascii is set, all characters are in the range U+0000-U+007F
(ASCII), otherwise at least one character is in the range
U+0080-U+00FF
- PyUnicode_2BYTE_KIND (2):
* character type = Py_UCS2 (16 bits, unsigned)
* all characters are in the range U+0000-U+FFFF (BMP)
* at least one character is in the range U+0100-U+FFFF
- PyUnicode_4BYTE_KIND (4):
* character type = Py_UCS4 (32 bits, unsigned)
* all characters are in the range U+0000-U+10FFFF
* at least one character is in the range U+10000-U+10FFFF
*/
unsigned int kind:3;
/* Compact is with respect to the allocation scheme. Compact unicode
objects only require one memory block while non-compact objects use
one block for the PyUnicodeObject struct and another for its data
buffer. */
unsigned int compact:1;
/* The string only contains characters in the range U+0000-U+007F (ASCII)
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
set, use the PyASCIIObject structure. */
unsigned int ascii:1;
/* The ready flag indicates whether the object layout is initialized
completely. This means that this is either a compact object, or
the data pointer is filled out. The bit is redundant, and helps
to minimize the test in PyUnicode_IS_READY(). */
unsigned int ready:1;
/* Padding to ensure that PyUnicode_DATA() is always aligned to
4 bytes (see issue #19537 on m68k). */
unsigned int :24;
} state;
wchar_t *wstr; /* wchar_t representation (null-terminated) */
} PyASCIIObject;
/* Non-ASCII strings allocated through PyUnicode_New use the
PyCompactUnicodeObject structure. state.compact is set, and the data
immediately follow the structure. */
typedef struct {
PyASCIIObject _base;
Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
* terminating \0. */
char *utf8; /* UTF-8 representation (null-terminated) */
Py_ssize_t wstr_length; /* Number of code points in wstr, possible
* surrogates count as two code points. */
} PyCompactUnicodeObject;
/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
PyUnicodeObject structure. The actual string data is initially in the wstr
block, and copied into the data block using _PyUnicode_Ready. */
typedef struct {
PyCompactUnicodeObject _base;
union {
void *any;
Py_UCS1 *latin1;
Py_UCS2 *ucs2;
Py_UCS4 *ucs4;
} data; /* Canonical, smallest-form Unicode buffer */
} PyUnicodeObject;
As it's known that each Unicode character in string is represented by a Unicode code point. In PyUnicodeObject
, these code points are the encoding saved in the data
, so PyUnicodeObject
does not use the UTF-8
encoding in the data
.
Invokes several internal C functions in such a sequence generally,
PyObject *PyUnicode_FromStringAndSize(const char *str, Py_ssize_t size)
PyObject *PyUnicode_DecodeUTF8Stateful(const char *str, Py_ssize_t size, const char *errors, Py_ssize_t *consumed)
static PyObject *unicode_decode_utf8(const char *s, Py_ssize_t size, _Py_error_handler error_handler, const char *errors, Py_ssize_t *consumed)
PyObject *PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
static Py_ssize_t ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
// ucs2lib.h
#define STRINGLIB(F) ucs2lib_##F
STRINGLIB(utf8_decode)(const char **inptr, const char *end,
STRINGLIB_CHAR *dest,
Py_ssize_t *outpos)
Let's examine the internal data struct of a string object in modern Python 3.
:::NOTE
You keep the character being referred otherwise the GC
may release that memory,
Define the layout mapping unicodeobject
,
import ctypes
# It's recommended to go to see [python 3.10 unicodeobject.h](https://github.com/python/cpython/blob/3.10/Include/cpython/unicodeobject.h#L85-L244)
class PyASCIIObject(ctypes.Structure):
# internal fields of the string object
_fields_ = [
("ob_refcnt", ctypes.c_long),
("ob_type", ctypes.c_void_p),
("length", ctypes.c_ssize_t),
("hash", ctypes.c_ssize_t),
("interned", ctypes.c_uint, 2),
("kind", ctypes.c_uint, 3),
("compact", ctypes.c_uint, 1),
("ascii", ctypes.c_uint, 1),
("ready", ctypes.c_uint, 1),
("_padding", ctypes.c_uint, 24),
("wstr", ctypes.POINTER(ctypes.c_wchar))
]
def __repr__(self) -> str:
return f"ob_refcnt[{self.ob_refcnt}], length[{self.length}], interned[{self.interned}], kind[{self.kind}], compact[{self.compact}], ascii[{self.ascii}], ready[{self.ready}]"
class PyCompactUnicodeObject(PyASCIIObject):
# internal fields of the string object
_fields_ = [
("utf8_length", ctypes.c_ssize_t),
("utf8", ctypes.POINTER(ctypes.c_char)),
("wstr_length", ctypes.c_ssize_t),
]
def __repr__(self) -> str:
return super().__repr__() + f" utf8_length[{self.utf8_length}], utf8[{self.utf8}], wstr_length[{self.wstr_length}]"
class PyUnicodeObject(PyCompactUnicodeObject):
class _Data(ctypes.Union):
_fields_ = [
("any", ctypes.c_void_p),
("latin1", ctypes.POINTER(ctypes.c_uint8)),
("ucs2", ctypes.POINTER(ctypes.c_uint16)),
("ucs4", ctypes.POINTER(ctypes.c_uint32)),
]
_fields_ = [
("data", _Data),
]
- Type: compact ascii. Key fields: kind[1], compact[1], ascii[1], ready[1]
>>> string_obj = "Hello, ctypes!"
>>> addr = id(string_obj)
>>> ascii_obj = PyASCIIObject.from_address(addr)
>>> print(ascii_obj)
ob_refcnt[1], length[14], interned[0], kind[1], compact[1], ascii[1], ready[1]
>>>
>>> # compact ascii: data starts just after the structure
>>> data_addr = addr + ctypes.sizeof(PyASCIIObject)
>>> data = ctypes.cast(data_addr, ctypes.c_char_p)
>>> print(f"data: {data.value}")
data: b'Hello, ctypes!'
- Type: compact
UCS-2
. Key fields: kind[1], compact[1], ascii[1], ready[1]
>>> string_obj = "你好!"
>>> addr = id(string_obj)
>>> ascii_obj = PyASCIIObject.from_address(addr)
>>> print(ascii_obj)
ob_refcnt[1], length[3], interned[0], kind[2], compact[1], ascii[0], ready[1]
>>>
>>> compact_obj = PyCompactUnicodeObject.from_address(addr)
>>> print(compact_obj)
ob_refcnt[1], length[3], interned[0], kind[2], compact[1], ascii[0], ready[1] utf8_length[0], utf8[<ctypes.LP_c_char object at 0x7f0c29297ac
0>], wstr_length[0]
>>>
>>> # compact: data starts just after the structure
>>> data_addr = addr + ctypes.sizeof(PyCompactUnicodeObject)
>>> data = ctypes.cast(data_addr, ctypes.POINTER(ctypes.c_uint16))
>>> print(f"data: {data[0]}, {data[0]:#06x}, {chr(data[0])}")
data: 20320, 0x4f60, 你
>>> print(f"data: {data[1]}, {data[1]:#06x}, {chr(data[1])}")
data: 22909, 0x597d, 好
>>> print(f"data: {data[2]}, {data[2]:#06x}, {chr(data[2])}")
data: 33, 0x0021, !
>>> print(f"data: {data[3]}, {data[3]:#06x}, {chr(data[3])}")
data: 0, 0x0000,
- Type: compact
UCS-4
. Key fields: kind[4], compact[1], ascii[1], ready[1]
>>> string_obj = "你好🤨"
>>> addr = id(string_obj)
>>> ascii_obj = PyASCIIObject.from_address(addr)
>>> print(ascii_obj)
ob_refcnt[1], length[3], interned[0], kind[4], compact[1], ascii[0], ready[1]
>>>
>>> compact_obj = PyCompactUnicodeObject.from_address(addr)
>>> print(compact_obj)
ob_refcnt[1], length[3], interned[0], kind[4], compact[1], ascii[0], ready[1] utf8_length[0], utf8[<ctypes.LP_c_char object at 0x7f0c292b1ac
0>], wstr_length[3]
>>>
>>> # compact: data starts just after the structure
>>> data_addr = addr + ctypes.sizeof(PyCompactUnicodeObject)
>>> data = ctypes.cast(data_addr, ctypes.POINTER(ctypes.c_uint32))
>>> print(f"data: {data[0]}, {data[0]:#010x}, {chr(data[0])}")
data: 20320, 0x00004f60, 你
>>> print(f"data: {data[1]}, {data[1]:#010x}, {chr(data[1])}")
data: 22909, 0x0000597d, 好
>>> print(f"data: {data[2]}, {data[2]:#010x}, {chr(data[2])}")
data: 129320, 0x0001f928, 🤨
>>> print(f"data: {data[3]}, {data[3]:#010x}, {chr(data[3])}")
data: 0, 0x00000000,
- Type: legacy string. Key fields: kind[2], compact[0], ascii[0]
I can't produce it in Python3.10, maybe you can try python2.7.
All codes are at object layout
How Python saves memory when storing strings | Artem Golubin
Python behind the scenes #9: how Python strings work