Last active
August 29, 2015 14:07
-
-
Save wjt/4fdc0788caf386e82ce6 to your computer and use it in GitHub Desktop.
Calling C functions which accept UTF-16 as a 0-terminated array of int16 using ctypes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "csa.h" | |
#include <iconv.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
/*
 * Convert a 0-terminated array of 16-bit values into a newly allocated,
 * NUL-terminated UTF-8 string.
 *
 * Each element is serialized high byte first, then low byte, and the
 * resulting byte buffer is handed to iconv labelled as "UTF-16LE".
 * NOTE(review): high-byte-first is not the little-endian layout of a native
 * code unit, so each element must hold its code unit byte-swapped (for
 * example 0x4800 for 'H'). to_uint16_array() in the accompanying Python
 * script packs its values that way; change both sides together or not at all.
 *
 * Returns the UTF-8 string, which the caller owns and must free(), or NULL
 * if an allocation, iconv_open() or iconv() fails (the failure is reported
 * with perror()).
 */
char *
from_short_array(uint16_t *shorts)
{
  size_t i, n;
  char *utf_16_le = NULL;
  char *utf8 = NULL;
  char *result = NULL;
  char *src, *dst;
  size_t inbytesleft, outbytesleft;
  iconv_t utf_16le_to_utf8;

  /* Count the elements before the 0 terminator. */
  for (n = 0; shorts[n] != 0; n++)
    ;

  /* Non-surrogate elements are at most U+FFFF, which needs at most 3 bytes
   * of UTF-8. A surrogate pair occupies two elements and needs 4 bytes of
   * UTF-8, which is less than 2 * 3. So 3 bytes per element, plus one for
   * the terminating NUL, is always enough.
   */
  utf_16_le = malloc(2 * (n + 1));
  utf8 = malloc(3 * n + 1);
  if (utf_16_le == NULL || utf8 == NULL)
    {
      perror("malloc");
      goto cleanup;
    }

  /* i <= n: the terminator is copied too, although only 2 * n bytes are
   * offered to iconv below. */
  for (i = 0; i <= n; i++)
    {
      uint16_t ch = shorts[i];

      utf_16_le[2 * i] = (char) (ch / 0x100);
      utf_16_le[2 * i + 1] = (char) (ch % 0x100);
    }

  utf_16le_to_utf8 = iconv_open("UTF-8", "UTF-16LE");
  if (utf_16le_to_utf8 == (iconv_t) -1)
    {
      perror("iconv_open");
      goto cleanup;
    }

  src = utf_16_le;
  inbytesleft = 2 * n;
  dst = utf8;
  outbytesleft = 3 * n;

  if (iconv(utf_16le_to_utf8, &src, &inbytesleft, &dst, &outbytesleft)
      == (size_t) -1)
    {
      perror("iconv");
    }
  else
    {
      /* iconv advanced dst past the last byte it wrote. */
      *dst = '\0';
      result = utf8;
      utf8 = NULL; /* ownership moves to the caller */
    }

  iconv_close(utf_16le_to_utf8);

cleanup:
  /* free(NULL) is a no-op, so this is safe on every path. */
  free(utf_16_le);
  free(utf8);
  return result;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef CSA_H
#define CSA_H

#include <stdint.h>

/*
 * Convert a 0-terminated array of 16-bit values to a newly allocated,
 * NUL-terminated UTF-8 string. Returns NULL on failure.
 */
char *from_short_array(uint16_t *shorts);

#endif /* CSA_H */
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 'all' names no file, so mark it phony: a stray file called 'all' must not
# suppress the build.
.PHONY: all
all: csa.so

# Only csa.c ($<) is compiled; csa.h is a prerequisite so that editing it
# triggers a rebuild.
csa.so: csa.c csa.h
	gcc -fPIC -shared -Wall -Werror -o $@ $<
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# vim: set fileencoding=utf-8 | |
import ctypes | |
def to_uint16_array(string):
    """Pack a unicode string into a 0-terminated list of 16-bit integers.

    The string is encoded as UTF-16LE; every two-byte unit becomes one
    integer computed as ``first_byte * 0x100 + second_byte``, and a single
    0 is appended as terminator.

    NOTE(review): the first byte of a little-endian unit is its low-order
    byte, so each integer is the byte-swapped code unit (u'H' -> 0x4800).
    from_short_array() in csa.c unpacks each element high byte first, which
    undoes the swap; change both sides together or not at all.
    """
    # bytearray yields ints when iterated on both Python 2 and Python 3,
    # whereas indexing the encoded str/bytes differs between the two.
    utf_16 = bytearray(string.encode('utf_16_le'))
    assert len(utf_16) % 2 == 0
    uint16s = [
        first * 0x100 + second
        for first, second in zip(utf_16[0::2], utf_16[1::2])
    ]
    uint16s.append(0)
    return uint16s
_csa = ctypes.CDLL('./csa.so')
# NOTE(review): with c_char_p, ctypes copies the result into a Python byte
# string and discards the pointer, so the buffer allocated by
# from_short_array is never freed. Acceptable for this short-lived test.
_csa.from_short_array.restype = ctypes.c_char_p
# The parameter is a uint16_t*. ctypes accepts a (c_uint16 * n) array
# instance wherever POINTER(c_uint16) is declared.
_csa.from_short_array.argtypes = [ctypes.POINTER(ctypes.c_uint16)]
#
# One tempting option is to use ctypes.c_char_p, and pass in
# string.encode('utf_16_le'), but this is wrong: it only adds a single byte of
# NUL-terminator.
def test_roundtrip(string):
    """Check that *string* survives a trip through from_short_array.

    The string is packed with to_uint16_array(), copied into a ctypes
    c_uint16 array, passed to the C function, and the returned bytes are
    decoded as UTF-8 and compared with the input.

    Raises AssertionError if the C call returns NULL or the decoded result
    differs from *string*.
    """
    u16a = to_uint16_array(string)
    arg = (ctypes.c_uint16 * len(u16a))(*u16a)
    ret = _csa.from_short_array(arg)
    # A NULL char* comes back from ctypes as None; fail with a clear message
    # instead of an AttributeError on .decode.
    assert ret is not None, 'from_short_array returned NULL for %r' % (string,)
    ret_u = ret.decode('utf-8')
    assert string == ret_u, (repr(string), repr(ret_u))
# Round-trip inputs: plain ASCII, a non-ASCII Latin-1 letter, and a character
# outside the Basic Multilingual Plane (a surrogate pair in UTF-16).
cases = [u'Hello world', u'Héllo world', u'Hello 😼']
def go():
    """Run test_roundtrip on every entry of ``cases``, in order."""
    for text in cases:
        test_roundtrip(text)


# Run the checks only when executed as a script, not when imported.
if __name__ == '__main__':
    go()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment