Skip to content

Instantly share code, notes, and snippets.

@wjt
Last active August 29, 2015 14:07
Show Gist options
  • Save wjt/4fdc0788caf386e82ce6 to your computer and use it in GitHub Desktop.
Save wjt/4fdc0788caf386e82ce6 to your computer and use it in GitHub Desktop.
Calling C functions which accept UTF-16 as a 0-terminated array of int16 using ctypes
#include "csa.h"
#include <iconv.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Convert a 0-terminated array of 16-bit values to a newly allocated,
 * NUL-terminated UTF-8 string.
 *
 * Each element of 'shorts' is unpacked high byte first into a byte buffer,
 * and that buffer is then decoded by iconv as UTF-16LE.  In other words the
 * high byte of every element is treated as the first byte of a little-endian
 * code unit; callers must pack their values accordingly.
 *
 * Returns a malloc()ed string which the caller must free(), or NULL if an
 * allocation or the conversion fails (the failure is reported via perror()).
 */
char *
from_short_array(uint16_t *shorts)
{
    size_t i, n = 0;
    char *utf_16_le = NULL, *utf_16_le_;
    char *utf8 = NULL, *utf8_;
    char *result = NULL;
    iconv_t utf_16le_to_utf8 = (iconv_t) -1;
    size_t ret, inbytesleft, outbytesleft;

    /* Count the elements before the 0 terminator. */
    while (shorts[n] != 0)
        n++;

    /* A non-surrogate element is at most U+FFFF, which needs at most 3 bytes
     * of UTF-8.  A surrogate pair occupies two elements and needs 4 bytes of
     * UTF-8.  So 3 bytes per element, plus one for the terminating NUL, is
     * always enough space to hold the full string.
     */
    utf_16_le = malloc(2 * (n + 1));
    utf8 = malloc(3 * n + 1);
    if (utf_16_le == NULL || utf8 == NULL) {
        perror("malloc");
        goto out;
    }

    /* Unpack every element, including the terminator, into bytes. */
    for (i = 0; i <= n; i++) {
        uint16_t ch = shorts[i];

        utf_16_le[2 * i] = (char) (ch / 0x100);
        utf_16_le[2 * i + 1] = (char) (ch % 0x100);
    }

    utf_16le_to_utf8 = iconv_open("UTF-8", "UTF-16LE");
    if (utf_16le_to_utf8 == (iconv_t) -1) {
        perror("iconv_open");
        goto out;
    }

    /* iconv advances the cursor copies, so the originals stay valid for
     * free() and for returning the start of the output.
     */
    utf8_ = utf8;
    outbytesleft = 3 * n;
    utf_16_le_ = utf_16_le;
    inbytesleft = 2 * n;  /* the terminator is not converted */

    ret = iconv(utf_16le_to_utf8, &utf_16_le_, &inbytesleft, &utf8_, &outbytesleft);
    if (ret == (size_t) -1) {
        perror("iconv");
        goto out;
    }

    *utf8_ = '\0';
    result = utf8;
    utf8 = NULL;  /* ownership passes to the caller */

out:
    /* Single exit path: release everything that is not being returned. */
    if (utf_16le_to_utf8 != (iconv_t) -1)
        iconv_close(utf_16le_to_utf8);
    free(utf_16_le);
    free(utf8);
    return result;
}
#include <stdint.h>
/*
 * Convert the 0-terminated array 'shorts' to a malloc()ed, NUL-terminated
 * UTF-8 string.  Returns NULL if the conversion fails.
 */
char *from_short_array(uint16_t *shorts);
# 'all' names no real file, so mark it phony to keep a stray file called
# "all" from masking the build.
.PHONY: all
all: csa.so

# $@ expands to the target (csa.so) and $< to the first prerequisite (csa.c).
# The recipe line must begin with a tab character.
csa.so: csa.c csa.h
	gcc -fPIC -shared -Wall -Werror -o $@ $<
# vim: set fileencoding=utf-8 :
import ctypes
def to_uint16_array(string):
    """Return *string* as a 0-terminated list of 16-bit integers.

    The string is encoded as UTF-16LE and every pair of bytes is combined as
    ``first_byte * 0x100 + second_byte``; a trailing ``0`` is appended as the
    terminator.  Note that the first byte of each pair ends up in the high
    half of the resulting integer.
    """
    # A bytearray yields ints when indexed or iterated on both Python 2 and
    # Python 3, whereas calling ord() on an indexed bytes object raises
    # TypeError on Python 3.
    utf_16 = bytearray(string.encode('utf_16_le'))
    assert len(utf_16) % 2 == 0
    uint16s = [
        first * 0x100 + second
        for first, second in zip(utf_16[0::2], utf_16[1::2])
    ]
    uint16s.append(0)
    return uint16s
_csa = ctypes.CDLL('./csa.so')
# With c_char_p as the restype, ctypes converts the returned C string into a
# Python bytes object.
_csa.from_short_array.restype = ctypes.c_char_p
# The C parameter is a uint16_t *, so declare it as a pointer to c_uint16;
# ctypes accepts a (c_uint16 * n) array instance for such a parameter.
#
# One tempting option is to use ctypes.c_char_p, and pass in
# string.encode('utf_16_le'), but this is wrong: that only provides a single
# NUL byte of terminator, whereas the C function scans for a 16-bit (two-byte)
# zero element.
_csa.from_short_array.argtypes = [ctypes.POINTER(ctypes.c_uint16)]
def test_roundtrip(string):
    """Check that *string* survives a trip through the C function.

    The string is packed into a 0-terminated ctypes array of c_uint16, handed
    to ``from_short_array``, and the UTF-8 result is decoded and compared with
    the original.
    """
    units = to_uint16_array(string)
    array_type = ctypes.c_uint16 * len(units)
    raw_utf8 = _csa.from_short_array(array_type(*units))
    decoded = raw_utf8.decode('utf-8')
    assert string == decoded, (repr(string), repr(decoded))
# Strings exercised by the round-trip check.
cases = [
    # Plain ASCII.
    u'Hello world',
    # Contains a non-ASCII character.
    u'Héllo world',
    # Contains a character outside the Basic Multilingual Plane, which
    # UTF-16 represents as a surrogate pair (two 16-bit units).
    u'Hello 😼',
]
def go():
    """Run the round-trip check on every entry of ``cases``, in order."""
    for text in cases:
        test_roundtrip(text)
# Run the checks only when this file is executed as a script.
if __name__ == '__main__':
    go()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment