wjt · August 29, 2015 14:07
diff --git a/csa.c b/csa.c
 #include "csa.h"

 #include <iconv.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 /*
  * Convert a 0-terminated array of 16-bit values into a newly allocated,
  * NUL-terminated UTF-8 string.
  *
  * Each element of 'shorts' is written out high byte first and the resulting
  * byte stream is decoded as UTF-16LE. Every element therefore carries the
  * first byte of a UTF-16LE code unit in its upper 8 bits and the second byte
  * in its lower 8 bits.
  *
  * Returns a malloc()ed string that the caller must free(), or NULL when an
  * allocation or the conversion fails (a message is printed with perror()).
  */
 char *
 from_short_array(uint16_t *shorts)
 {
   size_t i, n = 0;
   char *utf_16_le = NULL, *utf_16_le_;
   char *utf8 = NULL, *utf8_;
   char *result = NULL;
   iconv_t utf_16le_to_utf8 = (iconv_t) -1;
   size_t inbytesleft, outbytesleft;

   /* Count the elements that precede the 0 terminator. */
   while (shorts[n] != 0)
     n++;

   utf_16_le = malloc(2 * (n + 1));
   if (utf_16_le == NULL)
     {
       perror("malloc");
       goto out;
     }

   /* Serialise every element, terminator included, high byte first. */
   for (i = 0; i <= n; i++)
     {
       uint16_t ch = shorts[i];

       utf_16_le[2 * i] = (char) (ch / 0x100);
       utf_16_le[2 * i + 1] = (char) (ch % 0x100);
     }

   utf_16le_to_utf8 = iconv_open("UTF-8", "UTF-16LE");
   if (utf_16le_to_utf8 == (iconv_t) -1)
     {
       perror("iconv_open");
       goto out;
     }

   /* Non-surrogate characters in 'shorts' are at most U+FFFF which fits in at
    * most 3 bytes in UTF-8. Surrogate pairs fit in at most 6 bytes of UTF-8,
    * and are two elements of 'shorts'. So this is enough space to hold the full
    * string, plus one byte for the terminating NUL.
    */
   utf8 = malloc(3 * n + 1);
   if (utf8 == NULL)
     {
       perror("malloc");
       goto out;
     }
   utf8_ = utf8;
   outbytesleft = 3 * n;

   /* The terminator is not converted; it is appended by hand below. */
   utf_16_le_ = utf_16_le;
   inbytesleft = 2 * n;

   if (iconv(utf_16le_to_utf8, &utf_16_le_, &inbytesleft, &utf8_, &outbytesleft)
       == (size_t) -1)
     {
       /* Report before any cleanup call gets a chance to clobber errno. */
       perror("iconv");
       goto out;
     }

   *utf8_ = '\0';
   result = utf8;
   utf8 = NULL; /* ownership moves to the caller */

 out:
   if (utf_16le_to_utf8 != (iconv_t) -1)
     iconv_close(utf_16le_to_utf8);
   free(utf8);
   free(utf_16_le);
   return result;
 }
diff --git a/csa.h b/csa.h
 #include <stdint.h>

 /* Convert the 0-terminated array 'shorts' to a malloc()ed, NUL-terminated
  * UTF-8 string. Each element is serialised high byte first and the bytes are
  * then decoded as UTF-16LE. Returns NULL (after calling perror()) if
  * iconv_open() or iconv() fails.
  */
 char *from_short_array(uint16_t *shorts);
diff --git a/Makefile b/Makefile
 # Default goal: build the shared library.
 all: csa.so

 # Build csa.so as a position-independent shared object with warnings treated
 # as errors. $@ expands to the target (csa.so) and $< to the first
 # prerequisite (csa.c); csa.h is listed only so that edits to it trigger a
 # rebuild.
 csa.so: csa.c csa.h
 	gcc -fPIC -shared -Wall -Werror -o $@ $<
diff --git a/usescsa.py b/usescsa.py
 # vim: set fileencoding=utf-8
 import ctypes

 def to_uint16_array(string):
    """Return `string` as a 0-terminated list of 16-bit integers.

    The text is encoded as UTF-16LE and every pair of bytes is folded into one
    integer with the FIRST byte in the high 8 bits, so each value is the
    byte-swapped form of the corresponding UTF-16 code unit. A single 0 is
    appended as terminator.
    """
    # bytearray yields integers on both Python 2 and Python 3; indexing the
    # encoded string directly yields a 1-character str on Python 2 but an int
    # on Python 3, where ord() would then raise TypeError.
    utf_16 = bytearray(string.encode('utf_16_le'))
    assert len(utf_16) % 2 == 0

    uint16s = [
        utf_16[i] * 0x100 + utf_16[i + 1]
        for i in range(0, len(utf_16), 2)
    ]
    uint16s.append(0)
    return uint16s

 # Load the shared library from the current working directory.
 _csa = ctypes.CDLL('./csa.so')
 # NOTE: with c_char_p as restype, ctypes copies the returned C string into a
 # Python byte string and discards the pointer, so the buffer allocated by the
 # C side is never freed.
 _csa.from_short_array.restype = ctypes.c_char_p
 # The C prototype takes a `uint16_t *`. ctypes accepts an array instance such
 # as `(ctypes.c_uint16 * n)(...)` wherever POINTER(c_uint16) is declared.
 _csa.from_short_array.argtypes = [ctypes.POINTER(ctypes.c_uint16)]
 #
 # One tempting option is to use ctypes.c_char_p, and pass in
 # string.encode('utf_16_le'), but this is wrong: it only adds a single byte of
 # NUL-terminator.

 def test_roundtrip(string):
    """Check that `string` survives a trip through from_short_array().

    The string is turned into a ctypes array of c_uint16, handed to the C
    function, and the UTF-8 bytes that come back are decoded and compared with
    the original text.
    """
    code_units = to_uint16_array(string)
    array_type = ctypes.c_uint16 * len(code_units)
    ret = _csa.from_short_array(array_type(*code_units))
    ret_u = ret.decode('utf-8')
    assert string == ret_u, (repr(string), repr(ret_u))

 # Strings fed through test_roundtrip() by go().
 cases = [
    # Pure ASCII.
    u'Hello world',
    # Contains U+00E9, which needs two bytes in UTF-8.
    u'Héllo world',
    # Contains U+1F63C, which lies outside the BMP and so needs a surrogate
    # pair in UTF-16.
    u'Hello 😼',
 ]

 def go():
    """Run test_roundtrip() on every entry of `cases`, in order."""
    for text in cases:
        test_roundtrip(text)

 # Run the round-trip checks only when executed as a script, not on import.
 if __name__ == '__main__':
    go()
	#include "csa.h"

	#include <iconv.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/*
	 * Convert a 0-terminated array of 16-bit values into a newly allocated,
	 * NUL-terminated UTF-8 string.
	 *
	 * Each element of 'shorts' is written out high byte first and the resulting
	 * byte stream is decoded as UTF-16LE. Every element therefore carries the
	 * first byte of a UTF-16LE code unit in its upper 8 bits and the second byte
	 * in its lower 8 bits.
	 *
	 * Returns a malloc()ed string that the caller must free(), or NULL when an
	 * allocation or the conversion fails (a message is printed with perror()).
	 */
	char *
	from_short_array(uint16_t *shorts)
	{
	  size_t i, n = 0;
	  char *utf_16_le = NULL, *utf_16_le_;
	  char *utf8 = NULL, *utf8_;
	  char *result = NULL;
	  iconv_t utf_16le_to_utf8 = (iconv_t) -1;
	  size_t inbytesleft, outbytesleft;

	  /* Count the elements that precede the 0 terminator. */
	  while (shorts[n] != 0)
	    n++;

	  utf_16_le = malloc(2 * (n + 1));
	  if (utf_16_le == NULL)
	    {
	      perror("malloc");
	      goto out;
	    }

	  /* Serialise every element, terminator included, high byte first. */
	  for (i = 0; i <= n; i++)
	    {
	      uint16_t ch = shorts[i];

	      utf_16_le[2 * i] = (char) (ch / 0x100);
	      utf_16_le[2 * i + 1] = (char) (ch % 0x100);
	    }

	  utf_16le_to_utf8 = iconv_open("UTF-8", "UTF-16LE");
	  if (utf_16le_to_utf8 == (iconv_t) -1)
	    {
	      perror("iconv_open");
	      goto out;
	    }

	  /* Non-surrogate characters in 'shorts' are at most U+FFFF which fits in at
	   * most 3 bytes in UTF-8. Surrogate pairs fit in at most 6 bytes of UTF-8,
	   * and are two elements of 'shorts'. So this is enough space to hold the full
	   * string, plus one byte for the terminating NUL.
	   */
	  utf8 = malloc(3 * n + 1);
	  if (utf8 == NULL)
	    {
	      perror("malloc");
	      goto out;
	    }
	  utf8_ = utf8;
	  outbytesleft = 3 * n;

	  /* The terminator is not converted; it is appended by hand below. */
	  utf_16_le_ = utf_16_le;
	  inbytesleft = 2 * n;

	  if (iconv(utf_16le_to_utf8, &utf_16_le_, &inbytesleft, &utf8_, &outbytesleft)
	      == (size_t) -1)
	    {
	      /* Report before any cleanup call gets a chance to clobber errno. */
	      perror("iconv");
	      goto out;
	    }

	  *utf8_ = '\0';
	  result = utf8;
	  utf8 = NULL; /* ownership moves to the caller */

	out:
	  if (utf_16le_to_utf8 != (iconv_t) -1)
	    iconv_close(utf_16le_to_utf8);
	  free(utf8);
	  free(utf_16_le);
	  return result;
	}
	#include <stdint.h>

	/* Convert the 0-terminated array 'shorts' to a malloc()ed, NUL-terminated
	 * UTF-8 string; returns NULL on failure. Both the return type and the
	 * parameter are pointers, matching the definition in csa.c.
	 */
	char *from_short_array(uint16_t *shorts);
	# Default goal: build the shared library.
	all: csa.so

	# Build csa.so as a position-independent shared object with warnings treated
	# as errors. $@ expands to the target and $< to the first prerequisite.
	csa.so: csa.c csa.h
	gcc -fPIC -shared -Wall -Werror -o $@ $<
	# vim: set fileencoding=utf-8
	import ctypes

	def to_uint16_array(string):
	    """Return `string` as a 0-terminated list of 16-bit integers.

	    The text is encoded as UTF-16LE and every pair of bytes is folded into
	    one integer with the FIRST byte in the high 8 bits, so each value is the
	    byte-swapped form of the corresponding UTF-16 code unit. A single 0 is
	    appended as terminator.
	    """
	    # bytearray yields integers on both Python 2 and Python 3; indexing the
	    # encoded string directly yields a 1-character str on Python 2 but an
	    # int on Python 3, where ord() would then raise TypeError.
	    utf_16 = bytearray(string.encode('utf_16_le'))
	    assert len(utf_16) % 2 == 0

	    uint16s = [
	        utf_16[i] * 0x100 + utf_16[i + 1]
	        for i in range(0, len(utf_16), 2)
	    ]
	    uint16s.append(0)
	    return uint16s

	# Load the shared library from the current working directory.
	_csa = ctypes.CDLL('./csa.so')
	# NOTE: with c_char_p as restype, ctypes copies the returned C string into a
	# Python byte string and discards the pointer, so the buffer allocated by
	# the C side is never freed.
	_csa.from_short_array.restype = ctypes.c_char_p
	# The C prototype takes a `uint16_t *`. ctypes accepts an array instance
	# such as `(ctypes.c_uint16 * n)(...)` wherever POINTER(c_uint16) is declared.
	_csa.from_short_array.argtypes = [ctypes.POINTER(ctypes.c_uint16)]
	#
	# One tempting option is to use ctypes.c_char_p, and pass in
	# string.encode('utf_16_le'), but this is wrong: it only adds a single byte of
	# NUL-terminator.

	def test_roundtrip(string):
	    """Check that `string` survives a trip through from_short_array().

	    The string is turned into a ctypes array of c_uint16, handed to the C
	    function, and the UTF-8 bytes that come back are decoded and compared
	    with the original text.
	    """
	    u16a = to_uint16_array(string)
	    arg = (ctypes.c_uint16 * len(u16a))(*u16a)
	    ret = _csa.from_short_array(arg)
	    ret_u = ret.decode('utf-8')
	    assert string == ret_u, (repr(string), repr(ret_u))

	# Strings fed through test_roundtrip() by go().
	cases = [
	# Pure ASCII.
	u'Hello world',
	# Contains U+00E9, which needs two bytes in UTF-8.
	u'Héllo world',
	# Contains U+1F63C, which lies outside the BMP and so needs a surrogate
	# pair in UTF-16.
	u'Hello 😼',
	]

	def go():
	    """Run test_roundtrip() on every entry of `cases`, in order."""
	    for case in cases:
	        test_roundtrip(case)

	# Run the round-trip checks only when executed as a script, not on import.
	if __name__ == '__main__':
	    go()