|
############################################################################ |
|
# |
|
# File: utf16letoutf8.icn |
|
# |
|
# Subject: An Icon tool to convert UTF-16LE to UTF-8 |
|
# |
|
# Author: Arthur Eschenlauer (https://orcid.org/0000-0002-2882-0508) |
|
# |
|
# Date: 5 May, 2020 |
|
# |
|
# URL: https://gist.github.com/eschen42/ed5d727e21a42a9b675e38186018fa47 |
|
# |
|
############################################################################ |
|
# |
|
# This program provides a native Icon implementation of a UTF-16LE to UTF-8 |
|
# converter. This provides an alternative to calling out to iconv, which |
|
# may be available on some platforms (Unix) but not others (Windows). |
|
# |
|
# It provides two procedures that may be adapted to taste: |
|
# |
|
# codepoint2utf8(codepoint) - compute utf8 from Unicode codepoint |
|
# |
|
# utf16le2utf8(Lproducer) - compute utf8 from a list of producer |
|
# co-expressions that produce strings that are |
|
# in fact even-numbered byte lengths of sequential |
|
# substrings of a little-endian UTF-16 string. |
|
# |
|
# main(args) - demonstrate "programmer-defined control structures" style |
|
# invocation on a built-in sample UTF-16LE string or, if args are supplied |

# naming input files, on the contents of those UTF-16LE files |
|
# |
|
# This code was adapted from the descriptions and examples at: |
|
# - https://en.wikipedia.org/wiki/UTF-16#Examples |
|
# - UTF-16LE to Unicode codepoint |
|
# - https://stackoverflow.com/a/42013433 |
|
# - Unicode codepoint to UTF-8 |
|
# |
|
# Credit for all flaws belongs to the author of this file. |
|
# |
|
############################################################################ |
|
# |
|
# Requires: co-expressions |
|
# |
|
############################################################################ |
|
# |
|
# Links: printf (for tracing only) |
|
# |
|
############################################################################ |
|
# |
|
# This file is in the public domain. Art Eschenlauer has waived all |
|
# copyright and related or neighboring rights to: |
|
# utf16letoutf8.icn - An Icon tool to convert UTF-16LE to UTF-8 |
|
# For details, see: |
|
# https://creativecommons.org/publicdomain/zero/1.0/ |
|
# |
|
# If you require a specific license and public domain status is not |
|
# sufficient for your needs, please substitute the MIT license, bearing |
|
# in mind that the copyright "claim" is solely to meet your requirements |
|
# and does not imply any restriction on use or copying by the author: |
|
# |
|
# Copyright (c) 2020, Arthur Eschenlauer |
|
# |
|
# Permission is hereby granted, free of charge, to any person obtaining |
|
# a copy of this software and associated documentation files (the |
|
# "Software"), to deal in the Software without restriction, including |
|
# without limitation the rights to use, copy, modify, merge, publish, |
|
# distribute, sublicense, and/or sell copies of the Software, and to |
|
# permit persons to whom the Software is furnished to do so, subject |
|
# to the following conditions: |
|
# |
|
# The above copyright notice and this permission notice shall be |
|
# included in all copies or substantial portions of the Software. |
|
# |
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
|
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
|
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
|
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
|
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
|
# SOFTWARE. |
|
# |
|
############################################################################ |
|
|
|
# ref: https://en.wikipedia.org/wiki/UTF-16#Examples |
|
# - UTF-16LE to Unicode codepoint |
|
# ref: https://stackoverflow.com/a/42013433 |
|
# - Unicode codepoint to UTF-8 |
|
|
|
link printf # for tracing only |
|
|
|
# Tracing switch shared by all procedures in this file: when non-null,
# diagnostic messages are written to &errout.  It is assigned in main().
global our_trace
|
|
|
# codepoint2utf8(codepoint) - encode one Unicode code point as UTF-8.
#
#   codepoint : integer code point
#
#   Returns a string of one to four bytes holding the UTF-8 encoding.
#   Fails (produces no result) when the value cannot be encoded as UTF-8:
#   negative values, values above 16r10FFFF, and the surrogate range
#   16rD800-16rDFFF, which UTF-8 does not permit.
#
#   When the global our_trace is non-null, progress messages are written
#   to &errout.
#
#   Bit layout adapted from https://stackoverflow.com/a/42013433
procedure codepoint2utf8(
  codepoint   # integer Unicode code point
  )
  local my_trace

  my_trace := our_trace     # &null suppresses tracing
  # "\my_trace & &errout" fails when tracing is off, so the call is skipped
  fprintf(\my_trace & &errout, "codepoint2utf8: codepoint 16r%08x\n", codepoint)

  # Reject anything that is not a Unicode scalar value.  Without this guard
  # a negative value would reach char() and raise a run-time error, and a
  # surrogate would be emitted as an ill-formed three-byte sequence.
  if (codepoint < 0) | (codepoint > 16r10FFFF) |
     (16rD800 <= codepoint <= 16rDFFF) then {
    write(\my_trace & &errout, "codepoint2utf8: not a Unicode scalar value")
    fail
  }

  # U+0000..U+007F : 0xxxxxxx
  if codepoint <= 16r7F then
    return char(codepoint)
  write(\my_trace & &errout, "codepoint2utf8: not one-byte\n")

  # U+0080..U+07FF : 110xxxxx 10xxxxxx
  if codepoint <= 16r7FF then return (
    char(ior(16rC0, ishift(codepoint, -6))) ||
    char(ior(16r80, iand(codepoint, 16r3F)))
    )
  write(\my_trace & &errout, "codepoint2utf8: not two-bytes\n")

  # U+0800..U+FFFF : 1110xxxx 10xxxxxx 10xxxxxx
  if codepoint <= 16rFFFF then return (
    char(ior(16rE0, ishift(codepoint, -12))) ||
    char(ior(16r80, iand(ishift(codepoint, -6), 16r3F))) ||
    char(ior(16r80, iand(codepoint, 16r3F)))
    )
  write(\my_trace & &errout, "codepoint2utf8: not three-bytes\n")

  # U+10000..U+10FFFF : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  return (
    char(ior(16rF0, ishift(codepoint, -18))) ||
    char(ior(16r80, iand(ishift(codepoint, -12), 16r3F))) ||
    char(ior(16r80, iand(ishift(codepoint, -6), 16r3F))) ||
    char(ior(16r80, iand(codepoint, 16r3F)))
    )
end
|
|
|
# utf16le2utf8(Lproducer) - generate the UTF-8 encoding of UTF-16LE input.
#
#   Lproducer : list of producer co-expressions; each activation of a
#               producer yields the next substring ("chunk") of a
#               little-endian UTF-16 byte string.
#
#   Generates (suspends) one UTF-8 string per decoded code point, in input
#   order, using codepoint2utf8().
#
#   Behavior worth knowing:
#   - If the first 16-bit word of a producer is the byte-order mark 16rFEFF
#     it is skipped.  If it is 16rFFFE (big-endian input) the remainder of
#     that producer is abandoned, because this code only decodes
#     little-endian data.
#   - A lead surrogate (16rD800-16rDBFF) is held until the next word
#     arrives, even when that word is in a later chunk, so chunk boundaries
#     may fall inside a surrogate pair.
#   - A surrogate without its partner (a lead not followed by a trail, a
#     trail with no preceding lead, or a lead at the very end of the input)
#     is replaced by U+FFFD REPLACEMENT CHARACTER.
#   - Chunks need not have even length: an odd trailing byte is carried
#     over and prepended to the producer's next chunk.  A dangling byte at
#     the end of a producer is dropped.
#
#   When the global our_trace is non-null, progress messages are written
#   to &errout.
#
#   Surrogate arithmetic adapted from
#   https://en.wikipedia.org/wiki/UTF-16#Examples
#
#   N.B.: ishift is left-shift for positive values of arg 2
#         and right-shift for negative values of arg 2
procedure utf16le2utf8(
  Lproducer   # list of producer co-expressions yielding UTF-16LE chunks
  )
  local producer, line, carry, at_start, big_endian
  local utf16lo, utf16hi, utf16, hiword, codepoint, utf8, my_trace

  my_trace := our_trace     # &null suppresses tracing
  hiword := 0               # pending lead surrogate; 0 means none

  every producer := !Lproducer do {
    at_start := 1           # non-null until this producer's first word is seen
    big_endian := &null
    carry := ""             # odd byte left over from the previous chunk

    while line := carry || @producer do {
      line ? {
        # Read little-endian 16-bit words while two more bytes are available.
        # When only one byte remains, the second move(1) fails and the first
        # is undone, leaving &pos in front of that byte.
        while (utf16lo := ord(move(1)), utf16hi := ord(move(1))) do {
          utf16 := ishift(utf16hi, 8) + utf16lo

          # Byte-order mark handling applies to the first word only.
          if \at_start then {
            at_start := &null
            if utf16 = 16rFEFF then next
            if utf16 = 16rFFFE then {
              big_endian := 1
              break
            }
          }

          fprintf(\my_trace & &errout, "utf16 %4x\n", utf16)
          codepoint := &null

          if hiword > 0 then {
            if 16rDC00 <= utf16 <= 16rDFFF then {
              # Lead + trail surrogate: combine into a supplementary
              # code point (the four-byte UTF-8 range).
              codepoint := 16r10000 + ishift(hiword - 16rD800, 10) +
                           (utf16 - 16rDC00)
              hiword := 0
            }
            else {
              # The pending lead has no partner; replace it and then
              # treat the current word as a fresh one below.
              hiword := 0
              suspend codepoint2utf8(16rFFFD)
            }
          }

          if /codepoint then {
            if 16rD800 <= utf16 <= 16rDBFF then {
              # Save the lead surrogate and go get the trail surrogate.
              hiword := utf16
              fprintf(\my_trace & &errout, "hiword %4x\n", hiword)
              next
            }
            # A trail surrogate with no lead is replaced; any other
            # word is itself the code point.
            codepoint := if 16rDC00 <= utf16 <= 16rDFFF then 16rFFFD else utf16
          }

          fprintf(\my_trace & &errout, "codepoint 16r%08x\n", codepoint)
          if utf8 := codepoint2utf8(codepoint) then {
            write(\my_trace & &errout, "image of utf8 is ", image(utf8))
            suspend utf8
          }
        }
        # Zero or one unread byte remains; keep it for the next chunk.
        carry := tab(0)
      }
      # Stop consuming this producer once it is known to be big-endian.
      if \big_endian then break
    }
  }

  # A lead surrogate still pending at end of input never got its partner.
  if hiword > 0 then suspend codepoint2utf8(16rFFFD)
end
|
|
|
# main(args) - demonstrate utf16le2utf8.
#
#   With no arguments: converts a built-in UTF-16LE sample string using the
#   "programmer-defined control structure" call form utf16le2utf8{...} and
#   writes the UTF-8 result (followed by a newline) to the file "foo.txt".
#
#   With arguments: each argument names a UTF-16LE file; its UTF-8
#   conversion is written to standard output.
#
#   The program stops with a message on &errout if a file cannot be opened.
procedure main(args)
  local line, fname, f, foo, chunk, result, bom

  our_trace := &null    # set to non-&null to turn on tracing statements
  bom := char(16rFF) || char(16rFE)     # UTF-16LE byte-order mark

  if *args = 0 then {
    line := bom ||
      char(16r01)||char(16rD8)||char(16r37)||char(16rDC)||  # Unicode codepoint 10437
      char(16r52)||char(16rD8)||char(16r62)||char(16rDF)||  # Unicode codepoint 24b62
      char(16r32)||char(16r00)                              # Unicode codepoint 32
    result := ""
    # The author observed that accumulating the result by repeated
    # concatenation is notably slow; it is kept here for demonstration.
    every chunk := utf16le2utf8{line} do
      result ||:= chunk
    foo := open("foo.txt", "wu") | stop("cannot open foo.txt for writing")
    write(foo, result)
    close(foo)
  }
  else {
    every fname := !args do {
      f := open(fname, "ru") | stop("cannot open ", fname, " for reading")
      # The producer co-expression hands the file to utf16le2utf8 in
      # chunks by transmitting each chunk to its activator (&source).
      # utf16le2utf8 keeps a pending lead surrogate between chunks, so a
      # chunk boundary may fall between the two 16-bit words of a
      # surrogate pair (a code point needing more than 16 bits).
      every writes(
        utf16le2utf8([
          create {
            # Skip the byte-order mark when found.  An empty file makes
            # reads() fail, in which case there is nothing to compare.
            if chunk := reads(f, 2) then {
              if chunk ~== bom then chunk @ &source
            }
            # Produce chunks of the rest of the file
            #  - Probably reading 4096 bytes would be more appropriate for
            #    purposes other than demonstration.
            while chunk := reads(f, 512) do chunk @ &source
            close(f)
            write(\our_trace & &errout, "argument to utf16le2utf8 complete")
            # Fail so that the next activation of this producer fails.
            &fail
          }
        ])
      )
    }
  }
end
|
# vim: sw=2 ts=2 et ai nu ru syntax=icon : |