meijeru · October 30, 2011 12:08 · meijeru · Dec 29, 2011 · dockimbel · Dec 29, 2011
diff --git a/unicode-multiple.reds b/unicode-multiple.reds
 Red/System []

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; function utf8-to-cps (cps = codepoints)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ; This function decodes UTF-8 information supplied as bytes in argument u
 ; and uses the argument res to store the array of codepoints
 ; (integers >= 0  and < 10FFFFh); the space for this array should have been
 ; allocated by the caller; an upper limit for the size of the array in bytes
 ; is 4 times the length of the UTF-8 string. The actual size used is returned.
 ; For the algorithm, see http://en.wikipedia.org/wiki/UTF-8.

 ; Remarks about the contents:
 ; (1) Coding errors are skipped until the first correct UTF-8 combination
 ;     is encountered. The first byte of the offending combination is
 ;     replaced by a standard byte according to one of the several options
 ;     described in the article cited above and/or taken from elsewhere.
 ; (2) Codepoint U+0000 cannot be represented as UTF-8 single byte 00h
 ;     since that signifies end-of-c-string for Red/System!
 ;     However, this code point can be represented by C0h 80h.
 ; (3) As it stands, the function accepts "overlong" combinations, such as
 ;     C0h 80h or E0h 80h 80h or even F0h 80h 80h 80h for U+0000.
 ;     Uncomment the appropriate lines to test for this.
 ; (4) In any case, it rejects 4-byte sequences coding for 110000h and higher
 ;     (in fact, up to 1FFFFFh), because codepoints beyond U+10FFFF
 ;     are not defined in the Unicode standard.
 ; (5) Also, it rejects sequences resulting in the invalid Unicode codepoints
 ;     U+DC00..U+DFFF, which are used in UTF-16 for high and low surrogate halves.
 ;     It may, however, use some of these as replacement in the output.

 ; Remarks about the coding:
 ; (1) Another choice of type for the input parameter could have been
 ;     pointer! [byte!], but this would have meant that the length of the
 ;     input must be paassed as an extra parameter. It would have done
 ;     away with the problem of the null byte, though.
 ; (2) The choice of type for the output parameter is almost forced upon us,
 ;     since Red/System does not have native arrays.
 ; (3) The shift and bitwise or operators are assumed to be more efficient
 ;     than division and addition. They are also more closely tied to the
 ;     UTF-8 specs, reducing the possibility of errors.
 ; (4) The cases of invalid byte sequences include:
 ;     - an invalid starting byte
 ;     - an unexpected continuation byte
 ;     - a start byte not followed by enough continuation bytes (incomplete sequence)
 ;     - a sequence that decodes to a value that should use a shorter sequence
 ;      (an "overlong form").
 ;     The first case is caught by the test: unless b1 < C0h
 ;     The second case is caught by the tests: b2 >= 80h b2 < C0h etc.
 ;     The third case is effectively caught by the same tests, since
 ;     b2 >= 80h etc. will fail if b2 = 0 etc.
 ;     Thus there is no need for the equivalent of the protective REBOL clause
 ;                       if not tail? next u
 ;     etc.
 ;     The last case is also explicitly caught if the appropriate lines are
 ;     uncommented.
 ; (5) With a restriction to the BMP (codepoints up to U+FFFF) the resulting
 ;     array of codepoints could have elements that are uint16!, which would
 ;     save half of the space for this array.

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;




 #define code-array! [pointer! [integer!]]

 #define replacement
 ;	choose one of the following options
 	FFFDh			; U+FFFD = replacement character
 ;	1Ah				; U+001A = control SUB (substitute)
 ;	241Ah			; U+241A = symbol for substitute
 ;	2426h			; U+2426 = symbol for substitute form two
 ;	3Fh				; U+003F = question mark
 ;	BFh				; U+00BF = inverted question mark
 ;	DC00h + b1		; U+DCxx where xx = b1 (never a Unicode codepoint)

 utf8-to-cps: func [
 	u [c-string!]
 	res [code-array!]
 	return: [integer!]
 	/local
 		b1 b2 b3 b4		; up to four bytes in a UTF-8 sequence
 						; for computing purposes they are of integer! type
 		cp				; computed codepoint
 		res0			; start of result
 ][
 	res0: res
 	while [b1: as-integer u/1 b1 <> 0][

 		; cycling through res is done at the end; likewise for u
 		; to account for this, as soon as a multiple byte sequence is consumed
 		; the pointer in u is moved one less than the number of bytes consumed

 		either b1 < 80h					; single byte (ASCII)
 		[
 			res/value: b1				; and we are done
 		][
 			res/value: replacement
 			; assume error by default - this simplifies code greatly
 			; res/value is now only set if a correct sequence has been decoded

 			unless b1 < C0h [			; 80h - BFh may not start a sequence
 				case  [
 					b1 < E0h [			; start of two-byte sequence
 						b2: as-integer u/2
 						if all [
 							b2 >= 80h b2 < C0h
 						][
 							cp:	(b1 - C0h << 6) or
 								(b2 - 80h)
 ;							if any [
 ;								cp > 7Fh	; optional test for overlong
 ;								cp = 0		; even so, must allow U+0000
 ;							][
 								res/value: cp
 								u: u + 1
 ;							]
 						]
 					]
 					b1 < F0h [		; start of three-byte sequence
 						b2: as-integer u/2
 						b3: as-integer u/3
 						if all [
 							b2 >= 80h b2 < C0h
 							b3 >= 80h b3 < C0h
 						][
 							cp:	(b1 - E0h << 12) or
 								(b2 - 80h <<  6) or
 								(b3 - 80h)
 							if all [
 								any [cp < DC00h cp > DCFFh]
 ;								cp > 7FFh	; optional test for overlong
 							][
 								res/value: cp
 								u: u + 2
 							]
 						]
 					]
 					b1 < F8h [	; start of four-byte sequence
 						b2: as-integer u/2
 						b3: as-integer u/3
 						b4: as-integer u/4
 						if all [
 							b2 >= 80h b2 < C0h
 							b3 >= 80h b3 < C0h
 							b4 >= 80h b4 < C0h
 						][
 							cp:	(b1 - F0h << 18) or
 								(b2 - 80h << 12) or
 								(b3 - 80h <<  6) or
 								(b4 - 80h)
 							if all [
 								cp <= 10FFFFh
 ;								cp > FFFFh	; optional test for overlong
 							][
 								res/value: cp
 								u: u + 3
 							]
 						]
 					]
 ;					true [
 ;						error case
 ;					]
 				]
 			]
 		]
 		res: res + 1
 		u: u + 1
 	]
 	res - res0
 ]
	Red/System []

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; function utf8-to-cps (cps = codepoints)
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	; This function decodes UTF-8 information supplied as bytes in argument u
	; and uses the argument res to store the array of codepoints
	; (integers >= 0 and < 10FFFFh); the space for this array should have been
	; allocated by the caller; an upper limit for the size of the array in bytes
	; is 4 times the length of the UTF-8 string. The actual size used is returned.
	; For the algorithm, see http://en.wikipedia.org/wiki/UTF-8.

	; Remarks about the contents:
	; (1) Coding errors are skipped until the first correct UTF-8 combination
	; is encountered. The first byte of the offending combination is
	; replaced by a standard byte according to one of the several options
	; described in the article cited above and/or taken from elsewhere.
	; (2) Codepoint U+0000 cannot be represented as UTF-8 single byte 00h
	; since that signifies end-of-c-string for Red/System!
	; However, this code point can be represented by C0h 80h.
	; (3) As it stands, the function accepts "overlong" combinations, such as
	; C0h 80h or E0h 80h 80h or even F0h 80h 80h 80h for U+0000.
	; Uncomment the appropriate lines to test for this.
	; (4) In any case, it rejects 4-byte sequences coding for 110000h and higher
	; (in fact, up to 1FFFFFh), because codepoints beyond U+10FFFF
	; are not defined in the Unicode standard.
	; (5) Also, it rejects sequences resulting in the invalid Unicode codepoints
	; U+DC00..U+DFFF, which are used in UTF-16 for high and low surrogate halves.
	; It may, however, use some of these as replacement in the output.

	; Remarks about the coding:
	; (1) Another choice of type for the input parameter could have been
	; pointer! [byte!], but this would have meant that the length of the
	; input must be paassed as an extra parameter. It would have done
	; away with the problem of the null byte, though.
	; (2) The choice of type for the output parameter is almost forced upon us,
	; since Red/System does not have native arrays.
	; (3) The shift and bitwise or operators are assumed to be more efficient
	; than division and addition. They are also more closely tied to the
	; UTF-8 specs, reducing the possibility of errors.
	; (4) The cases of invalid byte sequences include:
	; - an invalid starting byte
	; - an unexpected continuation byte
	; - a start byte not followed by enough continuation bytes (incomplete sequence)
	; - a sequence that decodes to a value that should use a shorter sequence
	; (an "overlong form").
	; The first case is caught by the test: unless b1 < C0h
	; The second case is caught by the tests: b2 >= 80h b2 < C0h etc.
	; The third case is effectively caught by the same tests, since
	; b2 >= 80h etc. will fail if b2 = 0 etc.
	; Thus there is no need for the equivalent of the protective REBOL clause
	; if not tail? next u
	; etc.
	; The last case is also explicitly caught if the appropriate lines are
	; uncommented.
	; (5) With a restriction to the BMP (codepoints up to U+FFFF) the resulting
	; array of codepoints could have elements that are uint16!, which would
	; save half of the space for this array.

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;




	#define code-array! [pointer! [integer!]]

	#define replacement
	; choose one of the following options
	FFFDh ; U+FFFD = replacement character
	; 1Ah ; U+001A = control SUB (substitute)
	; 241Ah ; U+241A = symbol for substitute
	; 2426h ; U+2426 = symbol for substitute form two
	; 3Fh ; U+003F = question mark
	; BFh ; U+00BF = inverted question mark
	; DC00h + b1 ; U+DCxx where xx = b1 (never a Unicode codepoint)

	utf8-to-cps: func [
	u [c-string!]
	res [code-array!]
	return: [integer!]
	/local
	b1 b2 b3 b4 ; up to four bytes in a UTF-8 sequence
	; for computing purposes they are of integer! type
	cp ; computed codepoint
	res0 ; start of result
	][
	res0: res
	while [b1: as-integer u/1 b1 <> 0][

	; cycling through res is done at the end; likewise for u
	; to account for this, as soon as a multiple byte sequence is consumed
	; the pointer in u is moved one less than the number of bytes consumed

	either b1 < 80h ; single byte (ASCII)
	[
	res/value: b1 ; and we are done
	][
	res/value: replacement
	; assume error by default - this simplifies code greatly
	; res/value is now only set if a correct sequence has been decoded

	unless b1 < C0h [ ; 80h - BFh may not start a sequence
	case [
	b1 < E0h [ ; start of two-byte sequence
	b2: as-integer u/2
	if all [
	b2 >= 80h b2 < C0h
	][
	cp: (b1 - C0h << 6) or
	(b2 - 80h)
	; if any [
	; cp > 7Fh ; optional test for overlong
	; cp = 0 ; even so, must allow U+0000
	; ][
	res/value: cp
	u: u + 1
	; ]
	]
	]
	b1 < F0h [ ; start of three-byte sequence
	b2: as-integer u/2
	b3: as-integer u/3
	if all [
	b2 >= 80h b2 < C0h
	b3 >= 80h b3 < C0h
	][
	cp: (b1 - E0h << 12) or
	(b2 - 80h << 6) or
	(b3 - 80h)
	if all [
	any [cp < DC00h cp > DCFFh]
	; cp > 7FFh ; optional test for overlong
	][
	res/value: cp
	u: u + 2
	]
	]
	]
	b1 < F8h [ ; start of four-byte sequence
	b2: as-integer u/2
	b3: as-integer u/3
	b4: as-integer u/4
	if all [
	b2 >= 80h b2 < C0h
	b3 >= 80h b3 < C0h
	b4 >= 80h b4 < C0h
	][
	cp: (b1 - F0h << 18) or
	(b2 - 80h << 12) or
	(b3 - 80h << 6) or
	(b4 - 80h)
	if all [
	cp <= 10FFFFh
	; cp > FFFFh ; optional test for overlong
	][
	res/value: cp
	u: u + 3
	]
	]
	]
	; true [
	; error case
	; ]
	]
	]
	]
	res: res + 1
	u: u + 1
	]
	res - res0
	]