Last active
September 2, 2022 23:46
-
-
Save dhilst/580b626f31ce664cfc0e8dcce957eb02 to your computer and use it in GitHub Desktop.
OCaml Unicode codepoints to UTF-8 string
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open Printf | |
let (%) = Int.logor | |
let (<<) = Int.shift_left | |
let (>>) = Int.shift_right | |
let (&) = Int.logand | |
let utf_8_bytes_of_unicode i = | |
if i <= 0x007F | |
then | |
let b = Bytes.create 1 in | |
Bytes.set_int8 b 0 i; | |
Ok b | |
else if i <= 0x07FF | |
then begin | |
let five_high_bits = (i >> 6) & 0b11111 in | |
let six_low_bits = (i & 0b111111) in | |
let high = (0b11000000 % five_high_bits) << 8 in | |
let low = (0b10000000 % six_low_bits) in | |
let n = high % low in | |
let b = Bytes.create 2 in | |
Bytes.set_int16_be b 0 n; | |
Ok b | |
end | |
else if i <= 0xFFFF | |
then begin | |
let four_high_bits = (i >> 12) & 0b1111 in | |
let six_mid_bits = (i >> 6) & 0b111111 in | |
let six_low_bits = i & 0b111111 in | |
let high = (0b11100000 % four_high_bits) << 16 in | |
let mid = (0b10000000 % six_mid_bits) << 8 in | |
let low = (0b10000000 % six_low_bits) in | |
let n = high % mid % low in | |
let b = Bytes.create 3 in | |
Bytes.set_int32_be b 0 (Int32.of_int n); | |
Ok b | |
end | |
else if i <= 0x10FFFF | |
then | |
let three_hh_bits = (i >> 18) & 0b111 in | |
let six_hl_bits = (i >> 12) & 0b111111 in | |
let six_lh_bits = (i >> 6) & 0b111111 in | |
let six_ll_bits = i & 0b111111 in | |
let hh = (0b11110000 % three_hh_bits) << 24 in | |
let hl = (0b10000000 % six_hl_bits) << 16 in | |
let lh = (0b10000000 % six_lh_bits) << 8 in | |
let ll = (0b10000000 % six_ll_bits) in | |
let n = hh % hl % lh % ll in | |
let b = Bytes.create 4 in | |
Bytes.set_int32_be b 0 (Int32.of_int n); | |
Ok b | |
else Error (Printf.sprintf "invalid code point %X" i) | |
let result_of_opt e = function | |
| Some x -> Ok x | |
| None -> Error e | |
let (let*) = Result.bind | |
(* [escape_string_continue str idx b bidx] expects to be called when | |
[str.[idx] = '\\'], it: | |
* Extract the [escape_chars] from the string, for ["\\u03bb"] this | |
is 03bb | |
* Convert it to int and encode it as utf8 bytes | |
* Write these utf8 bytes to the [b] | |
* Search for the next '\\' | |
* if not found return the [b] as string | |
* if found, recurse | |
*) | |
let rec escape_string_continue str idx b bidx = | |
let idx = idx + 1 in | |
let* escape_chars_len = (match str.[idx] with | |
| 'u' -> Ok 4 | |
| 'x' -> Ok 2 | |
| _ -> Error "invalid escape sequence") in | |
let escape_chars = String.sub str (idx + 1) escape_chars_len in | |
let* as_int = Printf.sprintf "0x%s" escape_chars | |
|> int_of_string_opt | |
|> result_of_opt "invalid escape chars" in | |
let* utf8_bytes = utf_8_bytes_of_unicode as_int in | |
let utf8_len = Bytes.length utf8_bytes in | |
Bytes.blit utf8_bytes 0 b bidx utf8_len; | |
let bidx = bidx + utf8_len in | |
let idx = idx + escape_chars_len + 1 in | |
let bar_idx = String.index_from_opt str idx '\\' in | |
match bar_idx with | |
| None -> ( | |
let l = String.(length str - idx) in | |
Bytes.blit_string str idx b bidx l; | |
Ok (Bytes.sub_string b 0 (bidx + l))) | |
| Some bar_idx -> | |
let len = bar_idx - idx in | |
Bytes.blit_string str idx b bidx len; | |
escape_string_continue str bar_idx b (bidx + len) | |
let escape_string str = | |
let idx = String.index_opt str '\\' in | |
match idx with | |
| None -> Ok str | |
| Some idx -> begin | |
let len = String.length str in | |
let b = Bytes.create len in | |
Bytes.blit_string str 0 b 0 idx; | |
escape_string_continue str idx b idx | |
end | |
let () = | |
let unwrap = function | Ok x -> x | Error e -> failwith e in | |
assert (escape_string "Hello \\u03bb \\x77\\x6F\\x72\\x6C\\x64" |> unwrap = "Hello λ world") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment