Skip to content

Instantly share code, notes, and snippets.

@dhilst
Last active September 2, 2022 23:46
Show Gist options
  • Save dhilst/580b626f31ce664cfc0e8dcce957eb02 to your computer and use it in GitHub Desktop.
Save dhilst/580b626f31ce664cfc0e8dcce957eb02 to your computer and use it in GitHub Desktop.
OCaml Unicode codepoints to UTF-8 string
open Printf
let (%) = Int.logor
let (<<) = Int.shift_left
let (>>) = Int.shift_right
let (&) = Int.logand
let utf_8_bytes_of_unicode i =
if i <= 0x007F
then
let b = Bytes.create 1 in
Bytes.set_int8 b 0 i;
Ok b
else if i <= 0x07FF
then begin
let five_high_bits = (i >> 6) & 0b11111 in
let six_low_bits = (i & 0b111111) in
let high = (0b11000000 % five_high_bits) << 8 in
let low = (0b10000000 % six_low_bits) in
let n = high % low in
let b = Bytes.create 2 in
Bytes.set_int16_be b 0 n;
Ok b
end
else if i <= 0xFFFF
then begin
let four_high_bits = (i >> 12) & 0b1111 in
let six_mid_bits = (i >> 6) & 0b111111 in
let six_low_bits = i & 0b111111 in
let high = (0b11100000 % four_high_bits) << 16 in
let mid = (0b10000000 % six_mid_bits) << 8 in
let low = (0b10000000 % six_low_bits) in
let n = high % mid % low in
let b = Bytes.create 3 in
Bytes.set_int32_be b 0 (Int32.of_int n);
Ok b
end
else if i <= 0x10FFFF
then
let three_hh_bits = (i >> 18) & 0b111 in
let six_hl_bits = (i >> 12) & 0b111111 in
let six_lh_bits = (i >> 6) & 0b111111 in
let six_ll_bits = i & 0b111111 in
let hh = (0b11110000 % three_hh_bits) << 24 in
let hl = (0b10000000 % six_hl_bits) << 16 in
let lh = (0b10000000 % six_lh_bits) << 8 in
let ll = (0b10000000 % six_ll_bits) in
let n = hh % hl % lh % ll in
let b = Bytes.create 4 in
Bytes.set_int32_be b 0 (Int32.of_int n);
Ok b
else Error (Printf.sprintf "invalid code point %X" i)
let result_of_opt e = function
| Some x -> Ok x
| None -> Error e
let (let*) = Result.bind
(* [escape_string_continue str idx b bidx] expects to be called when
[str.[idx] = '\\'], it:
* Extract the [escape_chars] from the string, for ["\\u03bb"] this
is 03bb
* Convert it to int and encode it as utf8 bytes
* Write these utf8 bytes to the [b]
* Search for the next '\\'
* if not found return the [b] as string
* if found, recurse
*)
let rec escape_string_continue str idx b bidx =
let idx = idx + 1 in
let* escape_chars_len = (match str.[idx] with
| 'u' -> Ok 4
| 'x' -> Ok 2
| _ -> Error "invalid escape sequence") in
let escape_chars = String.sub str (idx + 1) escape_chars_len in
let* as_int = Printf.sprintf "0x%s" escape_chars
|> int_of_string_opt
|> result_of_opt "invalid escape chars" in
let* utf8_bytes = utf_8_bytes_of_unicode as_int in
let utf8_len = Bytes.length utf8_bytes in
Bytes.blit utf8_bytes 0 b bidx utf8_len;
let bidx = bidx + utf8_len in
let idx = idx + escape_chars_len + 1 in
let bar_idx = String.index_from_opt str idx '\\' in
match bar_idx with
| None -> (
let l = String.(length str - idx) in
Bytes.blit_string str idx b bidx l;
Ok (Bytes.sub_string b 0 (bidx + l)))
| Some bar_idx ->
let len = bar_idx - idx in
Bytes.blit_string str idx b bidx len;
escape_string_continue str bar_idx b (bidx + len)
let escape_string str =
let idx = String.index_opt str '\\' in
match idx with
| None -> Ok str
| Some idx -> begin
let len = String.length str in
let b = Bytes.create len in
Bytes.blit_string str 0 b 0 idx;
escape_string_continue str idx b idx
end
let () =
let unwrap = function | Ok x -> x | Error e -> failwith e in
assert (escape_string "Hello \\u03bb \\x77\\x6F\\x72\\x6C\\x64" |> unwrap = "Hello λ world")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment