Last active
January 9, 2019 14:11
-
-
Save alphaKAI/17c0b7e16258f44e0ef9ba7b5e47d230 to your computer and use it in GitHub Desktop.
OCamlとF#でそれぞれeast_asian_widthを計算する (これを移植しました: http://www.shibu.jp/techmemo/zenhan.html)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#require "core_kernel";; | |
#require "camomile";; | |
(* | |
Requierments: | |
- Core_kernel | |
- Camomile | |
Install With: | |
opam install core_kernel camomile | |
*) | |
module Unicode = struct | |
open Core_kernel | |
let start_group = [ | |
161; 164; 167; 170; 173; 176; 182; 188; 198; 208; 215; 222; 230; 232; 236; 240; 242; 247; 252; 254; 257; 273; 275; | |
283; 294; 299; 305; 312; 319; 324; 328; 333; 338; 358; 363; 462; 464; 466; 468; 470; 472; 474; 476; 593; 609; 708; | |
711; 713; 717; 720; 728; 733; 735; 768; 913; 945; 963; 1025; 1040; 1105; 4352; 4515; 4602; 8208; 8211; 8216; 8220; | |
8224; 8228; 8240; 8242; 8245; 8251; 8254; 8308; 8319; 8321; 8364; 8451; 8453; 8457; 8467; 8470; 8481; 8486; 8491; | |
8531; 8539; 8544; 8560; 8585; 8632; 8658; 8660; 8679; 8704; 8706; 8711; 8715; 8719; 8721; 8725; 8730; 8733; 8739; | |
8741; 8743; 8750; 8756; 8764; 8776; 8780; 8786; 8800; 8804; 8810; 8814; 8834; 8838; 8853; 8857; 8869; 8895; 8978; | |
9001; 9312; 9451; 9552; 9600; 9618; 9632; 9635; 9650; 9654; 9660; 9664; 9670; 9675; 9678; 9698; 9711; 9733; 9737; | |
9742; 9748; 9756; 9758; 9792; 9794; 9824; 9827; 9831; 9836; 9839; 9886; 9918; 9924; 9935; 9955; 9960; 10045; 10071; | |
10102; 11093; 11904; 12353; 13312; 13312; 19894; 19968; 19968; 40908; 40960; 43360; 44032; 55216; 57344; 63744; 63744; | |
64046; 64110; 64218; 65024; 65072; 65281; 65504; 65533; 110592; 127232; 127280; 127488; 131072; 131072; 173783; 173824; | |
173824; 177973; 177984; 177984; 178206; 194560; 194560; 195102; 196608; 917760; 983040; 1048576] | |
let end_group = [ | |
161; 164; 168; 170; 174; 180; 186; 191; 198; 208; 216; 225; 230; 234; 237; 240; 243; 250; 252; 254; 257; 273; 275; 283; 295; | |
299; 307; 312; 322; 324; 331; 333; 339; 359; 363; 462; 464; 466; 468; 470; 472; 474; 476; 593; 609; 708; 711; 715; 717; 720; | |
731; 733; 735; 879; 937; 961; 969; 1025; 1103; 1105; 4447; 4519; 4607; 8208; 8214; 8217; 8221; 8226; 8231; 8240; 8243; 8245; | |
8251; 8254; 8308; 8319; 8324; 8364; 8451; 8453; 8457; 8467; 8470; 8482; 8486; 8491; 8532; 8542; 8555; 8569; 8601; 8633; 8658; | |
8660; 8679; 8704; 8707; 8712; 8715; 8719; 8721; 8725; 8730; 8736; 8739; 8741; 8748; 8750; 8759; 8765; 8776; 8780; 8786; 8801; | |
8807; 8811; 8815; 8835; 8839; 8853; 8857; 8869; 8895; 8978; 9002; 9449; 9547; 9587; 9615; 9621; 9633; 9641; 9651; 9655; 9661; | |
9665; 9672; 9675; 9681; 9701; 9711; 9734; 9737; 9743; 9749; 9756; 9758; 9792; 9794; 9825; 9829; 9834; 9837; 9839; 9887; 9919; | |
9933; 9953; 9955; 9983; 10045; 10071; 10111; 11097; 12350; 13311; 19893; 19903; 19903; 40907; 40959; 40959; 42182; 43388; 55203; | |
55291; 63743; 64047; 64111; 64217; 64255; 64255; 65049; 65131; 65376; 65510; 65533; 110593; 127277; 127386; 127569; 173782; | |
173791; 173823; 177972; 177983; 178205; 178207; 194367; 194559; 195101; 195103; 196605; 262141; 917999; 1048573; 1114109] | |
let kSurrogateBits = 10 | |
let kHighSurrogateMin = 0xD800 | |
let kHighSurrogateMax = 0xDBFF | |
let kHighSurrogateMask = (1 lsl kSurrogateBits) - 1 | |
let kLowSurrogateMin = 0xDC00 | |
let kLowSurrogateMax = 0xDFFF | |
let kLowSurrogateMask = (1 lsl kSurrogateBits) - 1 | |
let kSurrogateMin = kHighSurrogateMin | |
let kSurrogateMax = kLowSurrogateMax | |
let kSurrogateMask = (1 lsl (kSurrogateBits + 1)) - 1 | |
let kHighSurrogateOffset = kHighSurrogateMin - (0x10000 lsr 10) | |
let rec binary_range_search_impl heads tails value ~head ~tail = | |
if head > tail then | |
if tail < 0 || List.length tails - 1 < tail then | |
false | |
else | |
value <= List.nth_exn tails tail | |
else | |
let where = ((head + tail) |> float_of_int) /. 2. | |
|> Float.round_down |> int_of_float in | |
let v = List.nth_exn heads where in | |
if value = v then | |
true | |
else if value < v then | |
binary_range_search_impl heads tails value ~head ~tail:(where - 1) | |
else | |
binary_range_search_impl heads tails value ~head:(where + 1) ~tail | |
let binary_range_search heads tails value = | |
let head = 0 | |
and tail = List.length heads - 1 in | |
binary_range_search_impl heads tails value ~head ~tail | |
let isHighSurrogate uc = (uc land (lnot kHighSurrogateMask)) = kHighSurrogateMin | |
let isLowSurrogate uc = (uc land (lnot kLowSurrogateMask)) = kLowSurrogateMin | |
let isSurrogate uc = (uc land (lnot kSurrogateMask)) = kSurrogateMin | |
let decodeSurrogatePair high low = ((high land kHighSurrogateMask) lsl kSurrogateBits) + (low land kLowSurrogateMask) + 0x10000 | |
open CamomileLibrary | |
let (>>) f g x y = g (f x y) | |
let charCodeAt = UTF8.look >> UChar.code | |
exception UTF16_DECODE_ERROR | |
let rec east_asian_width ?(width=0) ?(i=0) text = | |
if i >= String.length text then | |
width | |
else | |
let proc_for_next ~code ~i = | |
if binary_range_search start_group end_group code then | |
east_asian_width ~width:(width + 2) ~i:(UTF8.next text i) text | |
else | |
east_asian_width ~width:(width + 1) ~i:(UTF8.next text i) text | |
in | |
let code = charCodeAt text i in | |
if isSurrogate code then | |
if not (isHighSurrogate code) then | |
raise UTF16_DECODE_ERROR | |
else | |
let i = UTF8.next text i in | |
let low_code = charCodeAt text i in | |
if not (isLowSurrogate low_code) then | |
raise UTF16_DECODE_ERROR | |
else | |
let code = decodeSurrogatePair code low_code in | |
proc_for_next ~code ~i | |
else | |
proc_for_next ~code ~i | |
end | |
let _ = | |
Printf.printf "east_asian_width of abc : %d\n" (Unicode.east_asian_width "abc"); | |
Printf.printf "east_asian_width of あいうえお : %d\n" (Unicode.east_asian_width "あいうえお"); | |
Printf.printf "east_asian_width of abcあいうえお : %d\n" (Unicode.east_asian_width "abcあいうえお"); | |
Printf.printf "east_asian_width of あaいうbえcお : %d\n" (Unicode.east_asian_width "あaいうbえcお") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Program | |
module Unicode = | |
let start_group = [ | |
161; 164; 167; 170; 173; 176; 182; 188; 198; 208; 215; 222; 230; 232; 236; 240; 242; 247; 252; 254; 257; 273; 275; | |
283; 294; 299; 305; 312; 319; 324; 328; 333; 338; 358; 363; 462; 464; 466; 468; 470; 472; 474; 476; 593; 609; 708; | |
711; 713; 717; 720; 728; 733; 735; 768; 913; 945; 963; 1025; 1040; 1105; 4352; 4515; 4602; 8208; 8211; 8216; 8220; | |
8224; 8228; 8240; 8242; 8245; 8251; 8254; 8308; 8319; 8321; 8364; 8451; 8453; 8457; 8467; 8470; 8481; 8486; 8491; | |
8531; 8539; 8544; 8560; 8585; 8632; 8658; 8660; 8679; 8704; 8706; 8711; 8715; 8719; 8721; 8725; 8730; 8733; 8739; | |
8741; 8743; 8750; 8756; 8764; 8776; 8780; 8786; 8800; 8804; 8810; 8814; 8834; 8838; 8853; 8857; 8869; 8895; 8978; | |
9001; 9312; 9451; 9552; 9600; 9618; 9632; 9635; 9650; 9654; 9660; 9664; 9670; 9675; 9678; 9698; 9711; 9733; 9737; | |
9742; 9748; 9756; 9758; 9792; 9794; 9824; 9827; 9831; 9836; 9839; 9886; 9918; 9924; 9935; 9955; 9960; 10045; 10071; | |
10102; 11093; 11904; 12353; 13312; 13312; 19894; 19968; 19968; 40908; 40960; 43360; 44032; 55216; 57344; 63744; 63744; | |
64046; 64110; 64218; 65024; 65072; 65281; 65504; 65533; 110592; 127232; 127280; 127488; 131072; 131072; 173783; 173824; | |
173824; 177973; 177984; 177984; 178206; 194560; 194560; 195102; 196608; 917760; 983040; 1048576] | |
let end_group = [ | |
161; 164; 168; 170; 174; 180; 186; 191; 198; 208; 216; 225; 230; 234; 237; 240; 243; 250; 252; 254; 257; 273; 275; 283; 295; | |
299; 307; 312; 322; 324; 331; 333; 339; 359; 363; 462; 464; 466; 468; 470; 472; 474; 476; 593; 609; 708; 711; 715; 717; 720; | |
731; 733; 735; 879; 937; 961; 969; 1025; 1103; 1105; 4447; 4519; 4607; 8208; 8214; 8217; 8221; 8226; 8231; 8240; 8243; 8245; | |
8251; 8254; 8308; 8319; 8324; 8364; 8451; 8453; 8457; 8467; 8470; 8482; 8486; 8491; 8532; 8542; 8555; 8569; 8601; 8633; 8658; | |
8660; 8679; 8704; 8707; 8712; 8715; 8719; 8721; 8725; 8730; 8736; 8739; 8741; 8748; 8750; 8759; 8765; 8776; 8780; 8786; 8801; | |
8807; 8811; 8815; 8835; 8839; 8853; 8857; 8869; 8895; 8978; 9002; 9449; 9547; 9587; 9615; 9621; 9633; 9641; 9651; 9655; 9661; | |
9665; 9672; 9675; 9681; 9701; 9711; 9734; 9737; 9743; 9749; 9756; 9758; 9792; 9794; 9825; 9829; 9834; 9837; 9839; 9887; 9919; | |
9933; 9953; 9955; 9983; 10045; 10071; 10111; 11097; 12350; 13311; 19893; 19903; 19903; 40907; 40959; 40959; 42182; 43388; 55203; | |
55291; 63743; 64047; 64111; 64217; 64255; 64255; 65049; 65131; 65376; 65510; 65533; 110593; 127277; 127386; 127569; 173782; | |
173791; 173823; 177972; 177983; 178205; 178207; 194367; 194559; 195101; 195103; 196605; 262141; 917999; 1048573; 1114109] | |
let kSurrogateBits = 10 | |
let kHighSurrogateMin = 0xD800 | |
let kHighSurrogateMax = 0xDBFF | |
let kHighSurrogateMask = (1 <<< kSurrogateBits) - 1 | |
let kLowSurrogateMin = 0xDC00 | |
let kLowSurrogateMax = 0xDFFF | |
let kLowSurrogateMask = (1 <<< kSurrogateBits) - 1 | |
let kSurrogateMin = kHighSurrogateMin | |
let kSurrogateMax = kLowSurrogateMax | |
let kSurrogateMask = (1 <<< (kSurrogateBits + 1)) - 1 | |
let kHighSurrogateOffset = kHighSurrogateMin - (0x10000 >>> 10) | |
let binary_range_search heads tails value = | |
let rec binary_range_search_impl heads tails value head tail = | |
if head > tail then | |
if tail < 0 || List.length tails - 1 < tail then | |
false | |
else | |
value <= List.item tail tails | |
else | |
let where = (float(head + tail)) / 2. |> floor |> int | |
let v = List.item where heads | |
if value = v then | |
true | |
else if value < v then | |
binary_range_search_impl heads tails value head (where - 1) | |
else | |
binary_range_search_impl heads tails value (where + 1) tail | |
let head = 0 | |
let tail = List.length heads - 1 | |
binary_range_search_impl heads tails value head tail | |
let isHighSurrogate uc = (uc &&& (~~~ kHighSurrogateMask)) = kHighSurrogateMin | |
let isLowSurrogate uc = (uc &&& (~~~ kLowSurrogateMask)) = kLowSurrogateMin | |
let isSurrogate uc = (uc &&& (~~~ kSurrogateMask)) = kSurrogateMin | |
let decodeSurrogatePair high low = ((high &&& kHighSurrogateMask) <<< kSurrogateBits) + (low &&& kLowSurrogateMask) + 0x10000 | |
let charCodeAt (str : string) i = int(str.[i]) | |
exception UTF16DecodeException | |
let east_asian_width text = | |
let rec east_asian_width_impl text width i = | |
if i >= String.length text then | |
width | |
else | |
let proc_for_next code i = | |
if binary_range_search start_group end_group code then | |
east_asian_width_impl text (width + 2) (i + 1) | |
else | |
east_asian_width_impl text (width + 1) (i + 1) | |
let code = charCodeAt text i | |
if isSurrogate code then | |
if not (isHighSurrogate code) then | |
raise UTF16DecodeException | |
else | |
let i = i + 1 | |
let low_code = charCodeAt text i | |
if not (isLowSurrogate low_code) then | |
raise UTF16DecodeException | |
else | |
let code = decodeSurrogatePair code low_code | |
proc_for_next code i | |
else | |
proc_for_next code i | |
east_asian_width_impl text 0 0 | |
printfn "east_asian_width of abc : %d" (Unicode.east_asian_width "abc") | |
printfn "east_asian_width of あいうえお : %d" (Unicode.east_asian_width "あいうえお") | |
printfn "east_asian_width of abcあいうえお : %d" (Unicode.east_asian_width "abcあいうえお") | |
printfn "east_asian_width of あaいうbえcお : %d" (Unicode.east_asian_width "あaいうbえcお") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment