(* unicode.ml
 *
 * GitHub Gist by @takahisa (takahisa/82bf4c22ed0551312687d32622adc434),
 * created July 25, 2017 11:32.
 *)
(*
* Copyright (c) 2017 Takahisa Watanabe <[email protected]> All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*)
module Uchar = struct
include UCoreLib.UChar
(** Width value that [width] returns for a character that is not classified
    as fullwidth. *)
let halfwidth = 1
(** Width value that [width] returns for a character found in
    [unicode_block_fullwidth], or in [unicode_block_ambiguous] while
    [is_east_asia] is set. *)
let fullwidth = 2
(* uniset -32 eaw:W + eaw:F *)
(** Table of [(first, last)] integer pairs; [search] treats each pair as an
    inclusive range of code points, and [width] reports [fullwidth] for any
    code point that falls inside one of them.

    Per the generator note above, the data was presumably produced with the
    [uniset] tool from the East_Asian_Width classes W (Wide) and F (Fullwidth)
    -- TODO confirm which Unicode version was used. Entries are listed in
    ascending order. *)
let unicode_block_fullwidth = [|
(4352, 4447);
(8986, 8987);
(9001, 9002);
(9193, 9196);
(9200, 9200);
(9203, 9203);
(9725, 9726);
(9748, 9749);
(9800, 9811);
(9855, 9855);
(9875, 9875);
(9889, 9889);
(9898, 9899);
(9917, 9918);
(9924, 9925);
(9934, 9934);
(9940, 9940);
(9962, 9962);
(9970, 9971);
(9973, 9973);
(9978, 9978);
(9981, 9981);
(9989, 9989);
(9994, 9995);
(10024, 10024);
(10060, 10060);
(10062, 10062);
(10067, 10069);
(10071, 10071);
(10133, 10135);
(10160, 10160);
(10175, 10175);
(11035, 11036);
(11088, 11088);
(11093, 11093);
(11904, 11929);
(11931, 12019);
(12032, 12245);
(12272, 12283);
(12288, 12350);
(12353, 12438);
(12441, 12543);
(12549, 12590);
(12593, 12686);
(12688, 12730);
(12736, 12771);
(12784, 12830);
(12832, 12871);
(12880, 13054);
(13056, 19903);
(19968, 42124);
(42128, 42182);
(43360, 43388);
(44032, 55203);
(63744, 64255);
(65040, 65049);
(65072, 65106);
(65108, 65126);
(65128, 65131);
(65281, 65376);
(65504, 65510);
(94176, 94177);
(94208, 100332);
(100352, 101106);
(110592, 110878);
(110960, 111355);
(126980, 126980);
(127183, 127183);
(127374, 127374);
(127377, 127386);
(127488, 127490);
(127504, 127547);
(127552, 127560);
(127568, 127569);
(127584, 127589);
(127744, 127776);
(127789, 127797);
(127799, 127868);
(127870, 127891);
(127904, 127946);
(127951, 127955);
(127968, 127984);
(127988, 127988);
(127992, 128062);
(128064, 128064);
(128066, 128252);
(128255, 128317);
(128331, 128334);
(128336, 128359);
(128378, 128378);
(128405, 128406);
(128420, 128420);
(128507, 128591);
(128640, 128709);
(128716, 128716);
(128720, 128722);
(128747, 128748);
(128756, 128760);
(129296, 129342);
(129344, 129356);
(129360, 129387);
(129408, 129431);
(129472, 129472);
(129488, 129510);
(131072, 196605);
(196608, 262141)
|];;
(* uniset -32 eaw:A *)
(** Table of [(first, last)] integer pairs; [search] treats each pair as an
    inclusive range of code points. [width] consults this table only while
    [is_east_asia] is set, and then reports [fullwidth] for any code point
    that falls inside one of the ranges.

    Per the generator note above, the data was presumably produced with the
    [uniset] tool from the East_Asian_Width class A (Ambiguous) -- TODO
    confirm which Unicode version was used. Entries are listed in ascending
    order. *)
let unicode_block_ambiguous = [|
(161, 161);
(164, 164);
(167, 168);
(170, 170);
(173, 174);
(176, 180);
(182, 186);
(188, 191);
(198, 198);
(208, 208);
(215, 216);
(222, 225);
(230, 230);
(232, 234);
(236, 237);
(240, 240);
(242, 243);
(247, 250);
(252, 252);
(254, 254);
(257, 257);
(273, 273);
(275, 275);
(283, 283);
(294, 295);
(299, 299);
(305, 307);
(312, 312);
(319, 322);
(324, 324);
(328, 331);
(333, 333);
(338, 339);
(358, 359);
(363, 363);
(462, 462);
(464, 464);
(466, 466);
(468, 468);
(470, 470);
(472, 472);
(474, 474);
(476, 476);
(593, 593);
(609, 609);
(708, 708);
(711, 711);
(713, 715);
(717, 717);
(720, 720);
(728, 731);
(733, 733);
(735, 735);
(768, 879);
(913, 929);
(931, 937);
(945, 961);
(963, 969);
(1025, 1025);
(1040, 1103);
(1105, 1105);
(8208, 8208);
(8211, 8214);
(8216, 8217);
(8220, 8221);
(8224, 8226);
(8228, 8231);
(8240, 8240);
(8242, 8243);
(8245, 8245);
(8251, 8251);
(8254, 8254);
(8308, 8308);
(8319, 8319);
(8321, 8324);
(8364, 8364);
(8451, 8451);
(8453, 8453);
(8457, 8457);
(8467, 8467);
(8470, 8470);
(8481, 8482);
(8486, 8486);
(8491, 8491);
(8531, 8532);
(8539, 8542);
(8544, 8555);
(8560, 8569);
(8585, 8585);
(8592, 8601);
(8632, 8633);
(8658, 8658);
(8660, 8660);
(8679, 8679);
(8704, 8704);
(8706, 8707);
(8711, 8712);
(8715, 8715);
(8719, 8719);
(8721, 8721);
(8725, 8725);
(8730, 8730);
(8733, 8736);
(8739, 8739);
(8741, 8741);
(8743, 8748);
(8750, 8750);
(8756, 8759);
(8764, 8765);
(8776, 8776);
(8780, 8780);
(8786, 8786);
(8800, 8801);
(8804, 8807);
(8810, 8811);
(8814, 8815);
(8834, 8835);
(8838, 8839);
(8853, 8853);
(8857, 8857);
(8869, 8869);
(8895, 8895);
(8978, 8978);
(9312, 9449);
(9451, 9547);
(9552, 9587);
(9600, 9615);
(9618, 9621);
(9632, 9633);
(9635, 9641);
(9650, 9651);
(9654, 9655);
(9660, 9661);
(9664, 9665);
(9670, 9672);
(9675, 9675);
(9678, 9681);
(9698, 9701);
(9711, 9711);
(9733, 9734);
(9737, 9737);
(9742, 9743);
(9756, 9756);
(9758, 9758);
(9792, 9792);
(9794, 9794);
(9824, 9825);
(9827, 9829);
(9831, 9834);
(9836, 9837);
(9839, 9839);
(9886, 9887);
(9919, 9919);
(9926, 9933);
(9935, 9939);
(9941, 9953);
(9955, 9955);
(9960, 9961);
(9963, 9969);
(9972, 9972);
(9974, 9977);
(9979, 9980);
(9982, 9983);
(10045, 10045);
(10102, 10111);
(11094, 11097);
(12872, 12879);
(57344, 63743);
(65024, 65039);
(65533, 65533);
(127232, 127242);
(127248, 127277);
(127280, 127337);
(127344, 127373);
(127375, 127376);
(127387, 127404);
(917760, 917999);
(983040, 1048573);
(1048576, 1114109)
|];;
(** [search c block] is [true] when at least one pair [(lo, hi)] stored in
    [block] satisfies [lo <= c && c <= hi], and [false] otherwise (including
    when [block] is empty). Pairs are examined from index 0 upwards and the
    scan stops at the first match. *)
let search c block =
  let contains (lo, hi) = lo <= c && c <= hi in
  Array.exists contains block
(** Mutable switch read by [width]: while it holds [true] (the initial
    value), code points listed in [unicode_block_ambiguous] are reported as
    fullwidth; set it to [false] to have them reported as halfwidth. *)
let is_east_asia : bool ref = ref true
(** [width u] is [fullwidth] when the code point of [u] lies in
    [unicode_block_fullwidth], or when [is_east_asia] is currently set and
    the code point lies in [unicode_block_ambiguous]; otherwise it is
    [halfwidth]. *)
let width u =
  let listed_in = search (code u) in
  (* The ambiguous table is only consulted when the first lookup fails and
     the East Asian switch is on. *)
  match
    listed_in unicode_block_fullwidth
    || (!is_east_asia && listed_in unicode_block_ambiguous)
  with
  | true -> fullwidth
  | false -> halfwidth
(** [is_halfwidth u] is [true] exactly when [width u] equals [halfwidth]. *)
let is_halfwidth u = u |> width |> ( = ) halfwidth
(** [is_fullwidth u] is [true] exactly when [width u] equals [fullwidth]. *)
let is_fullwidth u = u |> width |> ( = ) fullwidth
end
(** [UCoreLib.Text] extended with a width computation built on
    [Uchar.width]. *)
module Utext = struct
  include UCoreLib.Text

  (** [width s] is the sum of [Uchar.width] over every character visited by
      walking [s] with [first] / [next].

      NOTE(review): [first s] is called unconditionally, so the result for an
      empty text depends on how [UCoreLib.Text.first] and [value] behave in
      that case -- confirm against the library. *)
  let width s =
    let rec sum total cursor =
      match cursor with
      | None -> total
      | Some it -> sum (total + Uchar.width (value it)) (next it)
    in
    sum 0 (Some (first s))
end
(* GitHub page footer: Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment. *)