Created
September 23, 2011 02:10
-
-
Save zeen/1236597 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function bin(x) | |
local s,m={} | |
for i=0,7 do | |
m=x%2;x=(x-m)/2; | |
s[8-i]=m; | |
end | |
return table.concat(s); | |
end | |
function hex(x) return ("0x%02X"):format(x); end | |
function shiftr(n, shift) | |
local pow = 2 ^ shift; | |
local mod = n % pow; | |
return (n - mod) / pow; | |
end | |
-- return math.floor(n / pow); | |
function getbits(n, shift, count) | |
n = shiftr(n, shift); | |
return n % (2 ^ count); | |
end | |
function utf8_char_raw(code) | |
local char = string.char; | |
if code >= 0 and code <= 0x7F then | |
return code; | |
elseif code >= 0x80 and code <= 0x7FF then | |
return 0x80 + 0x40 + getbits(code, 6, 5), 0x80 + getbits(code, 0, 6); | |
elseif code >= 0x800 and code <= 0xFFFF then | |
return 0x80 + 0x40 + 0x20 + getbits(code, 12, 4), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6); | |
elseif code >= 0x10000 and code <= 0x10FFFF then | |
return 0x80 + 0x40 + 0x20 + 0x10 + getbits(code, 18, 3), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6); | |
-- the below aren't defined by unicode | |
elseif code >= 0x10000 and code <= 0x1fffff then | |
return 0x80 + 0x40 + 0x20 + 0x10 + getbits(code, 18, 3), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6); | |
elseif code >= 0x200000 and code <= 0x3FFFFFF then | |
return 0x80 + 0x40 + 0x20 + 0x10 + 0x8 + getbits(code, 24, 2), 0x80 + getbits(code, 18, 6), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6); | |
elseif code >= 0x4000000 and code <= 0x7FFFFFFF then | |
return 0x80 + 0x40 + 0x20 + 0x10 + 0x8 + 0x4 + getbits(code, 30, 1), 0x80 + getbits(code, 24, 6), 0x80 + getbits(code, 18, 6), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6); | |
end | |
end | |
function utf8_char(code, ...) | |
local char = string.char; | |
if ... then | |
local r = {code, ...}; | |
for i=1,#r do r[i] = utf8_char(r[i]); end | |
return table.concat(r); | |
end | |
if code >= 0 and code <= 0x7F then | |
return char(code); | |
elseif code >= 0x80 and code <= 0x7FF then | |
return char(0x80 + 0x40 + getbits(code, 6, 5), 0x80 + getbits(code, 0, 6)); | |
elseif code >= 0x800 and code <= 0xFFFF then | |
return char(0x80 + 0x40 + 0x20 + getbits(code, 12, 4), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6)); | |
elseif code >= 0x10000 and code <= 0x10FFFF then | |
return char(0x80 + 0x40 + 0x20 + 0x10 + getbits(code, 18, 3), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6)); | |
-- the below aren't defined by unicode | |
elseif code >= 0x10000 and code <= 0x1fffff then | |
return char(0x80 + 0x40 + 0x20 + 0x10 + getbits(code, 18, 3), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6)); | |
elseif code >= 0x200000 and code <= 0x3FFFFFF then | |
return char(0x80 + 0x40 + 0x20 + 0x10 + 0x8 + getbits(code, 24, 2), 0x80 + getbits(code, 18, 6), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6)); | |
elseif code >= 0x4000000 and code <= 0x7FFFFFFF then | |
return char(0x80 + 0x40 + 0x20 + 0x10 + 0x8 + 0x4 + getbits(code, 30, 1), 0x80 + getbits(code, 24, 6), 0x80 + getbits(code, 18, 6), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6)); | |
end | |
end | |
function str2hex(s) | |
return s:gsub(".", function(x) return ("%02X"):format(x:byte()); end):gsub("..", " %1"):sub(2); | |
end | |
function str2bin(s) | |
return s:gsub(".", function(x) return bin(x:byte()); end):gsub("........", " %1"):sub(2); | |
end | |
local is_valid_utf8; do | |
local null = "\0"; | |
local function fix(s) return s:gsub("x(%x%x)", function(x) return string.char(tonumber(x, 16)) end); end | |
-- Patterns based on http://www.w3.org/International/questions/qa-forms-utf-8.en.php | |
-- verified in the Unicode specification (see http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf Table 3-7) | |
local ASCII = fix("[%zx01-x7F]"); | |
local non_overlong_2_byte = fix("[xC2-xDF][x80-xBF]"); | |
local excluding_overlongs = fix("xE0[xA0-xBF][x80-xBF]"); | |
local straight_3_byte = fix("[xE1-xECxEExEF][x80-xBF][x80-xBF]"); | |
local excluding_surrogates = fix("xED[x80-x9F][x80-xBF]"); | |
local planes_1_3 = fix("xF0[x90-xBF][x80-xBF][x80-xBF]"); | |
local planes_4_15 = fix("[xF1-xF3][x80-xBF][x80-xBF][x80-xBF]"); | |
local plane_16 = fix("xF4[x80-x8F][x80-xBF][x80-xBF]"); | |
local non_ASCII = fix("[x80-xFF]"); | |
local non_ASCII_run = fix("[x80-xFF]+"); | |
function is_valid_utf8(s) | |
for non_ascii in s:gmatch(non_ASCII_run) do | |
if non_ascii | |
:gsub(non_overlong_2_byte, null) | |
:gsub(excluding_overlongs, null) | |
:gsub(straight_3_byte, null) | |
:gsub(excluding_surrogates, null) | |
:gsub(planes_1_3, null) | |
:gsub(planes_4_15, null) | |
:gsub(plane_16, null) | |
:find(non_ASCII) | |
then return false; end | |
end | |
return true; | |
end | |
local invalid_xml_single = fix("[^x09x0Ax0Dx20-xFF]"); | |
local invalid_xml_double = fix("xFF[xFExFF]"); | |
function is_valid_xml_char(s) | |
return not s:find(invalid_xml_single) and not s:find(invalid_xml_double) and is_valid_utf8(s); | |
end | |
--print(is_valid_utf8("hello world")); | |
--print(is_valid_utf8(fix"xFF")); | |
--print(is_valid_utf8(fix"xFE")); | |
end | |
do | |
collectgarbage() print(collectgarbage('count')); | |
local function range_equal(a, b) | |
return a[1] == b[1] and a[2] == b[2]; | |
end | |
local function range_string(a) return "["..a[1].."-"..a[2].."]"; end | |
local function ranges_string(a) | |
local s = ""; | |
for i=1,#a do s = s..range_string(a[i]); end | |
return s; | |
end | |
local function range_merge(a, b) | |
if a[1] > b[2] then a,b = b,a; end | |
if a[2]+1 >= b[1] then | |
return {a[1], b[2]}; | |
end | |
end | |
local function merge(a, b) | |
if #a == #b then | |
--print("merge:", ranges_string(a), ranges_string(b)); | |
local unequal; | |
for i=1,#a do | |
if not range_equal(a[i], b[i]) then | |
if unequal then return nil; end -- two unequal | |
unequal = i; | |
end | |
end | |
if not unequal then return a; end -- same | |
-- just one unequal, do merge | |
local _c = range_merge(a[unequal], b[unequal]); | |
if _c then | |
local c = {}; | |
for i=1,#a do c[i] = a[i]; end | |
c[unequal] = _c; | |
--print("yes"); | |
return c; | |
end | |
end | |
end | |
--print("merge test:", merge({{229,229},{157,157},{155,155}}, {{229,229},{157,157},{156,156}})); | |
local splits_at = {{9},{13},{32},{194,128},{224,160,128},{225,128,128},{237,128,128},{238,160,128},{239,128,128},{239,191,128},{240,144,128,128},{241,128,128,128},{244,128,128,128}}; | |
local valid = {}; | |
function range(from, to) | |
for i=from, to do | |
local length = #valid; | |
local t = {utf8_char_raw(i)} | |
for i=1,#t do t[i] = {t[i], t[i]}; end | |
valid[#valid+1] = t; | |
while #valid > 1 do | |
local a, b = valid[#valid-1], valid[#valid]; | |
local c = merge(a, b); | |
if not c then break; end | |
valid[#valid-1] = c; | |
valid[#valid] = nil; | |
end | |
local _1,_2,_3,_4 = utf8_char_raw(i); | |
for j=1,#splits_at do | |
local a1,a2,a3,a4 = unpack(splits_at[j]); | |
if _4==a4 and _3==a3 and _2==a2 and _1==a1 then print("up: ", i, str2hex(utf8_char(i))) end | |
end | |
--if #valid > length then print("up: ", i, str2hex(utf8_char(i))) end | |
--if #valid < length then print("down: ", i, str2hex(utf8_char(i))) end | |
end | |
end | |
range(0x9, 0x9); | |
range(0xA, 0xA); | |
range(0xD, 0xD); | |
range(0x20, 0xD7FF); | |
range(0xE000, 0xFFFD); | |
range(0x10000, 0x10FFFF); | |
--range(0x0, 0x7F); | |
--range(0x80, 0x7FF); | |
--range(0x800, 0xFFFF); | |
--range(0x10000, 0x10FFFF); | |
local char = string.char; | |
local function range_string_hex(a) return "["..str2hex(char(a[1])).."-"..str2hex(char(a[2])).."]"; end | |
local function ranges_string_hex(a) | |
local s = ""; | |
for i=1,#a do s = s..range_string_hex(a[i]); end | |
return s; | |
end | |
local count = 0; | |
for _,range in ipairs(valid) do | |
print(ranges_string(range), ranges_string_hex(range)); | |
count = count + 1; | |
end | |
print(count); print(collectgarbage('count')); | |
end--]] | |
ranges = { automerge = true }; | |
function ranges:add(a, b) | |
if not b then b = a; end | |
if a > b then a,b = b,a end | |
local pos; | |
for i=1,#self do | |
local r = self[i]; | |
if a <= r[1] then | |
pos = i; | |
break; | |
end | |
end | |
if not pos then pos = #self+1; end | |
table.insert(self, pos, {a, b}); | |
if self.automerge then | |
if self:merge(pos - 1) == 0 then self:merge(pos); end | |
end | |
return self; | |
end | |
function ranges:subtract(a, b) | |
if not b then b = a; end | |
if a > b then a,b = b,a end | |
for i=#self,1,-1 do | |
local r=self[i]; | |
if r[2] < a then break; end -- deletion range too high | |
if r[1] <= b then -- some overlap with deletion range | |
if a > r[1] and b < r[2] then -- deletion makes a hole | |
table.insert(self, i+1, {b+1, r[2]}); | |
r[2] = a-1; | |
elseif a <= r[1] and b >= r[2] then -- full overlap | |
table.remove(self, i); | |
elseif b < r[2] then -- lower part may get chopped off | |
r[1] = b+1; | |
else -- upper part may get chopped off | |
r[2] = a-1; | |
end | |
end | |
end | |
return self; | |
end | |
function ranges:sort() | |
table.sort(self, function(a, b) return a[1] < b[1] or (a[1] == b[1] and a[2] < b[2]); end); | |
return self; | |
end | |
function ranges:merge(index) | |
local count = 0; | |
if index then | |
local r1 = self[index]; | |
if not r1 then return; end | |
local i = index + 1; | |
while true do | |
local r2 = self[i]; | |
if not r2 or r2[1] < r1[1] then break; end -- end or unsorted | |
if not(r1[2]+1 >= r2[1]) then break; end -- merge not possible | |
r1[2] = math.max(r1[2], r2[2]); | |
table.remove(self, i); | |
count = count + 1; | |
end | |
else | |
local i = 1; | |
while i < #self do | |
count = count + self:merge(i); | |
i = i + 1; | |
end | |
end | |
return count; | |
end | |
function ranges:__tostring() | |
local s = {}; | |
for i=1,#self do | |
local r = self[i]; | |
if r[1] == r[2] then | |
s[i] = r[1]; | |
else | |
s[i] = "["..r[1].."-"..r[2].."]"; | |
end | |
end | |
return "{ "..table.concat(s, ",").." }"; | |
end | |
function ranges:split(n) | |
for i=#self,1,-1 do | |
local r=self[i]; | |
if r[1] < n and r[2] >= n then -- hit | |
table.insert(self, i+1, {n, r[2]}); | |
r[2] = n-1; | |
elseif r[2] < n then | |
break; -- went too low | |
end | |
end | |
return self; | |
end | |
setmetatable(ranges, ranges); | |
ranges:add(0x9, 0x9); | |
ranges:add(0xA, 0xA); | |
ranges:add(0xD, 0xD); | |
ranges:add(0x20, 0xD7FF); | |
ranges:add(0xE000, 0xFFFD); | |
ranges:add(0x10000, 0x10FFFF); | |
print(ranges); | |
ranges:split(0x80); | |
ranges:split(0x800); | |
ranges:split(0x10000); | |
ranges:split(0x200000); | |
ranges:split(0x4000000); | |
print(ranges); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment