-
-
Save hinrik/2949801 to your computer and use it in GitHub Desktop.
UTF-8 character classes for LPeG, using ICU4Lua
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
local lpeg = require 'lpeg' | |
local U = require 'icu.ustring' | |
local re = require 'icu.regex' | |
-- LPeg pattern that consumes exactly one well-formed UTF-8 encoded character
-- and produces its code point (a number) as the single capture.
local utf8_codepoint
do
  -- Each decoder receives the matched byte sequence as a string. The constant
  -- subtracted at the end removes the marker bits of the lead byte and of
  -- every continuation byte (0x80) in a single step.

  -- decode a two-byte UTF-8 sequence
  -- 12416 = 0xC0 * 64 + 0x80
  local function f2 (s)
    local c1, c2 = string.byte(s, 1, 2)
    return c1 * 64 + c2 - 12416
  end

  -- decode a three-byte UTF-8 sequence
  -- 925824 = (0xE0 * 64 + 0x80) * 64 + 0x80
  local function f3 (s)
    local c1, c2, c3 = string.byte(s, 1, 3)
    return (c1 * 64 + c2) * 64 + c3 - 925824
  end

  -- decode a four-byte UTF-8 sequence
  -- 63447168 = ((0xF0 * 64 + 0x80) * 64 + 0x80) * 64 + 0x80
  local function f4 (s)
    local c1, c2, c3, c4 = string.byte(s, 1, 4)
    return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
  end

  local cont = lpeg.R("\128\191") -- continuation byte

  -- Three-byte sequences. The second byte is restricted after 0xE0 (to reject
  -- overlong encodings of code points below U+0800) and after 0xED (to reject
  -- the UTF-16 surrogate range U+D800..U+DFFF).
  local seq3 = lpeg.P("\224") * lpeg.R("\160\191") * cont
             + lpeg.R("\225\236") * cont * cont
             + lpeg.P("\237") * lpeg.R("\128\159") * cont
             + lpeg.R("\238\239") * cont * cont

  -- Four-byte sequences. The second byte is restricted after 0xF0 (to reject
  -- overlong encodings of code points below U+10000) and after 0xF4 (to
  -- reject values above U+10FFFF).
  local seq4 = lpeg.P("\240") * lpeg.R("\144\191") * cont * cont
             + lpeg.R("\241\243") * cont * cont * cont
             + lpeg.P("\244") * lpeg.R("\128\143") * cont * cont

  -- 0xC0 and 0xC1 are absent from the two-byte lead range because they could
  -- only start overlong encodings of ASCII.
  utf8_codepoint = lpeg.R("\0\127") / string.byte
                 + lpeg.R("\194\223") * cont / f2
                 + seq3 / f3
                 + seq4 / f4
end
-- Compiled ICU regexes, one per character class. Every pattern is anchored at
-- both ends and contains a single \p{...} property.
-- NOTE: the local `print` shadows the global print function for the rest of
-- this chunk.
local alnum, alpha, cntrl, digit, graph, lower
local print, punct, space, upper, xdigit
do
  local compile = re.compile
  -- Long-bracket strings take no escapes, so the backslash is literal.
  alnum  = compile([[^\p{alnum}$]])
  alpha  = compile([[^\p{alpha}$]])
  cntrl  = compile([[^\p{cntrl}$]])
  digit  = compile([[^\p{digit}$]])
  graph  = compile([[^\p{graph}$]])
  lower  = compile([[^\p{lower}$]])
  print  = compile([[^\p{print}$]])
  punct  = compile([[^\p{punct}$]])
  space  = compile([[^\p{space}$]])
  upper  = compile([[^\p{upper}$]])
  xdigit = compile([[^\p{xdigit}$]])
end
-- Build an LPeg pattern that consumes one UTF-8 encoded character (via
-- utf8_codepoint) and succeeds only when that character, converted with
-- U.char, matches the compiled ICU regex `rx`.
local function codepoint_class (rx)
  return lpeg.Cmt(utf8_codepoint, function (_, _, c)
    -- lpeg.Cmt treats a returned number as a new subject position, so the
    -- result of re.match is coerced to a plain boolean.
    return not not re.match(rx, U.char(c))
  end)
end

-- Module table: one LPeg pattern per character class, keyed by class name.
return {
  alnum  = codepoint_class(alnum) ;
  alpha  = codepoint_class(alpha) ;
  cntrl  = codepoint_class(cntrl) ;
  digit  = codepoint_class(digit) ;
  graph  = codepoint_class(graph) ;
  lower  = codepoint_class(lower) ;
  print  = codepoint_class(print) ; -- `print` is the local regex here, not the builtin
  punct  = codepoint_class(punct) ;
  space  = codepoint_class(space) ;
  upper  = codepoint_class(upper) ;
  xdigit = codepoint_class(xdigit) ; -- previously tested against `digit` by mistake
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment