Created
June 17, 2012 04:28
-
-
Save daurnimator/2943418 to your computer and use it in GitHub Desktop.
Adding UTF8 support for lpeg
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
local lpeg = require "lpeg"
local utf8_codepoint
do
-- Decode a two-byte UTF-8 sequence (110xxxxx 10xxxxxx) into its code point.
-- `s` is the matched text; only its first two bytes are read.
local function f2 (s)
    local lead, trail = string.byte(s, 1, 2)
    -- strip the 0xC0 marker from the lead byte and the 0x80 marker from the
    -- continuation byte, then join the 5 + 6 payload bits
    return (lead - 0xC0) * 64 + (trail - 0x80)
end
-- Decode a three-byte UTF-8 sequence (1110xxxx 10xxxxxx 10xxxxxx) into its
-- code point. `s` is the matched text; only its first three bytes are read.
local function f3 (s)
    local lead, mid, trail = string.byte(s, 1, 3)
    -- remove the 0xE0 / 0x80 / 0x80 markers and join the 4 + 6 + 6 payload bits
    local upper = (lead - 0xE0) * 64 + (mid - 0x80)
    return upper * 64 + (trail - 0x80)
end
-- Decode a four-byte UTF-8 sequence (11110xxx followed by three 10xxxxxx
-- bytes) into its code point. `s` is the matched text; only its first four
-- bytes are read.
local function f4 (s)
    local b1, b2, b3, b4 = string.byte(s, 1, 4)
    -- remove the 0xF0 marker and the three 0x80 markers, accumulating the
    -- 3 + 6 + 6 + 6 payload bits from most to least significant
    local acc = (b1 - 0xF0) * 64 + (b2 - 0x80)
    acc = acc * 64 + (b3 - 0x80)
    return acc * 64 + (b4 - 0x80)
end
-- Continuation byte: 10xxxxxx.
local cont = lpeg.R("\128\191")
-- Three-byte sequences. The second byte is restricted per RFC 3629 so that
-- overlong encodings (E0 80..9F) and the UTF-16 surrogate range
-- U+D800..U+DFFF (ED A0..BF) are rejected instead of being decoded.
local seq3 = lpeg.P("\224") * lpeg.R("\160\191") * cont
    + lpeg.R("\225\236", "\238\239") * cont * cont
    + lpeg.P("\237") * lpeg.R("\128\159") * cont
-- Four-byte sequences. The second byte is restricted so that overlong
-- encodings (F0 80..8F) and values above U+10FFFF (F4 90..BF) are rejected.
local seq4 = lpeg.P("\240") * lpeg.R("\144\191") * cont * cont
    + lpeg.R("\241\243") * cont * cont * cont
    + lpeg.P("\244") * lpeg.R("\128\143") * cont * cont
-- Matches exactly one well-formed UTF-8 encoded character and produces its
-- code point (a number) as the single capture. Ill-formed input does not match.
utf8_codepoint = lpeg.R("\0\127") / string.byte
    + lpeg.R("\194\223") * cont / f2
    + seq3 / f3
    + seq4 / f4
end
local ffi = require "ffi"
-- C declarations for the ICU character-classification predicates used below.
-- Every symbol name carries the "_49" version suffix, so only an ICU build
-- that exports these exact names can satisfy them.
-- NOTE(review): UBool is declared as `bool` so that the FFI hands results back
-- as Lua booleans, which is what the lpeg.Cmt callbacks return; confirm this
-- is ABI-compatible with the UBool definition of the ICU build in use.
ffi.cdef [[
typedef bool UBool;
typedef int32_t UChar32;
UBool u_islower_49(UChar32 c);
UBool u_isupper_49(UChar32 c);
UBool u_isdigit_49(UChar32 c);
UBool u_isalpha_49(UChar32 c);
UBool u_isalnum_49(UChar32 c);
UBool u_isxdigit_49(UChar32 c);
UBool u_ispunct_49(UChar32 c);
UBool u_isgraph_49(UChar32 c);
UBool u_isspace_49(UChar32 c);
UBool u_iscntrl_49(UChar32 c);
UBool u_isprint_49(UChar32 c);
]]
-- Handle to the ICU common library; symbols are resolved through it on use.
local ICU = ffi.load ( "icuuc" )
-- Unicode-aware counterparts of the character classes lpeg.locale provides.
-- Each entry is a pattern that matches one UTF-8 encoded character (via
-- utf8_codepoint) and succeeds only if the corresponding ICU predicate
-- accepts the decoded code point.
local utf8_locale = { }
do
    local class_names = {
        "alnum" , "alpha" , "cntrl" , "digit" , "graph" , "lower" ,
        "print" , "punct" , "space" , "upper" , "xdigit" ,
    }
    -- Build the match-time pattern for one class; the ICU symbol is looked up
    -- when the pattern runs, not when it is constructed.
    local function class_pattern ( name )
        local symbol = "u_is" .. name .. "_49"
        return lpeg.Cmt ( utf8_codepoint , function ( _ , _ , codepoint )
            return ICU [ symbol ] ( codepoint )
        end )
    end
    for index = 1 , #class_names do
        local name = class_names [ index ]
        utf8_locale [ name ] = class_pattern ( name )
    end
end
-- Module exports: the single-character decoder pattern and the table of
-- ICU-backed character-class patterns.
local exports = { }
exports.utf8_codepoint = utf8_codepoint
exports.utf8_locale = utf8_locale
return exports
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment