Created
June 20, 2012 08:50
-
-
Save daurnimator/2958879 to your computer and use it in GitHub Desktop.
Add utf8 locale to lpeg using ICU (from the C side)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "lua.h" | |
#include "lauxlib.h" | |
#include "lpeg.h" | |
#include "unicode/utf.h" | |
#include "unicode/uchar.h" | |
/*
 * Look up the pattern constructor stored in the Lua registry under
 * KEYNEWPATT and return it as a Newpf function pointer.
 *
 * Raises a Lua error ("newpf not found") and does not return when the
 * registry entry is absent or is not a C function. On success the value
 * fetched from the registry is popped again, so the stack is unchanged.
 */
static Newpf get_l_newpf ( lua_State *L ) {
    Newpf constructor;

    lua_getfield ( L , LUA_REGISTRYINDEX , KEYNEWPATT );
    constructor = (Newpf)lua_tocfunction ( L , -1 );
    if ( constructor == NULL ) {
        lua_pushstring ( L , "newpf not found" );
        lua_error ( L ); /* longjmp */
    }
    lua_pop ( L , 1 );
    return constructor;
}
static const char * match_utf8_n (const char *s, const char *e, const char *o, const void *ud) { | |
int n = *(int*)ud; | |
int i = s-o; | |
UChar32 codepoint; | |
for ( ; n > 0 ; n-- ) { | |
if (i >= (e-o)) return NULL; | |
U8_NEXT ( o , i , e-o , codepoint ); | |
if ( codepoint < 0 ) return NULL; | |
} | |
return o+i; | |
} | |
/*
 * Lua: N(n)
 * Builds a pattern backed by match_utf8_n that consumes `n` UTF-8 code
 * points. The integer argument is handed to the pattern constructor as
 * the matcher's user data (sizeof(int) bytes).
 * Returns 1 value to Lua (presumably the pattern the constructor pushed;
 * the constructor itself is defined outside this file).
 */
static int nchars ( lua_State *L ) {
    int count = luaL_checkint ( L , 1 );
    Newpf make_pattern = get_l_newpf ( L );

    make_pattern ( L , match_utf8_n , &count , sizeof count );
    return 1;
}
static const char * match_utf8_range (const char *s, const char *e, const char *o, const void *ud) { | |
const UChar32* codepoint_range = ud; | |
int i = s-o; | |
UChar32 codepoint; | |
U8_NEXT ( o , i , e-o , codepoint ); | |
if ( codepoint < 0 ) return NULL; | |
if ( codepoint < codepoint_range[0] || codepoint_range[1] < codepoint ) return NULL; | |
return o+i; | |
} | |
/*
 * Lua: R(str)
 * Builds a pattern backed by match_utf8_range. `str` must start with two
 * well-formed UTF-8 characters: the lowest and the highest code point of
 * the (inclusive) range.
 *
 * Raises an argument error when fewer than two valid characters can be
 * decoded from `str`. Returns 1 value to Lua.
 */
static int range ( lua_State *L ) {
    size_t l;
    const char *r = luaL_checklstring(L, 1, &l);
    UChar32 codepoints[2] = { -1 , -1 }; /* negative == "not decoded" */
    int i = 0;
    int k;

    for ( k = 0 ; k < 2 ; k++ ) {
        /* U8_NEXT requires i < length: never decode past the end of the
           argument (an empty or one-character string used to read the
           terminating NUL, or beyond it, as a range bound). */
        if ( (size_t)i < l ) {
            U8_NEXT ( r , i , l , codepoints[k] );
        }
    }
    luaL_argcheck(L, codepoints[0] >= 0 && codepoints[1] >= 0 , 1, "range must have two valid characters");
    get_l_newpf ( L ) ( L , match_utf8_range , codepoints , sizeof(codepoints) );
    return 1;
}
static const char * match_utf8_class (const char *s, const char *e, const char *o, const void *ud) { | |
UBool (*validator)(UChar32) = *(UBool (**)(UChar32))ud; | |
int i = s-o; | |
UChar32 codepoint; | |
U8_NEXT ( o , i , e-o , codepoint ); | |
if ( codepoint < 0 ) return NULL; | |
if ( !validator(codepoint) ) return NULL; | |
return o+i; | |
} | |
/* Creates a pattern from `validator` and saves it in the table on top of the stack under the field `name` */ | |
static void reg_class ( lua_State *L , const char * name , UBool (*validator)(UChar32) ) { | |
get_l_newpf ( L ) ( L , match_utf8_class , (void*)&validator , sizeof( &validator ) ); | |
lua_setfield ( L , -2 , name ); | |
} | |
/*
 * Lua: locale()
 * Returns a new table with one pattern per character class, each backed
 * by the corresponding ICU u_is* predicate:
 * alnum, alpha, cntrl, digit, graph, lower, print, punct, space, upper,
 * xdigit.
 */
static int locale ( lua_State *L ) {
    static const struct {
        const char *name;
        UBool (*validator)(UChar32);
    } classes[] = {
        { "alnum"  , &u_isalnum  },
        { "alpha"  , &u_isalpha  },
        { "cntrl"  , &u_iscntrl  },
        { "digit"  , &u_isdigit  },
        { "graph"  , &u_isgraph  },
        { "lower"  , &u_islower  },
        { "print"  , &u_isprint  },
        { "punct"  , &u_ispunct  },
        { "space"  , &u_isspace  },
        { "upper"  , &u_isupper  },
        { "xdigit" , &u_isxdigit }
    };
    size_t k;

    lua_newtable ( L );
    /* reg_class stores each pattern into the table on top of the stack */
    for ( k = 0 ; k < sizeof(classes) / sizeof(classes[0]) ; k++ ) {
        reg_class ( L , classes[k].name , classes[k].validator );
    }
    return 1;
}
static struct luaL_Reg funcs[] = { | |
{"N", nchars}, | |
{"R", range}, | |
{"locale", locale}, | |
{NULL, NULL} | |
}; | |
/*
 * Module entry point.
 * Calls the global `require` with "lpeg" (discarding its result), then
 * returns a fresh table populated with the functions listed in `funcs`.
 * NOTE(review): loading lpeg first is presumably what makes the
 * KEYNEWPATT registry entry available to get_l_newpf - confirm against
 * the lpeg sources in use.
 */
int luaopen_lpeg_utf8 ( lua_State *L ) {
    /* require "lpeg" */
    lua_getglobal ( L , "require" );
    lua_pushliteral ( L , "lpeg" );
    lua_call ( L , 1 , 0 );

    /* module table; a NULL libname makes luaL_register fill the table
       on top of the stack */
    lua_createtable ( L , 0 , 0 );
    luaL_register ( L , NULL , funcs );
    return 1;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Builds lpeg_utf8.so, a Lua 5.1 C module that links against ICU (icu-uc).
LIBNAME = lpeg_utf8
OUT = $(LIBNAME).so
LUADIR = /usr/include/lua5.1/
LPEGDIR = ../lpeg-0.10.2/
# ICU's compile flags are needed when compiling, not only when linking.
CFLAGS = -O2 -fpic --pedantic -I$(LUADIR) -I$(LPEGDIR) `pkg-config --cflags icu-uc`
CC = gcc

.PHONY: test clean

# Libraries go after the object file: linkers running with --as-needed
# drop libraries that appear before the objects that reference them.
$(OUT): lpeg_utf8.o
	$(CC) -O -shared -fpic -o $(OUT) $< `pkg-config --libs icu-uc`

# Compiled by make's built-in %.o: %.c rule; rebuilt when the Makefile changes.
lpeg_utf8.o: Makefile lpeg_utf8.c

test: test.lua $(OUT)
	lua test.lua

# -f: do not fail when there is nothing to remove.
clean:
	rm -f *.o *.so
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Smoke tests for the lpeg_utf8 C module.
-- Run from the directory that contains lpeg_utf8.so (see `make test`).
--
-- Non-ASCII subjects are written as decimal byte escapes so the tests do
-- not depend on the encoding this file is saved or displayed in.
-- NOTE(review): the multi-byte literals in the copy this was restored
-- from were corrupted; the code points below are a self-consistent
-- reconstruction - confirm against the original test data if available.
local lib = assert ( package.loadlib ( "./lpeg_utf8.so" , "luaopen_lpeg_utf8" ) )
local utf8 = lib()
local locale = utf8.locale()
local lpeg = require "lpeg"
local lpeg_locale = lpeg.locale()

-- utf8.N(n) consumes n characters; match returns the next byte position.
local omega = "\206\169" -- U+03A9, two bytes
assert(utf8.N(1):match("a") == 2)
assert(utf8.N(1):match(omega) == 3)
assert(utf8.N(2):match("a" .. omega) == 4)

-- Character classes backed by ICU.
assert(locale.lower:match("a"))
assert(not locale.upper:match("a"))

-- Ranges over single-byte characters.
local ascii_lower = utf8.R"az"
assert(not ascii_lower:match("`"))
assert(ascii_lower:match("a"))
assert(ascii_lower:match("m"))
assert(ascii_lower:match("z"))
assert(not ascii_lower:match("{"))

-- Ranges over four-byte characters (Egyptian hieroglyphs block).
local glyph_below = "\240\146\191\191" -- U+12FFF, just below the range
local glyph_first = "\240\147\128\128" -- U+13000
local glyph_mid   = "\240\147\128\183" -- U+13037
local glyph_last  = "\240\147\129\173" -- U+1306D
local glyph_above = "\240\147\129\174" -- U+1306E, just above the range
local some_hieroglyphs = utf8.R(glyph_first .. glyph_last)
assert(not some_hieroglyphs:match(glyph_below))
assert(some_hieroglyphs:match(glyph_first))
assert(some_hieroglyphs:match(glyph_mid))
assert(some_hieroglyphs:match(glyph_last))
assert(not some_hieroglyphs:match(glyph_above))

-- Patterns from this module must combine with ordinary lpeg patterns in
-- an ordered choice, just like lpeg's own locale classes do.
local pass = (lpeg.P('_') + locale.alnum) + lpeg.P('-')
assert(pass:match("foo_bar"))
local fail = (lpeg_locale.alnum + lpeg.P('_')) + lpeg.P('-')
assert(fail:match("foo_bar"))

-- Repetition and capture over utf8.N must return the input unchanged.
local trim_space = lpeg.P(' ')^0 * lpeg.C(utf8.N(1)^0)
local input = 'foobar'
local result = lpeg.match(trim_space, input)
print(input:byte(1,-1))
print(result:byte(1,-1))
assert(input == result)

print("TESTS ALL PASSED")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment