hinrik · June 18, 2012 18:18
diff --git a/lpeg_utf8_locale.lua b/lpeg_utf8_locale.lua
 local lpeg = require 'lpeg'
 local U    = require 'icu.ustring'
 local re   = require 'icu.regex'

 local utf8_codepoint
 do
  -- Decode a two-byte UTF-8 sequence (lead byte 110xxxxx followed by one
  -- continuation byte 10xxxxxx) into its code point number.
  local function f2 (s)
    local lead, trail = string.byte(s, 1, 2)
    -- strip the 0xC0 / 0x80 marker bits, then join the 5 + 6 payload bits
    return (lead - 0xC0) * 64 + (trail - 0x80)
  end

  -- Decode a three-byte UTF-8 sequence (lead byte 1110xxxx followed by two
  -- continuation bytes) into its code point number.
  local function f3 (s)
    local lead, mid, last = string.byte(s, 1, 3)
    -- strip the 0xE0 / 0x80 marker bits, then join the 4 + 6 + 6 payload bits
    local high = (lead - 0xE0) * 64 + (mid - 0x80)
    return high * 64 + (last - 0x80)
  end

  -- Decode a four-byte UTF-8 sequence (lead byte 11110xxx followed by three
  -- continuation bytes) into its code point number.
  local function f4 (s)
    local lead, b2, b3, b4 = string.byte(s, 1, 4)
    -- strip the 0xF0 / 0x80 marker bits, then join the 3 + 6 + 6 + 6 payload bits
    local acc = (lead - 0xF0) * 64 + (b2 - 0x80)
    acc = acc * 64 + (b3 - 0x80)
    return acc * 64 + (b4 - 0x80)
  end

  local cont = lpeg.R("\128\191")   -- continuation byte (10xxxxxx)

  -- Match exactly one well-formed UTF-8 sequence and capture its code point
  -- as a number.
  --
  -- The second byte is narrowed after the lead bytes E0, ED, F0 and F4 so
  -- that overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and values
  -- above U+10FFFF fail to match instead of being decoded into a bogus code
  -- point.  The subtraction constants inside f3/f4 are unaffected because the
  -- lead/continuation marker bits are the same for every accepted sequence.
  utf8_codepoint = lpeg.R("\0\127") / string.byte
    + lpeg.R("\194\223") * cont / f2
    + ( lpeg.P("\224") * lpeg.R("\160\191")           -- E0: no overlong forms
      + lpeg.R("\225\236", "\238\239") * cont         -- E1..EC, EE..EF
      + lpeg.P("\237") * lpeg.R("\128\159")           -- ED: no surrogates
      ) * cont / f3
    + ( lpeg.P("\240") * lpeg.R("\144\191")           -- F0: no overlong forms
      + lpeg.R("\241\243") * cont                     -- F1..F3
      + lpeg.P("\244") * lpeg.R("\128\143")           -- F4: max U+10FFFF
      ) * cont * cont / f4
 end

 -- One anchored regex per character class.  Each pattern is '^\p{NAME}$',
 -- i.e. the whole subject must be a single member of the class.
 local function class_regex (name)
   return re.compile('^\\p{' .. name .. '}$')
 end

 local alnum  = class_regex('alnum')
 local alpha  = class_regex('alpha')
 local cntrl  = class_regex('cntrl')
 local digit  = class_regex('digit')
 local graph  = class_regex('graph')
 local lower  = class_regex('lower')
 -- NOTE: this local shadows the global `print` for the rest of the chunk.
 local print  = class_regex('print')
 local punct  = class_regex('punct')
 local space  = class_regex('space')
 local upper  = class_regex('upper')
 local xdigit = class_regex('xdigit')

 -- Build an LPeg pattern that consumes one UTF-8 encoded code point and
 -- succeeds only if that code point matches the compiled regex `regex`.
 local function class_pattern (regex)
   return lpeg.Cmt(utf8_codepoint, function (_subject, _pos, c)
     -- `c` is the decoded code point captured by utf8_codepoint; it is turned
     -- into a one-character string via U.char before matching.  Returning
     -- true keeps the position reached after the code point was consumed.
     return not not re.match(regex, U.char(c))
   end)
 end

 -- Module value: one pattern per character class name.
 return {
  alnum  = class_pattern(alnum) ;
  alpha  = class_pattern(alpha) ;
  cntrl  = class_pattern(cntrl) ;
  digit  = class_pattern(digit) ;
  graph  = class_pattern(graph) ;
  lower  = class_pattern(lower) ;
  print  = class_pattern(print) ;
  punct  = class_pattern(punct) ;
  space  = class_pattern(space) ;
  upper  = class_pattern(upper) ;
  -- fixed: this entry used to test against the `digit` regex, so the
  -- letters a-f / A-F were never recognised as hexadecimal digits
  xdigit = class_pattern(xdigit) ;
 }
	local lpeg = require 'lpeg'
	local U = require 'icu.ustring'
	local re = require 'icu.regex'

	local utf8_codepoint
	do
	-- Decode a two-byte UTF-8 sequence (lead byte 110xxxxx followed by one
	-- continuation byte 10xxxxxx) into its code point number.
	local function f2 (s)
		local lead, trail = string.byte(s, 1, 2)
		-- strip the 0xC0 / 0x80 marker bits, then join the 5 + 6 payload bits
		return (lead - 0xC0) * 64 + (trail - 0x80)
	end

	-- Decode a three-byte UTF-8 sequence (lead byte 1110xxxx followed by two
	-- continuation bytes) into its code point number.
	local function f3 (s)
		local lead, mid, last = string.byte(s, 1, 3)
		-- strip the 0xE0 / 0x80 marker bits, then join the 4 + 6 + 6 payload bits
		local high = (lead - 0xE0) * 64 + (mid - 0x80)
		return high * 64 + (last - 0x80)
	end

	-- Decode a four-byte UTF-8 sequence (lead byte 11110xxx followed by three
	-- continuation bytes) into its code point number.
	local function f4 (s)
		local lead, b2, b3, b4 = string.byte(s, 1, 4)
		-- strip the 0xF0 / 0x80 marker bits, then join the 3 + 6 + 6 + 6 payload bits
		local acc = (lead - 0xF0) * 64 + (b2 - 0x80)
		acc = acc * 64 + (b3 - 0x80)
		return acc * 64 + (b4 - 0x80)
	end

	local cont = lpeg.R("\128\191") -- continuation byte (10xxxxxx)

	-- Match exactly one well-formed UTF-8 sequence and capture its code point
	-- as a number.
	--
	-- The second byte is narrowed after the lead bytes E0, ED, F0 and F4 so
	-- that overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and values
	-- above U+10FFFF fail to match instead of being decoded into a bogus code
	-- point.  The subtraction constants inside f3/f4 are unaffected because the
	-- lead/continuation marker bits are the same for every accepted sequence.
	utf8_codepoint = lpeg.R("\0\127") / string.byte
		+ lpeg.R("\194\223") * cont / f2
		+ ( lpeg.P("\224") * lpeg.R("\160\191")           -- E0: no overlong forms
			+ lpeg.R("\225\236", "\238\239") * cont         -- E1..EC, EE..EF
			+ lpeg.P("\237") * lpeg.R("\128\159")           -- ED: no surrogates
			) * cont / f3
		+ ( lpeg.P("\240") * lpeg.R("\144\191")           -- F0: no overlong forms
			+ lpeg.R("\241\243") * cont                     -- F1..F3
			+ lpeg.P("\244") * lpeg.R("\128\143")           -- F4: max U+10FFFF
			) * cont * cont / f4
	end

	-- One anchored regex per character class.  Each pattern is '^\p{NAME}$',
	-- i.e. the whole subject must be a single member of the class.
	local function class_regex (name)
		return re.compile('^\\p{' .. name .. '}$')
	end

	local alnum  = class_regex('alnum')
	local alpha  = class_regex('alpha')
	local cntrl  = class_regex('cntrl')
	local digit  = class_regex('digit')
	local graph  = class_regex('graph')
	local lower  = class_regex('lower')
	-- NOTE: this local shadows the global `print` for the rest of the chunk.
	local print  = class_regex('print')
	local punct  = class_regex('punct')
	local space  = class_regex('space')
	local upper  = class_regex('upper')
	local xdigit = class_regex('xdigit')

	-- Build an LPeg pattern that consumes one UTF-8 encoded code point and
	-- succeeds only if that code point matches the compiled regex `regex`.
	local function class_pattern (regex)
		return lpeg.Cmt(utf8_codepoint, function (_subject, _pos, c)
			-- `c` is the decoded code point captured by utf8_codepoint; it is turned
			-- into a one-character string via U.char before matching.  Returning
			-- true keeps the position reached after the code point was consumed.
			return not not re.match(regex, U.char(c))
		end)
	end

	-- Module value: one pattern per character class name.
	return {
		alnum  = class_pattern(alnum) ;
		alpha  = class_pattern(alpha) ;
		cntrl  = class_pattern(cntrl) ;
		digit  = class_pattern(digit) ;
		graph  = class_pattern(graph) ;
		lower  = class_pattern(lower) ;
		print  = class_pattern(print) ;
		punct  = class_pattern(punct) ;
		space  = class_pattern(space) ;
		upper  = class_pattern(upper) ;
		-- fixed: this entry used to test against the `digit` regex, so the
		-- letters a-f / A-F were never recognised as hexadecimal digits
		xdigit = class_pattern(xdigit) ;
	}
No results found