Last active
December 30, 2015 08:29
-
-
Save vrld/7803140 to your computer and use it in GitHub Desktop.
Simple, not fully standards-compliant UTF-8 iterator for vanilla Lua
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- Stateless iterator step: extract the next UTF-8 sequence from a string.
-- Intended to be driven by the generic `for`, with `s` as the invariant
-- state and `i` as the control variable.
-- @param s string to walk
-- @param i byte offset of the last byte already consumed (0 to start)
-- @return byte offset of the last byte of the extracted sequence, the
--   sequence itself as a substring of s, and its width in bytes;
--   returns nothing once i has reached the end of s
local function utf8_iter(s, i)
  local len = #s
  if i >= len then return end
  local b = s:byte(i + 1)
  local nbytes = 1
  -- Determine the width of the sequence from the number of leading set bits
  -- in the first byte. Not 100% to standard: the legacy 5- and 6-byte forms
  -- are accepted, and a stray continuation byte (0x80-0xBF) is treated as a
  -- one-byte character.
  if b >= 252 then nbytes = 6      -- 1111110x
  elseif b >= 248 then nbytes = 5  -- 111110xx
  elseif b >= 240 then nbytes = 4  -- 11110xxx
  elseif b >= 224 then nbytes = 3  -- 1110xxxx
  elseif b >= 192 then nbytes = 2  -- 110xxxxx
  end
  -- A sequence cut off by the end of the string must not report an index
  -- beyond #s or a width larger than the substring that is returned.
  if i + nbytes > len then nbytes = len - i end
  return i + nbytes, s:sub(i + 1, i + nbytes), nbytes
end
--- Build the generic-for triplet that walks a string one UTF-8 sequence
-- at a time: the stepping function, the string as invariant state, and
-- the initial byte offset.
-- @param s string to iterate over
-- @return utf8_iter, s, 0
local function iutf8(s)
  local start_offset = 0 -- nothing consumed yet
  return utf8_iter, s, start_offset
end
-- Demo: iterate over a string mixing ASCII and two-byte characters.
-- `str` is declared local so the example does not leak a global.
local str = 'abcdeäÄ«ÖÓÚÜöüßå'
print(str) --> abcdeäÄ«ÖÓÚÜöüßå
-- i is the position of the *last* byte of the character
-- c contains the unicode character
-- nbytes is the width (in bytes) of the character
for i, c, nbytes in iutf8(str) do
  print(i, c, nbytes)
end
-- output of the loop: | |
-- 1 a 1 | |
-- 2 b 1 | |
-- 3 c 1 | |
-- 4 d 1 | |
-- 5 e 1 | |
-- 7 ä 2 | |
-- 9 Ä 2 | |
-- 11 « 2 | |
-- 13 Ö 2 | |
-- 15 Ó 2 | |
-- 17 Ú 2 | |
-- 19 Ü 2 | |
-- 21 ö 2 | |
-- 23 ü 2 | |
-- 25 ß 2 | |
-- 27 å 2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment