Skip to content

Instantly share code, notes, and snippets.

@DmitryOlshansky
Created May 26, 2013 20:31
Show Gist options
  • Select an option

  • Save DmitryOlshansky/5653927 to your computer and use it in GitHub Desktop.

Select an option

Save DmitryOlshansky/5653927 to your computer and use it in GitHub Desktop.
Experimental UTF-8 stride calculation
import std.traits, std.utf : UTFException;
/**
 * Length, in code units, of the UTF-8 sequence that begins at `str[index]`.
 *
 * Bytes below 0x80 are answered inline with 1. Any other byte is forwarded,
 * together with `index`, to `strideImpl`, which either returns the length or
 * throws `UTFException`.
 */
uint stride(S)(auto ref S str, size_t index)
    if (is(S : const char[]) ||
        (isRandomAccessRange!S && is(Unqual!(ElementType!S) == char)))
{
    immutable lead = str[index];
    // The slow path is only evaluated for bytes with the high bit set.
    return lead < 0x80 ? 1 : strideImpl(lead, index);
}
/**
 * Slow path of `stride`: derives the sequence length from a non-ASCII lead
 * byte by counting its leading one bits.
 *
 * Params:
 *   c     = byte with the high bit set (checked by the `in` contract)
 *   index = position of `c` in its source; only used to fill the exception
 *
 * Returns: 2, 3 or 4.
 *
 * Throws: UTFException when `c` is a continuation byte (0x80..0xBF) or a
 *         lead byte that announces more than four code units (0xF8..0xFF),
 *         which RFC 3629 does not allow.
 */
private uint strideImpl(char c, size_t index)
in { assert(c & 0x80); }
body
{
    import core.bitop : bsr;

    // Invert inside 8 bits only. A bare `~c` is promoted to int on compilers
    // that apply integer promotion to unary operators; that sets bits 8..31
    // and makes bsr report 31 for every input.
    immutable uint inverted = (~cast(uint) c) & 0xFF;

    // 0xFF has no zero bit at all, and bsr(0) is undefined.
    if (inverted == 0)
        throw new UTFException("Invalid UTF-8 sequence", index);

    // Position of the highest zero bit, counted from the top, equals the
    // number of leading ones in `c`.
    immutable msbs = 7 - bsr(inverted);
    if (msbs < 2 || msbs > 4)
        throw new UTFException("Invalid UTF-8 sequence", index);
    return msbs;
}
/**
 * Table-driven counterpart of `stride`: length, in code units, of the UTF-8
 * sequence that begins at `src[idx]`.
 *
 * Bytes below 0x80 yield 1 directly; every other byte is passed, together
 * with `idx`, to `myStrideImpl`.
 */
size_t myStride(S)(auto ref S src, size_t idx)
    if (is(S : const char[]) ||
        (isRandomAccessRange!S && is(Unqual!(ElementType!S) == char)))
{
    immutable unit = src[idx];
    // Non-ASCII bytes take the lookup path.
    if (unit >= 0x80)
        return myStrideImpl(unit, idx);
    return 1;
}
/**
 * Slow path of `myStride`: looks the sequence length up in a packed table
 * indexed by bits 4..6 of the lead byte.
 *
 * Params:
 *   c   = value of the lead byte, high bit set (checked by the `in` contract)
 *   idx = position of the byte in its source; only used to fill the exception
 *
 * Returns: 2, 3 or 4.
 *
 * Throws: UTFException when the table entry is 0 or when `c` is in
 *         0xF8..0xFF.
 */
size_t myStrideImpl(size_t c, size_t idx)
in
{
    assert(c & 0x80);
}
body
{
    // Eight 4-bit entries packed into one word by calcTable(), evaluated at
    // compile time because the result initialises an enum.
    enum mask = calcTable();
    // pick bits 0b0xxx_0000, get xxx * 4
    immutable uint shift = (c & 0b0111_0000) >> 2;
    immutable ret = (mask >> shift) & 0xf;
    // Bits 4..6 alone cannot tell 0xF0..0xF7 (valid 4-byte lead) from
    // 0xF8..0xFF (never valid), so bit 3 is checked as well.
    if (ret == 0 || (c & 0b1111_1000) == 0b1111_1000)
        throw new UTFException("Invalid UTF-8 sequence", idx);
    return ret;
}
/**
 * Builds the packed lookup word: for every 3-bit value `n` in 0..7 the
 * result of `utfValue(n)` is stored in bits `4*n .. 4*n+3`.
 */
uint calcTable()
{
    uint table = 0;
    for (int slot = 0; slot < 8; ++slot) // 3 bits
    {
        immutable int entry = utfValue(slot);
        table |= entry << (slot * 4);
    }
    return table;
}
// UTF stride for 0b1xxx_.... where .... is anything and xxx is the 3-bit
// argument. Returns 0 for every value that is not listed below.
int utfValue(int val)
{
    switch (val)
    {
        case 0b100, 0b101:
            return 2;
        case 0b110:
            return 3;
        case 0b111:
            return 4;
        default:
            return 0; // wrong UTF-8 sequence
    }
}
// Feeds every possible byte value, as a one-byte buffer, to both stride and
// myStride and requires them to agree: either both throw UTFException or
// both return the same length.
unittest
{
    import std.string : format;

    foreach (uint ch; 0 .. 0x100)
    {
        char[1] buf;
        buf[0] = cast(char) ch;

        // size_t holds both return types (uint and size_t) without narrowing.
        size_t expected, actual;
        bool stdThrown = false, myThrown = false;

        try
        {
            expected = stride(buf, 0);
        }
        catch (UTFException e)
        {
            stdThrown = true;
        }

        try
        {
            actual = myStride(buf, 0);
        }
        catch (UTFException e)
        {
            myThrown = true;
        }

        // A result stays 0 when its call threw, so a one-sided throw also
        // shows up as a length mismatch.
        assert(stdThrown == myThrown && expected == actual,
            format("byte 0x%02X: stride=%s (threw=%s), myStride=%s (threw=%s)",
                ch, expected, stdThrown, actual, myThrown));
    }
}
import std.stdio, std.datetime, std.file, std.string;
/**
 * Walks `buffer` from the start, advancing by `fn(buffer, position)` each
 * step, and prints the name of `fn` followed by the elapsed microseconds.
 *
 * Params:
 *   fn     = callable taking the buffer and a position and returning the
 *            number of code units to skip
 *   buffer = bytes to walk
 *
 * Returns: the position at which the walk stopped, truncated to int.
 */
int bench(alias fn)(char[] buffer)
{
    // StopWatch lives in std.datetime.stopwatch on newer Phobos releases and
    // directly in std.datetime on older ones; pick whichever this compiler has.
    static if (__VERSION__ >= 2077)
        import std.datetime.stopwatch : StopWatch;
    else
        import std.datetime : StopWatch;

    // size_t matches both buffer.length and the index parameter of fn.
    size_t pos = 0;
    StopWatch sw;
    sw.start();
    while (pos < buffer.length)
        pos += fn(buffer, pos);
    sw.stop();

    static if (__VERSION__ >= 2077)
        immutable long elapsedUsecs = sw.peek().total!"usecs";
    else
        immutable long elapsedUsecs = sw.peek().usecs;

    // __traits(identifier) yields the bare symbol name without having to
    // search the textual form of the template for a parenthesis.
    writefln("%s %s ", __traits(identifier, fn), elapsedUsecs);
    return cast(int) pos;
}
/**
 * Entry point: reads the file named by the first argument and benchmarks
 * stride and myStride over its contents, once in each order.
 *
 * Returns: 1 when no file argument is given, 0 otherwise. A difference
 * between the two end positions is reported on stdout only.
 */
int main(string[] argv)
{
    if (argv.length < 2)
    {
        writeln("Usage: ./fast_stride <file>");
        return 1;
    }

    auto buffer = cast(char[]) read(argv[1]);
    int len, len2;

    // test one order
    len = bench!stride(buffer);
    len2 = bench!myStride(buffer);
    if (len != len2)
        writeln("*** MISMATCH 1 ***");

    // then the reverse order
    len = bench!myStride(buffer);
    len2 = bench!stride(buffer);
    if (len != len2)
        writeln("*** MISMATCH 2 ***");

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment