Created
May 26, 2013 20:31
-
-
Save DmitryOlshansky/5653927 to your computer and use it in GitHub Desktop.
Experimental UTF-8 stride calculation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import std.traits, std.utf : UTFException; | |
/**
 * Reference implementation: number of UTF-8 code units occupied by the
 * sequence whose first code unit is `str[index]`.
 *
 * Code units below 0x80 are answered inline with 1; any other value is handed
 * to `strideImpl` together with `index`.
 *
 * Params:
 *     str   = a `char` array or a random-access range of `char`
 *     index = position of the first code unit of the sequence
 *
 * Returns: 1 for a code unit below 0x80, otherwise whatever `strideImpl`
 *          returns for that code unit.
 *
 * NOTE(review): `isRandomAccessRange` and `ElementType` are not provided by
 * `std.traits`; presumably a `std.range` import is expected - confirm.
 */
uint stride(S)(auto ref S str, size_t index)
    if (is(S : const char[]) ||
        (isRandomAccessRange!S && is(Unqual!(ElementType!S) == char)))
{
    immutable lead = str[index];
    return lead < 0x80 ? 1 : strideImpl(lead, index);
}
/**
 * Slow path of `stride`: derives the sequence length from a lead code unit
 * that has its top bit set.
 *
 * The length equals the number of leading 1 bits in `c`, obtained by locating
 * the highest 0 bit (the highest 1 bit of the complement) with `bsr`.
 *
 * Params:
 *     c     = first code unit of the sequence; must have bit 7 set
 *     index = position of `c` in the source, used only for the error report
 *
 * Returns: 2, 3 or 4.
 *
 * Throws: UTFException when `c` is a continuation byte (10xxxxxx) or
 *         announces more than four code units (0xF8 .. 0xFF), which RFC 3629
 *         does not permit.
 */
private uint strideImpl(char c, size_t index)
{
    import core.bitop : bsr;

    // Precondition kept as a plain assert so the function does not depend on
    // the `body` / `do` contract keyword spelling.
    assert(c & 0x80);

    // 0xFF has no zero bit at all: its complement is 0 and bsr(0) is undefined.
    if (c == 0xFF)
        throw new UTFException("Invalid UTF-8 sequence", index);

    // Widen explicitly and mask back to 8 bits. Under integer promotion `~c`
    // is a 32-bit value with bits 8..31 set, which would make bsr report 31.
    immutable uint inverted = (~cast(uint) c) & 0xFF;
    immutable msbs = 7 - bsr(inverted);

    if (msbs < 2 || msbs > 4)
        throw new UTFException("Invalid UTF-8 sequence", index);
    return msbs;
}
/**
 * Experimental counterpart of `stride`: number of UTF-8 code units occupied
 * by the sequence whose first code unit is `src[idx]`.
 *
 * Code units below 0x80 yield 1 directly; everything else is resolved by
 * `myStrideImpl`.
 *
 * Params:
 *     src = a `char` array or a random-access range of `char`
 *     idx = position of the first code unit of the sequence
 *
 * Returns: 1 for a code unit below 0x80, otherwise the value produced by
 *          `myStrideImpl`.
 */
size_t myStride(S)(auto ref S src, size_t idx)
    if (is(S : const char[]) ||
        (isRandomAccessRange!S && is(Unqual!(ElementType!S) == char)))
{
    immutable first = src[idx];
    if (first >= 0x80)
        return myStrideImpl(first, idx);
    return 1;
}
/**
 * Slow path of `myStride`: looks the sequence length up in a 32-bit table
 * built at compile time by `calcTable`.
 *
 * Bits 6..4 of the lead code unit select one 4-bit entry of the table; an
 * entry of 0 marks an invalid lead.
 *
 * Params:
 *     c   = first code unit of the sequence; must have bit 7 set
 *     idx = position of `c` in the source, used only for the error report
 *
 * Returns: the sequence length stored in the table (non-zero).
 *
 * Throws: UTFException when the table entry is 0, or when `c` is in
 *         0xF8 .. 0xFF.
 */
size_t myStrideImpl(size_t c, size_t idx)
{
    // Precondition kept as a plain assert so the function does not depend on
    // the `body` / `do` contract keyword spelling.
    assert(c & 0x80);

    enum uint table = calcTable();

    // Pick bits 0b0xxx_0000 and scale to xxx * 4, the bit offset of the entry.
    immutable uint shift = (c & 0b0111_0000) >> 2;
    immutable len = (table >> shift) & 0xF;

    // The 3-bit index cannot tell 0xF8..0xFF apart from the valid leads
    // 0xF0..0xF7, so bit 3 is checked separately: 0b1111_1xxx never starts a
    // valid UTF-8 sequence.
    if (len == 0 || (c & 0xF8) == 0xF8)
        throw new UTFException("Invalid UTF-8 sequence", idx);
    return len;
}
/**
 * Builds the packed lookup table used by `myStrideImpl`.
 *
 * For every 3-bit value k in 0 .. 8 the result of `utfValue(k)` is stored in
 * the four bits starting at bit 4 * k.
 *
 * Returns: the eight 4-bit entries combined into one `uint`.
 */
uint calcTable()
{
    uint packed = 0;
    for (int prefix = 0; prefix < 8; ++prefix) // 3 bits
    {
        immutable int entry = utfValue(prefix);
        packed |= entry << (prefix * 4);
    }
    return packed;
}
/**
 * UTF stride for a lead code unit of the form 0b1xxx_...., where `....` is
 * anything and `val` is the 3-bit value xxx.
 *
 * Returns: 2 for 0b100 and 0b101, 3 for 0b110, 4 for 0b111, and 0 (wrong
 *          UTF-8 sequence) for every other argument.
 */
int utfValue(int val)
{
    if (val == 0b100 || val == 0b101)
        return 2;
    if (val == 0b110)
        return 3;
    if (val == 0b111)
        return 4;
    return 0; // wrong UTF-8 sequence
}
// Cross-checks `stride` against `myStride` for every possible first code unit:
// for each byte value either both must throw UTFException or both must report
// the same length.
unittest
{
    import std.string : format;

    foreach (uint ch; 0 .. 0x100)
    {
        char[1] buf;
        buf[0] = cast(char) ch; // explicit narrowing: ch is a uint

        // size_t holds both return types (uint and size_t) without narrowing.
        size_t refLen, myLen;
        bool refThrown = false, myThrown = false;

        try
        {
            refLen = stride(buf, 0);
        }
        catch (UTFException)
        {
            refThrown = true;
        }

        try
        {
            myLen = myStride(buf, 0);
        }
        catch (UTFException)
        {
            myThrown = true;
        }

        // A length that was never assigned stays 0, so two throwing calls
        // compare equal here as well.
        assert(refThrown == myThrown && refLen == myLen,
               format("byte 0x%02X: stride -> %s (threw: %s), myStride -> %s (threw: %s)",
                      ch, refLen, refThrown, myLen, myThrown));
    }
}
| import std.stdio, std.datetime, std.file, std.string; | |
/**
 * Walks `buffer` from the start, advancing by `fn(buffer, position)` each
 * step, and prints the name of `fn` followed by the elapsed microseconds.
 *
 * Params:
 *     fn     = stride function under test, called as `fn(buffer, position)`
 *     buffer = text to walk over
 *
 * Returns: the final position (the first value that is not below
 *          `buffer.length`), converted to `int`.
 *
 * NOTE(review): `StopWatch` and `peek().usecs` are used exactly as the file's
 * `std.datetime` import expects; newer Phobos releases moved the stopwatch to
 * `std.datetime.stopwatch` - confirm against the targeted compiler.
 */
int bench(alias fn)(char[] buffer)
{
    // Ask the compiler for the symbol name instead of slicing `fn.stringof`
    // at the first "(", which yields an invalid slice if no "(" is present.
    enum label = __traits(identifier, fn);

    // The position is a size_t so that it is compared with and added to
    // values of its own type; an int would wrap past int.max.
    size_t pos = 0;

    StopWatch sw;
    sw.start();
    while (pos < buffer.length)
        pos += fn(buffer, pos);
    sw.stop();

    writefln("%s %s ", label, sw.peek().usecs);
    return cast(int) pos;
}
/**
 * Entry point: reads the file named by the first command-line argument and
 * benchmarks `stride` and `myStride` over its contents, once in each order.
 *
 * After each pair of runs the two final positions are compared and a
 * mismatch banner is printed when they differ.
 *
 * Returns: 1 when no file argument was given, otherwise 0.
 */
int main(string[] argv) // D-style array type; the C-style `string argv[]` is no longer accepted
{
    if (argv.length < 2)
    {
        writeln("Usage: ./fast_stride <file>");
        return 1;
    }

    auto buffer = cast(char[]) read(argv[1]);

    // First order: reference implementation, then the experimental one.
    immutable int refFirst = bench!stride(buffer);
    immutable int myAfter = bench!myStride(buffer);
    if (refFirst != myAfter)
        writeln("*** MISMATCH 1 ***");

    // Reverse order: experimental implementation, then the reference one.
    immutable int myFirst = bench!myStride(buffer);
    immutable int refAfter = bench!stride(buffer);
    if (myFirst != refAfter)
        writeln("*** MISMATCH 2 ***");

    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment