Last active
February 3, 2021 23:43
-
-
Save hypeartist/61d13aa996e6bc5316057235d807ed2c to your computer and use it in GitHub Desktop.
Generate a set of random UTF-8 characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Counters describing one run of the UTF-8 generator: how many characters of each
/// encoded length were written, how many UTF-16 code units they correspond to, and
/// how many bytes were produced in total.
/// </summary>
public struct GenStats
{
    // ---- characters grouped by encoded UTF-8 length ----

    /// <summary>Number of generated characters encoded in a single byte.</summary>
    public int OneByteCharCount;

    /// <summary>Number of generated characters encoded in two bytes.</summary>
    public int TwoBytesCharCount;

    /// <summary>Number of generated characters encoded in three bytes.</summary>
    public int ThreeBytesCharCount;

    /// <summary>Number of generated characters encoded in four bytes.</summary>
    public int FourBytesCharCount;

    // ---- characters grouped by UTF-16 length ----

    /// <summary>Number of generated code points not above 0xFFFF (counted as one char each).</summary>
    public int OneCharSeqCount;

    /// <summary>Number of generated code points above 0xFFFF (counted as two chars each).</summary>
    public int TwoCharsSeqCount;

    // ---- totals ----

    /// <summary>Total number of bytes written to the output buffer.</summary>
    public int Size;

    /// <summary>Total char count: one per code point up to 0xFFFF, two per code point above it.</summary>
    public int CharCount;
}
/// <summary>
/// Selects which encoded lengths the generator is allowed to emit.
/// Values are independent bits and may be combined.
/// </summary>
[Flags]
public enum GenVar
{
    /// <summary>Allow single-byte characters.</summary>
    OneByte = 1 << 0,

    /// <summary>Allow two-byte characters.</summary>
    TwoBytes = 1 << 1,

    /// <summary>Allow three-byte characters.</summary>
    ThreeBytes = 1 << 2,

    /// <summary>Allow four-byte characters.</summary>
    FourBytes = 1 << 3,

    /// <summary>Allow every encoded length.</summary>
    Any = OneByte | TwoBytes | ThreeBytes | FourBytes,
}
/// <summary>
/// Describes how many payload bits one kind of UTF-8 byte carries and the mask
/// that extracts them.
/// </summary>
private struct Utf8Def
{
    /// <summary>Bit mask ANDed with a byte to keep only its code point bits.</summary>
    public byte Mask;

    /// <summary>Number of code point bits stored in the byte.</summary>
    public int BitsStored;
}
/// <summary>
/// Backing storage for <see cref="UtfDefs"/>, allocated once.
/// Index 0 describes a continuation byte; index N (1..4) describes the lead byte
/// of an N-byte sequence.
/// </summary>
private static readonly Utf8Def[] UtfDefTable =
{
    new() { Mask = 0b00111111, BitsStored = 6 }, // [0] continuation byte  10xxxxxx
    new() { Mask = 0b01111111, BitsStored = 7 }, // [1] 1-byte lead        0xxxxxxx
    new() { Mask = 0b00011111, BitsStored = 5 }, // [2] 2-byte lead        110xxxxx
    new() { Mask = 0b00001111, BitsStored = 4 }, // [3] 3-byte lead        1110xxxx
    new() { Mask = 0b00000111, BitsStored = 3 }, // [4] 4-byte lead        11110xxx
};

/// <summary>
/// Read-only view over the UTF-8 byte descriptors.
/// </summary>
/// <remarks>
/// The span wraps a cached array. Returning <c>new Utf8Def[] { ... }</c> directly
/// from the getter would allocate a new array on every access, because the
/// compiler can only map array initializers onto static data for primitive
/// element types, not for user-defined structs.
/// </remarks>
private static ReadOnlySpan<Utf8Def> UtfDefs => UtfDefTable;
/// <summary>
/// Validates the single UTF-8 encoded character starting at <paramref name="s"/>.
/// </summary>
/// <param name="s">
/// Pointer to the lead byte. Depending on the lead byte, up to three following
/// bytes are read; no length is passed, so the caller must guarantee they are readable.
/// </param>
/// <returns>
/// <c>true</c> for a single byte below 0x80 or for a 2-, 3- or 4-byte sequence
/// that passes the checks below; otherwise <c>false</c>.
/// </returns>
private static bool CheckUtf8(byte* s)
{
    byte lead = s[0];

    // 0xxxxxxx: a single byte is always accepted.
    if (lead < 0x80)
    {
        return true;
    }

    // 110xxxxx 10xxxxxx
    if ((lead & 0xE0) == 0xC0)
    {
        if ((s[1] & 0xC0) != 0x80)
        {
            return false;
        }

        // Lead bytes 0xC0 and 0xC1 would be overlong encodings.
        return (lead & 0xFE) != 0xC0;
    }

    // 1110xxxx 10xxxxxx 10xxxxxx
    if ((lead & 0xF0) == 0xE0)
    {
        if ((s[1] & 0xC0) != 0x80)
        {
            return false;
        }

        if ((s[2] & 0xC0) != 0x80)
        {
            return false;
        }

        // E0 80..9F xx: overlong encoding.
        if (lead == 0xE0 && (s[1] & 0xE0) == 0x80)
        {
            return false;
        }

        // ED A0..BF xx: UTF-16 surrogate range.
        if (lead == 0xED && (s[1] & 0xE0) == 0xA0)
        {
            return false;
        }

        // EF BF BE / EF BF BF: U+FFFE and U+FFFF are rejected as well.
        if (lead == 0xEF && s[1] == 0xBF && (s[2] & 0xFE) == 0xBE)
        {
            return false;
        }

        return true;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if ((lead & 0xF8) == 0xF0)
    {
        if ((s[1] & 0xC0) != 0x80)
        {
            return false;
        }

        if ((s[2] & 0xC0) != 0x80)
        {
            return false;
        }

        if ((s[3] & 0xC0) != 0x80)
        {
            return false;
        }

        // F0 80..8F xx xx: overlong encoding.
        if (lead == 0xF0 && (s[1] & 0xF0) == 0x80)
        {
            return false;
        }

        // F4 90.. xx xx: above U+10FFFF.
        if (lead == 0xF4 && s[1] > 0x8F)
        {
            return false;
        }

        // Lead bytes F5..F7 are above U+10FFFF too.
        return lead <= 0xF4;
    }

    // Stray continuation byte or lead byte 0xF8 and above.
    return false;
}
/// <summary>
/// Writes <paramref name="count"/> randomly chosen UTF-8 encoded characters to
/// <paramref name="buf"/> and returns statistics about what was written.
/// </summary>
/// <param name="count">Number of characters to generate. Zero yields empty statistics.</param>
/// <param name="buf">
/// Destination buffer. Each character occupies one to four bytes, so the buffer must
/// hold up to <c>4 * count</c> bytes; no length is passed and none is checked here.
/// </param>
/// <param name="gv">Encoded lengths that may be emitted; at least one length flag must be set.</param>
/// <returns>Per-length character counts, UTF-16 char counts and the total byte size.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
/// <exception cref="ArgumentNullException"><paramref name="buf"/> is null while <paramref name="count"/> is positive.</exception>
/// <exception cref="ArgumentException"><paramref name="gv"/> enables no encoded length while <paramref name="count"/> is positive.</exception>
/// <exception cref="InvalidOperationException">A generated sequence failed the <c>CheckUtf8</c> self-check.</exception>
private static GenStats GenerateUtf8Impl(int count, byte* buf, GenVar gv)
{
    // A negative count would never reach zero by decrementing and would overrun the buffer.
    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), count, "Character count must not be negative.");
    }

    GenStats gs = default;
    if (count == 0)
    {
        return gs;
    }

    if (buf == null)
    {
        throw new ArgumentNullException(nameof(buf));
    }

    // With no length enabled, the rejection sampling below would spin forever.
    if ((gv & GenVar.Any) == 0)
    {
        throw new ArgumentException("At least one encoded length must be enabled.", nameof(gv));
    }

    // Read the descriptor table once instead of on every use inside the loop.
    var defs = UtfDefs;
    var continuation = defs[0];

    while (count != 0)
    {
        // Draw a length and retry when the caller did not enable it.
        var charByteCount = Random(1, 4);
        var lengthFlag = charByteCount switch
        {
            1 => GenVar.OneByte,
            2 => GenVar.TwoBytes,
            3 => GenVar.ThreeBytes,
            _ => GenVar.FourBytes,
        };
        if ((gv & lengthFlag) == 0)
        {
            continue;
        }

        switch (charByteCount)
        {
            case 1:
                buf[0] = (byte) Random(0x00, 0x7F);
                gs.OneByteCharCount++;
                break;

            case 2:
                // Lead bytes start at 0xC2: 0xC0 and 0xC1 would be overlong.
                buf[0] = (byte) Random(0xC2, 0xDF);
                buf[1] = (byte) Random(0x80, 0xBF);
                gs.TwoBytesCharCount++;
                break;

            case 3:
                buf[0] = (byte) Random(0xE0, 0xEF);
                buf[1] = buf[0] switch
                {
                    0xE0 => (byte) Random(0xA0, 0xBF), // below 0xA0 would be overlong
                    0xED => (byte) Random(0x80, 0x9F), // 0xA0 and above would be a surrogate
                    0xEF => (byte) Random(0x80, 0xBE), // stays clear of EF BF xx, part of which CheckUtf8 rejects
                    _ => (byte) Random(0x80, 0xBF)
                };
                buf[2] = (byte) Random(0x80, 0xBF);
                gs.ThreeBytesCharCount++;
                break;

            default:
                buf[0] = (byte) Random(0xF0, 0xF4);
                buf[1] = buf[0] switch
                {
                    0xF0 => (byte) Random(0x90, 0xBF), // below 0x90 would be overlong
                    0xF4 => (byte) Random(0x80, 0x8F), // above 0x8F would exceed U+10FFFF
                    _ => (byte) Random(0x80, 0xBF)
                };
                buf[2] = (byte) Random(0x80, 0xBF);
                buf[3] = (byte) Random(0x80, 0xBF);
                gs.FourBytesCharCount++;
                break;
        }

        // Self-check: the ranges above are meant to produce only sequences CheckUtf8 accepts.
        if (!CheckUtf8(buf))
        {
            throw new InvalidOperationException("Invalid");
        }

        // Decode the code point to decide how many UTF-16 chars it takes.
        var shift = continuation.BitsStored * (charByteCount - 1);
        var codepoint = (buf[0] & defs[charByteCount].Mask) << shift;
        for (var i = 1; i < charByteCount; ++i)
        {
            shift -= continuation.BitsStored;
            codepoint |= (buf[i] & continuation.Mask) << shift;
        }

        count--;
        if (codepoint <= 0xFFFF)
        {
            gs.OneCharSeqCount++;
            gs.CharCount++;
        }
        else
        {
            // Above 0xFFFF a code point needs a surrogate pair, i.e. two chars.
            gs.TwoCharsSeqCount++;
            gs.CharCount += 2;
        }

        gs.Size += charByteCount;
        buf += charByteCount;
    }

    return gs;
}
/// <summary>
/// Process-wide pseudo-random source used by <see cref="Random(int, int)"/>.
/// </summary>
/// <remarks>
/// NOTE(review): <see cref="System.Random"/> instances are not thread-safe; this
/// assumes the generator is only used from one thread at a time — confirm against callers.
/// </remarks>
private static readonly Random R = new();

/// <summary>
/// Returns a pseudo-random integer in the inclusive range
/// [<paramref name="x1"/>, <paramref name="x2"/>].
/// </summary>
/// <param name="x1">Inclusive lower bound.</param>
/// <param name="x2">Inclusive upper bound; must be below <see cref="int.MaxValue"/>.</param>
/// <returns>A value <c>v</c> with <c>x1 &lt;= v &lt;= x2</c>.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="x1"/> is greater than <c>x2 + 1</c>.</exception>
/// <remarks>
/// Delegates to <see cref="System.Random.Next(int, int)"/>. The earlier hand-rolled
/// scaling of a 15-bit sample, <c>(r * span) &gt;&gt; 15</c>, overflowed <c>int</c> for
/// wide ranges and could return values outside the requested bounds.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Random(int x1, int x2)
{
    // Random.Next treats its upper bound as exclusive, hence the + 1.
    return R.Next(x1, x2 + 1);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment