Skip to content

Instantly share code, notes, and snippets.

@hypeartist
Last active February 3, 2021 23:43
Show Gist options
  • Save hypeartist/61d13aa996e6bc5316057235d807ed2c to your computer and use it in GitHub Desktop.
Save hypeartist/61d13aa996e6bc5316057235d807ed2c to your computer and use it in GitHub Desktop.
Generate a set of random UTF-8 characters
/// <summary>
/// Counters describing one batch of generated UTF-8 data, as filled in by <c>GenerateUtf8Impl</c>.
/// </summary>
public struct GenStats
{
/// <summary>Number of generated sequences that are one byte long.</summary>
public int OneByteCharCount;
/// <summary>Number of generated sequences that are two bytes long.</summary>
public int TwoBytesCharCount;
/// <summary>Number of generated sequences that are three bytes long.</summary>
public int ThreeBytesCharCount;
/// <summary>Number of generated sequences that are four bytes long.</summary>
public int FourBytesCharCount;
/// <summary>Number of generated code points at or below 0xFFFF (each adds 1 to <see cref="CharCount"/>).</summary>
public int OneCharSeqCount;
/// <summary>Number of generated code points above 0xFFFF (each adds 2 to <see cref="CharCount"/>).</summary>
public int TwoCharsSeqCount;
/// <summary>Total number of bytes written, i.e. the sum of the byte lengths of all generated sequences.</summary>
public int Size;
/// <summary>
/// Total char count: 1 per code point at or below 0xFFFF, 2 per code point above it
/// (presumably the number of UTF-16 code units needed to hold the same text).
/// </summary>
public int CharCount;
}
/// <summary>
/// Bit flags selecting which UTF-8 sequence lengths the generator is allowed to emit.
/// Values can be combined; <see cref="Any"/> enables every length.
/// </summary>
[Flags]
public enum GenVar
{
    /// <summary>Allow one-byte sequences.</summary>
    OneByte = 1 << 0,

    /// <summary>Allow two-byte sequences.</summary>
    TwoBytes = 1 << 1,

    /// <summary>Allow three-byte sequences.</summary>
    ThreeBytes = 1 << 2,

    /// <summary>Allow four-byte sequences.</summary>
    FourBytes = 1 << 3,

    /// <summary>Allow sequences of every length (one to four bytes).</summary>
    Any = OneByte | TwoBytes | ThreeBytes | FourBytes
}
/// <summary>
/// Describes the payload part of one kind of UTF-8 byte: which bits carry code point data
/// and how many of them there are.
/// </summary>
private struct Utf8Def
{
/// <summary>Bit mask that, ANDed with the byte, isolates its payload bits.</summary>
public byte Mask;
/// <summary>Number of payload bits selected by <see cref="Mask"/>.</summary>
public int BitsStored;
}
// Backing storage for UtfDefs, created exactly once.
// Returning `new Utf8Def[] { ... }` straight from the property would allocate a fresh array on
// every access: the compiler only maps span initializers onto static data for primitive
// element types, not for user-defined structs such as Utf8Def.
private static readonly Utf8Def[] UtfDefTable =
{
    new() { Mask = 0b00111111, BitsStored = 6 }, // [0] continuation byte   10xxxxxx
    new() { Mask = 0b01111111, BitsStored = 7 }, // [1] lead of 1-byte form 0xxxxxxx
    new() { Mask = 0b00011111, BitsStored = 5 }, // [2] lead of 2-byte form 110xxxxx
    new() { Mask = 0b00001111, BitsStored = 4 }, // [3] lead of 3-byte form 1110xxxx
    new() { Mask = 0b00000111, BitsStored = 3 }, // [4] lead of 4-byte form 11110xxx
};

/// <summary>
/// Payload masks and bit counts for UTF-8 bytes. Index 0 describes a continuation byte;
/// index <c>n</c> (1 to 4) describes the lead byte of an <c>n</c>-byte sequence.
/// </summary>
private static ReadOnlySpan<Utf8Def> UtfDefs => UtfDefTable;
/// <summary>
/// Checks whether the bytes starting at <paramref name="s"/> form one acceptable UTF-8 sequence.
/// </summary>
/// <param name="s">
/// Pointer to the lead byte. Depending on the lead byte, up to three following bytes are read;
/// no bounds are checked here, so the caller must guarantee they are readable.
/// </param>
/// <returns>
/// <c>true</c> for a one- to four-byte sequence with correct continuation bytes that is not an
/// overlong form, not in the surrogate range (ED A0..BF xx), not EF BF BE / EF BF BF, and not
/// above F4 8F xx xx; otherwise <c>false</c>.
/// </returns>
private static bool CheckUtf8(byte* s)
{
    // A continuation byte has the bit pattern 10xxxxxx.
    static bool IsContinuation(byte b) => (b & 0xc0) == 0x80;

    var lead = s[0];

    // 0xxxxxxx: single-byte sequence, always acceptable.
    if (lead < 0x80)
    {
        return true;
    }

    // 110xxxxx: two-byte sequence.
    if ((lead & 0xe0) == 0xc0)
    {
        if (!IsContinuation(s[1]))
        {
            return false;
        }

        // Lead bytes C0 and C1 can only produce overlong encodings.
        return (lead & 0xfe) != 0xc0;
    }

    // 1110xxxx: three-byte sequence.
    if ((lead & 0xf0) == 0xe0)
    {
        if (!IsContinuation(s[1]) || !IsContinuation(s[2]))
        {
            return false;
        }

        // E0 80..9F xx is an overlong encoding.
        if (lead == 0xe0 && (s[1] & 0xe0) == 0x80)
        {
            return false;
        }

        // ED A0..BF xx falls into the surrogate range.
        if (lead == 0xed && (s[1] & 0xe0) == 0xa0)
        {
            return false;
        }

        // EF BF BE and EF BF BF are rejected as well.
        if (lead == 0xef && s[1] == 0xbf && (s[2] & 0xfe) == 0xbe)
        {
            return false;
        }

        return true;
    }

    // 11110xxx: four-byte sequence.
    if ((lead & 0xf8) == 0xf0)
    {
        if (!IsContinuation(s[1]) || !IsContinuation(s[2]) || !IsContinuation(s[3]))
        {
            return false;
        }

        // F0 80..8F xx xx is an overlong encoding.
        if (lead == 0xf0 && (s[1] & 0xf0) == 0x80)
        {
            return false;
        }

        // F4 90.. and anything with a lead byte above F4 is beyond the accepted maximum.
        if (lead == 0xf4 && s[1] > 0x8f)
        {
            return false;
        }

        return lead <= 0xf4;
    }

    // A stray continuation byte or a lead byte of F8 and above.
    return false;
}
/// <summary>
/// Writes <paramref name="count"/> randomly chosen UTF-8 sequences back to back into
/// <paramref name="buf"/> and returns counters describing what was written.
/// </summary>
/// <param name="count">Number of sequences (code points) to generate. Must not be negative.</param>
/// <param name="buf">
/// Destination buffer. Its size cannot be checked here; the caller must provide room for every
/// generated byte (at most 4 bytes per sequence).
/// </param>
/// <param name="gv">Sequence lengths that may be generated; at least one length flag must be set.</param>
/// <returns>Statistics for the generated data; all zero when <paramref name="count"/> is 0.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
/// <exception cref="ArgumentNullException"><paramref name="buf"/> is null while <paramref name="count"/> is positive.</exception>
/// <exception cref="ArgumentException"><paramref name="gv"/> enables no sequence length while <paramref name="count"/> is positive.</exception>
/// <exception cref="InvalidOperationException">A generated sequence is rejected by <see cref="CheckUtf8"/>.</exception>
private static GenStats GenerateUtf8Impl(int count, byte* buf, GenVar gv)
{
    // A negative count used to keep the loop running (and writing past the buffer) until the
    // counter wrapped around; reject it up front instead.
    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), count, "The number of sequences to generate must not be negative.");
    }

    GenStats gs = default;
    if (count == 0)
    {
        return gs;
    }

    if (buf == null)
    {
        throw new ArgumentNullException(nameof(buf));
    }

    // Collect the permitted byte lengths once. Picking uniformly from this list yields the same
    // distribution as drawing 1..4 and retrying on a disallowed length, but it cannot spin
    // forever when no length flag is set.
    int* allowedLengths = stackalloc int[4];
    var allowedCount = 0;
    if ((gv & GenVar.OneByte) != 0)
    {
        allowedLengths[allowedCount++] = 1;
    }
    if ((gv & GenVar.TwoBytes) != 0)
    {
        allowedLengths[allowedCount++] = 2;
    }
    if ((gv & GenVar.ThreeBytes) != 0)
    {
        allowedLengths[allowedCount++] = 3;
    }
    if ((gv & GenVar.FourBytes) != 0)
    {
        allowedLengths[allowedCount++] = 4;
    }
    if (allowedCount == 0)
    {
        throw new ArgumentException("At least one sequence length must be enabled.", nameof(gv));
    }

    // Read the definition table once instead of on every use inside the loop.
    var defs = UtfDefs;
    var continuationBits = defs[0].BitsStored;
    var continuationMask = defs[0].Mask;

    for (; count > 0; count--)
    {
        var charByteCount = allowedLengths[Random(0, allowedCount - 1)];

        switch (charByteCount)
        {
            case 1:
                buf[0] = (byte) Random(0x00, 0x7F);
                gs.OneByteCharCount++;
                break;

            case 2:
                // C0 and C1 are skipped: they would only produce overlong encodings.
                buf[0] = (byte) Random(0xC2, 0xDF);
                buf[1] = (byte) Random(0x80, 0xBF);
                gs.TwoBytesCharCount++;
                break;

            case 3:
                buf[0] = (byte) Random(0xE0, 0xEF);
                buf[1] = buf[0] switch
                {
                    0xE0 => (byte) Random(0xA0, 0xBF), // avoid overlong forms
                    0xED => (byte) Random(0x80, 0x9F), // avoid the surrogate range
                    // NOTE(review): excluding BF here is stricter than CheckUtf8 requires
                    // (it only rejects EF BF BE / EF BF BF); kept as in the original.
                    0xEF => (byte) Random(0x80, 0xBE),
                    _ => (byte) Random(0x80, 0xBF)
                };
                buf[2] = (byte) Random(0x80, 0xBF);
                gs.ThreeBytesCharCount++;
                break;

            default:
                buf[0] = (byte) Random(0xF0, 0xF4);
                buf[1] = buf[0] switch
                {
                    0xF0 => (byte) Random(0x90, 0xBF), // avoid overlong forms
                    0xF4 => (byte) Random(0x80, 0x8F), // stay at or below F4 8F xx xx
                    _ => (byte) Random(0x80, 0xBF)
                };
                buf[2] = (byte) Random(0x80, 0xBF);
                buf[3] = (byte) Random(0x80, 0xBF);
                gs.FourBytesCharCount++;
                break;
        }

        // Sanity check: everything produced above is expected to pass the validator.
        if (!CheckUtf8(buf))
        {
            throw new InvalidOperationException("Invalid");
        }

        // Decode the sequence: payload bits of the lead byte first, then each continuation byte.
        var shift = continuationBits * (charByteCount - 1);
        var codepoint = (buf[0] & defs[charByteCount].Mask) << shift;
        for (var i = 1; i < charByteCount; ++i)
        {
            shift -= continuationBits;
            codepoint |= (buf[i] & continuationMask) << shift;
        }

        // Code points above 0xFFFF are counted as two chars, everything else as one.
        if (codepoint <= 0xFFFF)
        {
            gs.OneCharSeqCount++;
            gs.CharCount++;
        }
        else
        {
            gs.TwoCharsSeqCount++;
            gs.CharCount += 2;
        }

        gs.Size += charByteCount;
        buf += charByteCount;
    }

    return gs;
}
// Shared pseudo-random source for the generator.
// NOTE(review): System.Random instances are not documented as thread-safe; this assumes
// single-threaded use - confirm against callers.
private static readonly Random R = new();

/// <summary>
/// Returns a pseudo-random integer in the inclusive range [<paramref name="x1"/>, <paramref name="x2"/>].
/// </summary>
/// <param name="x1">Inclusive lower bound.</param>
/// <param name="x2">Inclusive upper bound; must not be less than <paramref name="x1"/>.</param>
/// <returns>A value <c>v</c> with <c>x1 &lt;= v &lt;= x2</c>.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="x2"/> is less than <paramref name="x1"/>.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Random(int x1, int x2)
{
    if (x2 < x1)
    {
        throw new ArgumentOutOfRangeException(nameof(x2), x2, "The upper bound must not be less than the lower bound.");
    }

    // 64-bit arithmetic: the range size can be as large as 2^32 and the sample is below 2^31,
    // so the product stays below 2^63. The previous 32-bit multiplication overflowed for
    // ranges wider than roughly 65,000 values.
    var rangeSize = (long) x2 - x1 + 1;
    long sample = R.Next(); // uniformly distributed in [0, int.MaxValue)

    // Scale the 31-bit sample down to [0, rangeSize - 1] and shift it to start at x1.
    return (int) (x1 + ((sample * rangeSize) >> 31));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment