Created
June 25, 2024 20:38
-
-
Save GrabYourPitchforks/a3b1ec234fee3ffb4cf68ffa8e51904f to your computer and use it in GitHub Desktop.
UTF8 char count testing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Buffers; | |
using System.Diagnostics; | |
using System.IO; | |
using System.Runtime.InteropServices; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.X86; | |
using System.Text; | |
using System.Text.Unicode; | |
using BenchmarkDotNet.Attributes; | |
using BenchmarkDotNet.Jobs; | |
using BenchmarkDotNet.Running; | |
// Program entry point (top-level statement): asks BenchmarkDotNet's BenchmarkRunner to run the benchmarks declared on Utf8Experimental.
BenchmarkRunner.Run<Utf8Experimental>(); | |
/// <summary>
/// Benchmarks that compare the built-in <see cref="Encoding.UTF8"/> char counting / decoding
/// against an experimental AVX2 routine that counts UTF-16 code units by classifying bytes,
/// under the assumption that the input is well-formed UTF-8.
/// </summary>
[SimpleJob(RuntimeMoniker.Net80)]
public unsafe class Utf8Experimental
{
    // Populated by Setup(); starts out empty so the benchmark methods are safe to call
    // even if setup has not run.
    private byte[] _data = Array.Empty<byte>();

    /// <summary>Loads the benchmark corpus ("twitter.json", relative to the working directory).</summary>
    [GlobalSetup]
    public void Setup()
    {
        _data = File.ReadAllBytes("twitter.json");
    }

    /// <summary>Baseline: UTF-16 char count as reported by <see cref="Encoding.GetCharCount(byte[])"/>.</summary>
    [Benchmark(Baseline = true)]
    public nuint CountChars_Net8()
    {
        return (uint)Encoding.UTF8.GetCharCount(_data);
    }

    /// <summary>Decodes the corpus with <see cref="Encoding.GetString(byte[])"/>.</summary>
    // Currently disabled as a benchmark (attribute left commented out, as in the original):
    // [Benchmark(Baseline = true)]
    public string GetString_Net8()
    {
        return Encoding.UTF8.GetString(_data);
    }

    /// <summary>UTF-16 char count as computed by the experimental byte-classification routine.</summary>
    [Benchmark]
    public nuint CountChars_Mod_B()
    {
        return CountChars_AssumeValid_Entry_B(_data);
    }

    /// <summary>
    /// Decodes the corpus by first predicting the string length with the experimental counter,
    /// then transcoding straight into the new string's buffer. Falls back to
    /// <see cref="Encoding.GetString(byte[])"/> if the prediction does not pan out.
    /// </summary>
    // Currently disabled as a benchmark (attribute left commented out, as in the original):
    // [Benchmark]
    public string GetString_Mod_B()
    {
        nuint stringLength = CountChars_AssumeValid_Entry_B(_data);

        // Well-formed UTF-8 never produces more UTF-16 code units than it has bytes.
        // This check also guarantees that the (int) cast below cannot overflow.
        if (stringLength <= (uint)_data.Length)
        {
            // The callback must be static (no captures), so the result flag is smuggled in
            // through the state tuple as a pointer to this stack local. This is safe because
            // string.Create invokes the callback synchronously, before it returns.
            bool success = false;
            string str = string.Create((int)stringLength, (data: _data, successPtr: (IntPtr)(&success)), static (chars, state) =>
            {
                var opStatus = Utf8.ToUtf16(state.data, chars, out int bytesRead, out int charsWritten, replaceInvalidSequences: false);

                // Only accept the result if every input byte was consumed and the
                // destination was filled exactly.
                *((bool*)state.successPtr) = (opStatus == OperationStatus.Done) && (bytesRead == state.data.Length) && (charsWritten == chars.Length);
            });

            if (success)
            {
                return str;
            }
        }

        // something went wrong - fall back to old logic instead
        return Encoding.UTF8.GetString(_data);
    }

    /// <summary>
    /// Counts the UTF-16 code units that <paramref name="buffer"/> would decode to,
    /// assuming it contains well-formed UTF-8. The result is unspecified for malformed input.
    /// </summary>
    /// <param name="buffer">UTF-8 bytes to inspect; may be empty.</param>
    /// <returns>The number of UTF-16 code units (0 for an empty buffer).</returns>
    private static nuint CountChars_AssumeValid_Entry_B(ReadOnlySpan<byte> buffer)
    {
        if (buffer.IsEmpty)
        {
            return 0;
        }

        if (Avx2.IsSupported && Popcnt.IsSupported)
        {
            fixed (byte* pData = &MemoryMarshal.GetReference(buffer))
            {
                return CountChars_AssumeValid_Avx2_B(pData, (uint)buffer.Length);
            }
        }

        // Scalar fallback for hardware without AVX2/POPCNT. Same rule as the vector path:
        // every non-continuation byte contributes one char, and every byte in [F0..FF]
        // contributes one more (surrogate pair).
        nuint count = 0;
        foreach (byte b in buffer)
        {
            if ((b & 0xC0) != 0x80)
            {
                count++;
            }

            if (b >= 0xF0)
            {
                count++;
            }
        }

        return count;
    }

    /// <summary>
    /// AVX2 implementation of the char counter. Processes the buffer in 32-byte aligned
    /// stripes; the first and last stripes may extend outside the buffer, and the bits for
    /// those out-of-range bytes are masked off before counting.
    /// </summary>
    /// <param name="pbData">Pointer to the first byte; must not be null.</param>
    /// <param name="cbData">Number of bytes; must be greater than zero.</param>
    /// <returns>The number of UTF-16 code units, assuming well-formed UTF-8 input.</returns>
    private static nuint CountChars_AssumeValid_Avx2_B(byte* pbData, nuint cbData)
    {
        Debug.Assert(pbData != null);
        Debug.Assert(cbData > 0);
        Debug.Assert(Avx2.IsSupported);
        Debug.Assert(Popcnt.IsSupported);

        // General logic: all bytes in the ranges [00..7F] and [C0..EF] should
        // result in 1 char being generated, since they all correspond to ASCII
        // chars or lead bytes corresponding to BMP chars. All bytes in the
        // range [F0..FF] should result in 2 chars being generated since they
        // are lead bytes corresponding to the astral planes.

        // Signed compare "byte > 0xBF" is true for [C0..FF] and [00..7F], i.e. for every
        // byte that is not a continuation byte [80..BF].
        Vector256<sbyte> vecBF = Vector256.Create((byte)0xBF).AsSByte();

        // Unsigned saturating "byte - 0x70" has its high bit set only for bytes >= 0xF0.
        Vector256<byte> vec70 = Vector256.Create((byte)0x70);

        nuint stripeOffsetMask = (nuint)(Vector256<byte>.Count - 1);
        byte* pJustPastEndOfData = pbData + cbData; // can't overflow due to memory addressing rules

        // Aligned stripe containing the first byte, and aligned stripe containing the LAST
        // valid byte. Using the last byte (not the one-past-the-end address) matters when the
        // buffer ends exactly on a stripe boundary: otherwise a whole extra stripe past the
        // buffer would be loaded and counted.
        byte* pStripe = (byte*)((nuint)pbData & ~stripeOffsetMask);
        byte* pLastStripe = (byte*)((nuint)(pJustPastEndOfData - 1) & ~stripeOffsetMask);

        // Bytes before the buffer in the first stripe / after the buffer in the last stripe.
        // Both values are in [0, 31], so the shifts below are well-defined.
        int numPrefixBytesToDiscard = (int)((nuint)pbData & stripeOffsetMask);
        int numSuffixBytesToDiscard = (int)(pLastStripe + Vector256<byte>.Count - pJustPastEndOfData); // could be 0
        uint startMask = uint.MaxValue << numPrefixBytesToDiscard;
        uint finalMask = uint.MaxValue >> numSuffixBytesToDiscard;

        nuint cumulativeUtf16Chars = 0;

        // First stripe: this may read data before the start of the buffer, which is discarded
        // via startMask. An aligned 32-byte load never straddles a page boundary, so the
        // out-of-range bytes live on a page that also holds valid data.
        Vector256<byte> thisStripe = Avx2.LoadAlignedVector256(pStripe);
        uint nonContinuationBytesBitmap = (uint)Avx2.MoveMask(Avx2.CompareGreaterThan(thisStripe.AsSByte(), vecBF));
        uint astralLeadBytesBitmap = (uint)Avx2.MoveMask(Avx2.SubtractSaturate(thisStripe, vec70));

        if (pStripe == pLastStripe)
        {
            // The whole buffer fits in a single stripe; trim the tail as well.
            nonContinuationBytesBitmap &= finalMask;
            astralLeadBytesBitmap &= finalMask;
        }

        nonContinuationBytesBitmap &= startMask;
        astralLeadBytesBitmap &= startMask;
        cumulativeUtf16Chars += Popcnt.PopCount(nonContinuationBytesBitmap);
        cumulativeUtf16Chars += Popcnt.PopCount(astralLeadBytesBitmap);

        // Now read the rest of the data in a loop.
        while (pStripe < pLastStripe)
        {
            pStripe += Vector256<byte>.Count;
            thisStripe = Avx2.LoadAlignedVector256(pStripe);
            nonContinuationBytesBitmap = (uint)Avx2.MoveMask(Avx2.CompareGreaterThan(thisStripe.AsSByte(), vecBF));
            astralLeadBytesBitmap = (uint)Avx2.MoveMask(Avx2.SubtractSaturate(thisStripe, vec70));

            // The last stripe may extend past the end of the buffer; discard the extra bytes.
            if (pStripe == pLastStripe)
            {
                nonContinuationBytesBitmap &= finalMask;
                astralLeadBytesBitmap &= finalMask;
            }

            cumulativeUtf16Chars += Popcnt.PopCount(nonContinuationBytesBitmap);
            cumulativeUtf16Chars += Popcnt.PopCount(astralLeadBytesBitmap);
        }

        return cumulativeUtf16Chars;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment