Skip to content

Instantly share code, notes, and snippets.

@EgorBo
Last active December 27, 2021 20:16
Show Gist options
  • Save EgorBo/3bf1863a27bfe105054bf9f2cd6c70ef to your computer and use it in GitHub Desktop.
Save EgorBo/3bf1863a27bfe105054bf9f2cd6c70ef to your computer and use it in GitHub Desktop.
FasterHttpHeadersParse.cs
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Text;
public class Prog
{
private const byte ByteCR = (byte)'\r';
private const byte ByteLF = (byte)'\n';
private const byte ByteColon = (byte)':';
private const byte ByteSpace = (byte)' ';
private const byte ByteTab = (byte)'\t';
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> LoadVector128(ref byte start, nuint offset)
=> Unsafe.ReadUnaligned<Vector128<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
/// <summary>
/// </summary>
/// <param name="handler">to report final key and value</param>
/// <param name="span">data to process</param>
/// <returns>position of CRLF, -1 in case of invalid header</returns>
private static int TryTakeSingleHeaderFast(TRequestHandler handler, ReadOnlySpan<byte> span)
{
Debug.Assert(span.Length >= 16 && Sse2.IsSupported);
// Rules:
// 1) ends with CRLF, doesn't contain CR or LF inside
// 2) key is not empty, should not contain ' ' and '\t'
// 3) value can be empty
// 4) there can be multiple ' ' and '\t' around value, e.g.:
// "key: \t\t value\t \r\n"
// and we must trim them when reporting key and value to handler
ref byte searchSpace = ref MemoryMarshal.GetReference(span);
nuint length = (nuint)span.Length;
Vector128<byte> vectorCR = Vector128.Create(ByteCR);
Vector128<byte> vectorLF = Vector128.Create(ByteLF);
Vector128<byte> vectorColon = Vector128.Create(ByteColon);
Vector128<byte> vectorSpace = Vector128.Create(ByteSpace);
Vector128<byte> vectorTab = Vector128.Create(ByteTab);
nuint colonPosition = 0;
nuint valueStarts = 0;
nuint offset = 0;
nuint finalLength = 0;
NEXT:
Vector128<byte> vector = LoadVector128(ref searchSpace, offset);
int colonIndex = Sse2.MoveMask(Sse2.CompareEqual(vector, vectorColon));
Vector128<byte> cmpCR = Sse2.CompareEqual(vector, vectorCR);
Vector128<byte> cmpLF = Sse2.CompareEqual(vector, vectorLF);
Vector128<byte> cmpSpace = Sse2.CompareEqual(vector, vectorSpace);
Vector128<byte> cmpTab = Sse2.CompareEqual(vector, vectorTab);
// colonPosition is not found yet
if (colonPosition == 0)
{
Vector128<byte> spaceOrTab = Sse2.Or(cmpSpace, cmpTab);
Vector128<byte> crOrLf = Sse2.Or(cmpCR, cmpLF);
int crOrLfOrSpaceOrTab = Sse2.MoveMask(Sse2.Or(crOrLf, spaceOrTab));
// no colon in the current vector - most likely we still process header's name
if (colonIndex == 0)
{
// we don't expect to see any of ' ', '\t', '\r', '\n' till we find ':'
if (crOrLfOrSpaceOrTab != 0)
{
return -1;
}
offset += 16;
if (offset == length)
{
// unexpected end of span
return -1;
}
if (offset + 16 > length)
{
// not enough space for the next 128bit vector so let's
// overlap with the current one
offset = length - 16;
}
goto NEXT;
}
// we've found the ':' in the current vector, let's check where exactly
nuint colonLocalPos = (nuint)BitOperations.TrailingZeroCount(colonIndex);
// update colonPosition
colonPosition = offset + colonLocalPos;
// ':' is not expected to be the first symbol, also
// none of ' ', '\t', '\r', '\n' are expected to be found before ':'
if (colonPosition == 0 || colonLocalPos > (nuint)BitOperations.TrailingZeroCount(crOrLfOrSpaceOrTab))
{
return -1;
}
if (colonLocalPos == 15)
{
offset += 16;
if (offset == length)
{
// unexpected end of span
return -1;
}
if (offset + 16 > length)
{
// not enough space for the next 128bit vector so let's
// overlap with the current one
offset = length - 16;
}
goto NEXT;
}
}
if (valueStarts == 0)
{
uint nonWhiteSpace = ~(uint)Sse2.MoveMask(Sse2.Or(cmpSpace, cmpTab));
// Vector still contains ':' separator so we need to ignore it and the part of the
// header's name
if (offset <= colonPosition)
{
int toIgnore = (int)(colonPosition + 1 - offset);
nonWhiteSpace = (nonWhiteSpace >> toIgnore) << toIgnore;
}
var posOfFirstNonWhitespace = (nuint)BitOperations.TrailingZeroCount(nonWhiteSpace);
if (posOfFirstNonWhitespace == 16)
{
offset += 16;
if (offset == length)
{
// unexpected end of span
return -1;
}
if (offset + 16 > length)
{
// not enough space for the next 128bit vector so let's
// overlap with the current one
offset = length - 16;
}
goto NEXT;
}
valueStarts = offset + posOfFirstNonWhitespace;
}
// Now let's wait for CRLF
nuint crPos = (nuint)Sse2.MoveMask(cmpCR);
if (crPos == 0)
{
if ((nuint)Sse2.MoveMask(cmpLF) != 0)
return -1;
offset += 16;
if (offset == length)
{
// unexpected end of span
return -1;
}
if (offset + 16 > length)
{
// not enough space for the next 128bit vector so let's
// overlap with the current one
offset = length - 16;
}
goto NEXT;
}
crPos = (nuint)BitOperations.TrailingZeroCount(crPos);
if (crPos == 15)
{
// Rare case: CR was found in the last byte of the current vector
// make sure current chunk doesn't have any LF and check the next byte directly
if (Sse2.MoveMask(cmpLF) != 0) // use TestZ here?
{
return -1;
}
if (offset + 16 >= length || Unsafe.AddByteOffset(ref searchSpace, offset + 16) != ByteLF)
{
// either unexpected end of span or next byte after CR is not LF
return -1;
}
finalLength = offset + 17;
goto END;
}
var lfPos = (nuint)BitOperations.TrailingZeroCount(Sse2.MoveMask(cmpLF));
if (crPos + 1 != lfPos)
{
// CR is either not followed by LF or LF was found earlier than expected
return -1;
}
finalLength = offset + lfPos + 1;
END:
// Compatibility: Non-fast version of parser gives up on inputs like "1:\r\n" (however, it's fine with "1: \r\n")
if (finalLength < 5)
return -1;
nuint valueEnds = finalLength - 3; // minus CRLF
// Rare case: trailing spaces or tabs
for (nuint i = valueEnds; i > valueStarts; i--)
{
// for most cases we expect this loop to fire just a single iteration
var b = Unsafe.AddByteOffset(ref searchSpace, i);
if ((b != ByteSpace) && (b != ByteTab))
{
break;
}
valueEnds--;
}
handler.OnHeader(span.Slice(0, (int)colonPosition), span.Slice((int)valueStarts, (int)(valueEnds + 1 - valueStarts)));
return (int)finalLength;
}
static unsafe void Main()
{
int length = TryTakeSingleHeaderFast(new TRequestHandler(), Encoding.UTF8.GetBytes("Accept-Encoding: gzip, deflate, br\r\n"));
Console.WriteLine(length);
Console.ReadKey();
}
}
class TRequestHandler
{
public void OnHeader(ReadOnlySpan<byte> name, ReadOnlySpan<byte> value)
{
Console.WriteLine($"Name: (Len={name.Length}) {Encoding.UTF8.GetString(name)}");
Console.WriteLine($"Value: (Len={value.Length}) {Encoding.UTF8.GetString(value)}");
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment