Last active
December 27, 2021 20:16
-
-
Save EgorBo/3bf1863a27bfe105054bf9f2cd6c70ef to your computer and use it in GitHub Desktop.
FasterHttpHeadersParse.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Diagnostics; | |
using System.Numerics; | |
using System.Runtime.CompilerServices; | |
using System.Runtime.InteropServices; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.X86; | |
using System.Text; | |
public class Prog | |
{ | |
private const byte ByteCR = (byte)'\r'; | |
private const byte ByteLF = (byte)'\n'; | |
private const byte ByteColon = (byte)':'; | |
private const byte ByteSpace = (byte)' '; | |
private const byte ByteTab = (byte)'\t'; | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private static Vector128<byte> LoadVector128(ref byte start, nuint offset) | |
=> Unsafe.ReadUnaligned<Vector128<byte>>(ref Unsafe.AddByteOffset(ref start, offset)); | |
/// <summary> | |
/// </summary> | |
/// <param name="handler">to report final key and value</param> | |
/// <param name="span">data to process</param> | |
/// <returns>position of CRLF, -1 in case of invalid header</returns> | |
private static int TryTakeSingleHeaderFast(TRequestHandler handler, ReadOnlySpan<byte> span) | |
{ | |
Debug.Assert(span.Length >= 16 && Sse2.IsSupported); | |
// Rules: | |
// 1) ends with CRLF, doesn't contain CR or LF inside | |
// 2) key is not empty, should not contain ' ' and '\t' | |
// 3) value can be empty | |
// 4) there can be multiple ' ' and '\t' around value, e.g.: | |
// "key: \t\t value\t \r\n" | |
// and we must trim them when reporting key and value to handler | |
ref byte searchSpace = ref MemoryMarshal.GetReference(span); | |
nuint length = (nuint)span.Length; | |
Vector128<byte> vectorCR = Vector128.Create(ByteCR); | |
Vector128<byte> vectorLF = Vector128.Create(ByteLF); | |
Vector128<byte> vectorColon = Vector128.Create(ByteColon); | |
Vector128<byte> vectorSpace = Vector128.Create(ByteSpace); | |
Vector128<byte> vectorTab = Vector128.Create(ByteTab); | |
nuint colonPosition = 0; | |
nuint valueStarts = 0; | |
nuint offset = 0; | |
nuint finalLength = 0; | |
NEXT: | |
Vector128<byte> vector = LoadVector128(ref searchSpace, offset); | |
int colonIndex = Sse2.MoveMask(Sse2.CompareEqual(vector, vectorColon)); | |
Vector128<byte> cmpCR = Sse2.CompareEqual(vector, vectorCR); | |
Vector128<byte> cmpLF = Sse2.CompareEqual(vector, vectorLF); | |
Vector128<byte> cmpSpace = Sse2.CompareEqual(vector, vectorSpace); | |
Vector128<byte> cmpTab = Sse2.CompareEqual(vector, vectorTab); | |
// colonPosition is not found yet | |
if (colonPosition == 0) | |
{ | |
Vector128<byte> spaceOrTab = Sse2.Or(cmpSpace, cmpTab); | |
Vector128<byte> crOrLf = Sse2.Or(cmpCR, cmpLF); | |
int crOrLfOrSpaceOrTab = Sse2.MoveMask(Sse2.Or(crOrLf, spaceOrTab)); | |
// no colon in the current vector - most likely we still process header's name | |
if (colonIndex == 0) | |
{ | |
// we don't expect to see any of ' ', '\t', '\r', '\n' till we find ':' | |
if (crOrLfOrSpaceOrTab != 0) | |
{ | |
return -1; | |
} | |
offset += 16; | |
if (offset == length) | |
{ | |
// unexpected end of span | |
return -1; | |
} | |
if (offset + 16 > length) | |
{ | |
// not enough space for the next 128bit vector so let's | |
// overlap with the current one | |
offset = length - 16; | |
} | |
goto NEXT; | |
} | |
// we've found the ':' in the current vector, let's check where exactly | |
nuint colonLocalPos = (nuint)BitOperations.TrailingZeroCount(colonIndex); | |
// update colonPosition | |
colonPosition = offset + colonLocalPos; | |
// ':' is not expected to be the first symbol, also | |
// none of ' ', '\t', '\r', '\n' are expected to be found before ':' | |
if (colonPosition == 0 || colonLocalPos > (nuint)BitOperations.TrailingZeroCount(crOrLfOrSpaceOrTab)) | |
{ | |
return -1; | |
} | |
if (colonLocalPos == 15) | |
{ | |
offset += 16; | |
if (offset == length) | |
{ | |
// unexpected end of span | |
return -1; | |
} | |
if (offset + 16 > length) | |
{ | |
// not enough space for the next 128bit vector so let's | |
// overlap with the current one | |
offset = length - 16; | |
} | |
goto NEXT; | |
} | |
} | |
if (valueStarts == 0) | |
{ | |
uint nonWhiteSpace = ~(uint)Sse2.MoveMask(Sse2.Or(cmpSpace, cmpTab)); | |
// Vector still contains ':' separator so we need to ignore it and the part of the | |
// header's name | |
if (offset <= colonPosition) | |
{ | |
int toIgnore = (int)(colonPosition + 1 - offset); | |
nonWhiteSpace = (nonWhiteSpace >> toIgnore) << toIgnore; | |
} | |
var posOfFirstNonWhitespace = (nuint)BitOperations.TrailingZeroCount(nonWhiteSpace); | |
if (posOfFirstNonWhitespace == 16) | |
{ | |
offset += 16; | |
if (offset == length) | |
{ | |
// unexpected end of span | |
return -1; | |
} | |
if (offset + 16 > length) | |
{ | |
// not enough space for the next 128bit vector so let's | |
// overlap with the current one | |
offset = length - 16; | |
} | |
goto NEXT; | |
} | |
valueStarts = offset + posOfFirstNonWhitespace; | |
} | |
// Now let's wait for CRLF | |
nuint crPos = (nuint)Sse2.MoveMask(cmpCR); | |
if (crPos == 0) | |
{ | |
if ((nuint)Sse2.MoveMask(cmpLF) != 0) | |
return -1; | |
offset += 16; | |
if (offset == length) | |
{ | |
// unexpected end of span | |
return -1; | |
} | |
if (offset + 16 > length) | |
{ | |
// not enough space for the next 128bit vector so let's | |
// overlap with the current one | |
offset = length - 16; | |
} | |
goto NEXT; | |
} | |
crPos = (nuint)BitOperations.TrailingZeroCount(crPos); | |
if (crPos == 15) | |
{ | |
// Rare case: CR was found in the last byte of the current vector | |
// make sure current chunk doesn't have any LF and check the next byte directly | |
if (Sse2.MoveMask(cmpLF) != 0) // use TestZ here? | |
{ | |
return -1; | |
} | |
if (offset + 16 >= length || Unsafe.AddByteOffset(ref searchSpace, offset + 16) != ByteLF) | |
{ | |
// either unexpected end of span or next byte after CR is not LF | |
return -1; | |
} | |
finalLength = offset + 17; | |
goto END; | |
} | |
var lfPos = (nuint)BitOperations.TrailingZeroCount(Sse2.MoveMask(cmpLF)); | |
if (crPos + 1 != lfPos) | |
{ | |
// CR is either not followed by LF or LF was found earlier than expected | |
return -1; | |
} | |
finalLength = offset + lfPos + 1; | |
END: | |
// Compatibility: Non-fast version of parser gives up on inputs like "1:\r\n" (however, it's fine with "1: \r\n") | |
if (finalLength < 5) | |
return -1; | |
nuint valueEnds = finalLength - 3; // minus CRLF | |
// Rare case: trailing spaces or tabs | |
for (nuint i = valueEnds; i > valueStarts; i--) | |
{ | |
// for most cases we expect this loop to fire just a single iteration | |
var b = Unsafe.AddByteOffset(ref searchSpace, i); | |
if ((b != ByteSpace) && (b != ByteTab)) | |
{ | |
break; | |
} | |
valueEnds--; | |
} | |
handler.OnHeader(span.Slice(0, (int)colonPosition), span.Slice((int)valueStarts, (int)(valueEnds + 1 - valueStarts))); | |
return (int)finalLength; | |
} | |
static unsafe void Main() | |
{ | |
int length = TryTakeSingleHeaderFast(new TRequestHandler(), Encoding.UTF8.GetBytes("Accept-Encoding: gzip, deflate, br\r\n")); | |
Console.WriteLine(length); | |
Console.ReadKey(); | |
} | |
} | |
class TRequestHandler | |
{ | |
public void OnHeader(ReadOnlySpan<byte> name, ReadOnlySpan<byte> value) | |
{ | |
Console.WriteLine($"Name: (Len={name.Length}) {Encoding.UTF8.GetString(name)}"); | |
Console.WriteLine($"Value: (Len={value.Length}) {Encoding.UTF8.GetString(value)}"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment