Created
March 15, 2023 18:28
-
-
Save andrewmd5/0532fbe2ccd85e504abcd923d8eb94b3 to your computer and use it in GitHub Desktop.
Detect if a byte-array or ReadOnlySpan<byte> contains only ASCII characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Buffers; | |
using System.Runtime.InteropServices; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.Arm; | |
using System.Runtime.Intrinsics.X86; | |
using System.Text; | |
namespace idk
{
    /// <summary>
    /// Utility class that detects whether a sequence of bytes or chars contains only ASCII
    /// characters (values 0x00-0x7F). Hardware intrinsics are used on x86 (SSE2), ARM64 and ARM
    /// (AdvSimd) when available; otherwise a scalar loop is used.
    /// </summary>
    public static class AsciiChecker
    {
        /// <summary>
        /// Checks if the given <see cref="ReadOnlySpan{T}"/> of <see cref="byte"/> contains only ASCII characters.
        /// </summary>
        /// <param name="data">The bytes to be checked. An empty span is considered ASCII.</param>
        /// <returns>True if every byte is below 0x80, otherwise false.</returns>
        public static bool ContainsOnlyAscii(ReadOnlySpan<byte> data)
        {
            // Pick the widest vector path the current hardware offers.
            if (Sse2.IsSupported)
            {
                return ContainsOnlyAscii_X86(data);
            }

            if (AdvSimd.Arm64.IsSupported)
            {
                return ContainsOnlyAscii_Arm64(data);
            }

            if (AdvSimd.IsSupported)
            {
                return ContainsOnlyAscii_Arm(data);
            }

            return ContainsOnlyAscii_SoftwareFallback(data);
        }

        /// <summary>
        /// Determines if the given <see cref="ReadOnlySpan{T}"/> of <see cref="char"/> contains only ASCII characters.
        /// </summary>
        /// <param name="data">The UTF-16 code units to be checked. An empty span is considered ASCII.</param>
        /// <returns>True if every char is U+007F or below; otherwise false.</returns>
        /// <remarks>
        /// The UTF-16 code units are inspected directly. A code unit is ASCII exactly when its
        /// value is 0x7F or less, so no transcoding to UTF-8 and no temporary buffer is required.
        /// Surrogates (paired or lone) are always above 0x7F and therefore reported as non-ASCII.
        /// </remarks>
        public static bool ContainsOnlyAscii(ReadOnlySpan<char> data)
        {
            for (int i = 0; i < data.Length; i++)
            {
                if (data[i] > 0x7F)
                {
                    return false;
                }
            }

            return true;
        }

        /// <summary>
        /// Checks if the given bytes contain only ASCII characters using ARM64 AdvSimd intrinsics.
        /// </summary>
        /// <param name="data">The bytes to be checked.</param>
        /// <returns>True if the data contains only ASCII characters, otherwise false.</returns>
        private static unsafe bool ContainsOnlyAscii_Arm64(ReadOnlySpan<byte> data)
        {
            int length = data.Length;

            // Number of leading bytes that form whole 16-byte blocks.
            int vectorizedLength = length - (length % Vector128<byte>.Count);

            fixed (byte* dataPtr = data)
            {
                for (int i = 0; i < vectorizedLength; i += Vector128<byte>.Count)
                {
                    Vector128<byte> block = AdvSimd.LoadVector128(dataPtr + i);

                    // Horizontal unsigned max over all 16 lanes: if the largest byte is
                    // above 0x7F, at least one byte in the block is non-ASCII.
                    if (AdvSimd.Arm64.MaxAcross(block).ToScalar() > 0x7F)
                    {
                        return false;
                    }
                }
            }

            // The trailing partial block (fewer than 16 bytes) is checked byte by byte.
            return ContainsOnlyAscii_SoftwareFallback(data.Slice(vectorizedLength));
        }

        /// <summary>
        /// Checks if the given bytes contain only ASCII characters using ARM AdvSimd intrinsics.
        /// </summary>
        /// <param name="data">The bytes to be checked.</param>
        /// <returns>True if the data contains only ASCII characters, otherwise false.</returns>
        private static unsafe bool ContainsOnlyAscii_Arm(ReadOnlySpan<byte> data)
        {
            int length = data.Length;

            // Number of leading bytes that form whole 8-byte blocks.
            int vectorizedLength = length - (length % Vector64<byte>.Count);

            // Every lane holds 0x80, isolating the high bit of each byte.
            Vector64<byte> highBitMask = Vector64.Create((byte)0x80);

            fixed (byte* dataPtr = data)
            {
                for (int i = 0; i < vectorizedLength; i += Vector64<byte>.Count)
                {
                    Vector64<byte> block = AdvSimd.LoadVector64(dataPtr + i);
                    Vector64<byte> highBits = AdvSimd.And(block, highBitMask);

                    // Reinterpret all 8 lanes as one 64-bit value so that every byte of the
                    // block takes part in the test; any surviving bit means a non-ASCII byte.
                    if (highBits.AsUInt64().ToScalar() != 0)
                    {
                        return false;
                    }
                }
            }

            // The trailing partial block (fewer than 8 bytes) is checked byte by byte.
            return ContainsOnlyAscii_SoftwareFallback(data.Slice(vectorizedLength));
        }

        /// <summary>
        /// Checks if the given bytes contain only ASCII characters using x86 SSE2 intrinsics.
        /// </summary>
        /// <param name="data">The bytes to be checked.</param>
        /// <returns>True if the data contains only ASCII characters, otherwise false.</returns>
        private static unsafe bool ContainsOnlyAscii_X86(ReadOnlySpan<byte> data)
        {
            int length = data.Length;

            // Number of leading bytes that form whole 16-byte blocks.
            int vectorizedLength = length - (length % Vector128<byte>.Count);

            fixed (byte* dataPtr = data)
            {
                for (int i = 0; i < vectorizedLength; i += Vector128<byte>.Count)
                {
                    Vector128<byte> block = Sse2.LoadVector128(dataPtr + i);

                    // MoveMask gathers the most significant bit of each of the 16 bytes,
                    // which is exactly the "non-ASCII" bit; no separate AND is needed.
                    if (Sse2.MoveMask(block) != 0)
                    {
                        return false;
                    }
                }
            }

            // The trailing partial block (fewer than 16 bytes) is checked byte by byte.
            return ContainsOnlyAscii_SoftwareFallback(data.Slice(vectorizedLength));
        }

        /// <summary>
        /// Checks if the given bytes contain only ASCII characters using a plain scalar loop.
        /// Used when no supported SIMD instruction set is available and for the tail bytes
        /// left over by the vectorized paths.
        /// </summary>
        /// <param name="data">The bytes to be checked.</param>
        /// <returns>True if the data contains only ASCII characters, otherwise false.</returns>
        private static bool ContainsOnlyAscii_SoftwareFallback(ReadOnlySpan<byte> data)
        {
            for (int i = 0; i < data.Length; i++)
            {
                if (data[i] >= 0x80)
                {
                    return false;
                }
            }

            return true;
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment