-
-
Save FabienDehopre/5245476 to your computer and use it in GitHub Desktop.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
/// <summary>
/// Extension methods that test whether a string is a valid C# identifier.
/// </summary>
public static class IdentifierExtensions
{
    // definition of a valid C# identifier: http://msdn.microsoft.com/en-us/library/aa664670(v=vs.71).aspx
    //
    // The first five constants are Unicode category escapes intended to be placed INSIDE a
    // regex character class ([...]). Using one character class per position (instead of
    // nested alternations such as "((a|b)+)*") means every character can be matched in
    // exactly one way, so the regex cannot backtrack exponentially on long invalid input.
    private const string FORMATTING_CHARACTER = @"\p{Cf}";
    private const string CONNECTING_CHARACTER = @"\p{Pc}";
    private const string DECIMAL_DIGIT_CHARACTER = @"\p{Nd}";
    private const string COMBINING_CHARACTER = @"\p{Mn}\p{Mc}";
    private const string LETTER_CHARACTER = @"\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}";
    private const string IDENTIFIER_PART_CHARACTER = "[" +
        LETTER_CHARACTER +
        DECIMAL_DIGIT_CHARACTER +
        CONNECTING_CHARACTER +
        COMBINING_CHARACTER +
        FORMATTING_CHARACTER + "]";
    private const string IDENTIFIER_START_CHARACTER = "[" + LETTER_CHARACTER + "_]";
    private const string IDENTIFIER_OR_KEYWORD = IDENTIFIER_START_CHARACTER + IDENTIFIER_PART_CHARACTER + "*";

    // C# keywords: http://msdn.microsoft.com/en-us/library/x53a06bb(v=vs.71).aspx
    // (plus the four undocumented "__" keywords, which are not usable as identifiers either)
    private static readonly HashSet<string> _keywords = new HashSet<string>
    {
        "__arglist", "__makeref", "__reftype", "__refvalue",
        "abstract", "as", "base", "bool",
        "break", "byte", "case", "catch",
        "char", "checked", "class", "const",
        "continue", "decimal", "default", "delegate",
        "do", "double", "else", "enum",
        "event", "explicit", "extern", "false",
        "finally", "fixed", "float", "for",
        "foreach", "goto", "if", "implicit",
        "in", "int", "interface", "internal",
        "is", "lock", "long", "namespace",
        "new", "null", "object", "operator",
        "out", "override", "params", "private",
        "protected", "public", "readonly", "ref",
        "return", "sbyte", "sealed", "short",
        "sizeof", "stackalloc", "static", "string",
        "struct", "switch", "this", "throw",
        "true", "try", "typeof", "uint",
        "ulong", "unchecked", "unsafe", "ushort",
        "using", "virtual", "volatile", "void",
        "while"
    };

    // "\z" (not "$") anchors at the very end of the input: "$" would also match just before
    // a trailing '\n' and let a string such as "abc\n" pass as a valid identifier.
    private static readonly Regex _validIdentifierRegex =
        new Regex("^" + IDENTIFIER_OR_KEYWORD + @"\z", RegexOptions.Compiled);

    /// <summary>
    /// Determines whether <paramref name="identifier"/> is a valid C# identifier.
    /// </summary>
    /// <param name="identifier">The candidate identifier; may be <c>null</c>.</param>
    /// <returns>
    /// <c>true</c> when the normalized text is either a non-keyword identifier or a
    /// verbatim identifier (<c>@</c> followed by an identifier or keyword);
    /// <c>false</c> otherwise, including for <c>null</c>, empty, whitespace-only
    /// or malformed Unicode input.
    /// </returns>
    public static bool IsValidIdentifier(this string identifier)
    {
        if (String.IsNullOrWhiteSpace(identifier))
        {
            return false;
        }

        string normalizedIdentifier;
        try
        {
            normalizedIdentifier = identifier.Normalize();
        }
        catch (ArgumentException)
        {
            // Normalize() rejects invalid Unicode (e.g. a lone surrogate); such text
            // can never be an identifier, so report "invalid" instead of throwing.
            return false;
        }

        // Verbatim identifier: "@" followed by any identifier-or-keyword (so "@class" is valid).
        // Ordinal comparison keeps the prefix check independent of the current culture.
        if (normalizedIdentifier.StartsWith("@", StringComparison.Ordinal))
        {
            return _validIdentifierRegex.IsMatch(normalizedIdentifier.Substring(1));
        }

        // Plain identifier: must match the identifier grammar and must not be a C# keyword.
        return _validIdentifierRegex.IsMatch(normalizedIdentifier)
            && !_keywords.Contains(normalizedIdentifier);
    }
}
It would be worthwhile to add contextual keywords to the list. No new reserved keywords have been added since C# 1.0, to avoid breaking older code; the newer keywords are contextual, meaning they are reserved only in their specific context.
https://gist.github.com/nuno-andre/ae449bbfa8d5d4c98746050a5d10793a
License?
It's free to use.
Please replace "unchekeced" with "unchecked" in the keywords array.
@SolidAlloy I fixed the typo you reported. Thanks
You could probably make this faster by replacing the `string[] _keywords` with `HashSet<string> _keywords`. In line 60, where you are using `Contains(...)`, you are "secretly" using LINQ, which is secretly upcasting your `string[]` to `IEnumerable<string>`. A good optimization for things that should run fast is to see if you can get rid of `using System.Linq` at the top of your code.
Here is a test that highlights the difference. It can be quite dramatic with larger inputs:
/// <summary>
/// Micro-benchmark that times keyword lookups done with LINQ's <c>Contains</c> on a
/// <c>string[]</c> against <c>Contains</c> on a <see cref="HashSet{T}"/> holding the same keywords.
/// </summary>
public class LINQ_VS_HashSet
{
    // The keyword list, stored as a plain array (looked up through Enumerable.Contains).
    private static readonly string[] _array =
    {
        "abstract", "event", "new", "struct",
        "as", "explicit", "null", "switch",
        "base", "extern", "object", "this",
        "bool", "false", "operator", "throw",
        "break", "finally", "out", "true",
        "byte", "fixed", "override", "try",
        "case", "float", "params", "typeof",
        "catch", "for", "private", "uint",
        "char", "foreach", "protected", "ulong",
        "checked", "goto", "public", "unchecked",
        "class", "if", "readonly", "unsafe",
        "const", "implicit", "ref", "ushort",
        "continue", "in", "return", "using",
        "decimal", "int", "sbyte", "virtual",
        "default", "interface", "sealed", "volatile",
        "delegate", "internal", "short", "void",
        "do", "is", "sizeof", "while",
        "double", "lock", "stackalloc",
        "else", "long", "static",
        "enum", "namespace", "string"
    };

    // The same keywords as a hash set; built from the array so both collections always agree.
    private static readonly HashSet<string> _hashset = new HashSet<string>(_array);

    /// <summary>
    /// Runs a short warm-up, then times the array lookups followed by the hash-set lookups
    /// over the same words and writes one result line per collection to the console.
    /// </summary>
    public static void Test()
    {
        const int iterations = 500000;
        const string tester = "This is a short test string that will show the typeof behavior expected";

        string[] words = tester.Split(' ');

        // Warm-up: exercise both lookup paths before anything is timed.
        for (int round = 0; round < 100; round++)
        {
            foreach (string candidate in words)
            {
                _ = _hashset.Contains(candidate);
            }

            foreach (string candidate in words)
            {
                _ = _array.Contains(candidate);
            }
        }

        // Timed run 1: array lookups.
        int arrayMatches = 0;
        Stopwatch arrayTimer = new Stopwatch();
        arrayTimer.Start();
        for (int round = 0; round < iterations; round++)
        {
            foreach (string candidate in words)
            {
                if (_array.Contains(candidate))
                {
                    arrayMatches++;
                }
            }
        }
        arrayTimer.Stop();

        // Timed run 2: hash-set lookups.
        int hashSetMatches = 0;
        Stopwatch hashsetTimer = new Stopwatch();
        hashsetTimer.Start();
        for (int round = 0; round < iterations; round++)
        {
            foreach (string candidate in words)
            {
                if (_hashset.Contains(candidate))
                {
                    hashSetMatches++;
                }
            }
        }
        hashsetTimer.Stop();

        Console.WriteLine($"Array: {iterations} iterations, {arrayMatches} matches, {arrayTimer.ElapsedMilliseconds} ms");
        Console.WriteLine($"HashSet: {iterations} iterations, {hashSetMatches} matches, {hashsetTimer.ElapsedMilliseconds} ms");
    }
}
This gives the following results on my machine:
Array: 500000 iterations, 2000000 matches, 1528 ms
HashSet: 500000 iterations, 2000000 matches, 104 ms
Press any key to continue . . .
Thanks @bpierson. I've updated the gist.
Of course, the list of keywords should also be updated, but I currently don't have the free time to do it.
Your hashset is missing four keywords: __arglist, __makeref, __reftype and __refvalue. Though they are not mentioned in the documentation, they exist and are not valid identifiers.
Thanks @piotrstenke. I've updated the list of identifiers.
I also took the opportunity to rearrange the list of identifiers and to format the code using the dotnet-format tool.
License?