-
-
Save FabienDehopre/5245476 to your computer and use it in GitHub Desktop.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
/// <summary>
/// Extension methods for checking whether a string is a syntactically valid C# identifier.
/// </summary>
public static class IdentifierExtensions
{
    // definition of a valid C# identifier: http://msdn.microsoft.com/en-us/library/aa664670(v=vs.71).aspx
    //
    // The first five constants are fragments meant to be placed INSIDE a regex character
    // class ([...]). Using character classes with a single '*' quantifier (instead of
    // alternations wrapped in nested '+' / '*' groups) keeps matching linear in the input
    // length and avoids catastrophic backtracking on long, almost-valid inputs.
    private const string FORMATTING_CHARACTER = @"\p{Cf}";
    private const string CONNECTING_CHARACTER = @"\p{Pc}";
    private const string DECIMAL_DIGIT_CHARACTER = @"\p{Nd}";
    private const string COMBINING_CHARACTER = @"\p{Mn}\p{Mc}";
    private const string LETTER_CHARACTER = @"\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}";

    // Any single character allowed after the first character of an identifier.
    private const string IDENTIFIER_PART_CHARACTER = "[" +
                                                     LETTER_CHARACTER +
                                                     DECIMAL_DIGIT_CHARACTER +
                                                     CONNECTING_CHARACTER +
                                                     COMBINING_CHARACTER +
                                                     FORMATTING_CHARACTER +
                                                     "]";

    // Any single character allowed as the first character of an identifier.
    private const string IDENTIFIER_START_CHARACTER = "[" + LETTER_CHARACTER + "_]";

    // One start character followed by zero or more part characters.
    private const string IDENTIFIER_OR_KEYWORD = IDENTIFIER_START_CHARACTER + IDENTIFIER_PART_CHARACTER + "*";

    // C# keywords: http://msdn.microsoft.com/en-us/library/x53a06bb(v=vs.71).aspx
    // (plus the four undocumented "__" keywords, which are not valid identifiers either)
    private static readonly HashSet<string> _keywords = new HashSet<string>(StringComparer.Ordinal)
    {
        "__arglist", "__makeref", "__reftype", "__refvalue",
        "abstract", "as", "base", "bool",
        "break", "byte", "case", "catch",
        "char", "checked", "class", "const",
        "continue", "decimal", "default", "delegate",
        "do", "double", "else", "enum",
        "event", "explicit", "extern", "false",
        "finally", "fixed", "float", "for",
        "foreach", "goto", "if", "implicit",
        "in", "int", "interface", "internal",
        "is", "lock", "long", "namespace",
        "new", "null", "object", "operator",
        "out", "override", "params", "private",
        "protected", "public", "readonly", "ref",
        "return", "sbyte", "sealed", "short",
        "sizeof", "stackalloc", "static", "string",
        "struct", "switch", "this", "throw",
        "true", "try", "typeof", "uint",
        "ulong", "unchecked", "unsafe", "ushort",
        "using", "virtual", "volatile", "void",
        "while"
    };

    // "\z" (absolute end of input) is used instead of "$" because "$" also matches just
    // before a trailing '\n', which would accept strings such as "foo\n".
    private static readonly Regex _validIdentifierRegex = new Regex(
        "^" + IDENTIFIER_OR_KEYWORD + @"\z",
        RegexOptions.Compiled | RegexOptions.CultureInvariant);

    /// <summary>
    /// Determines whether <paramref name="identifier"/> is a valid C# identifier.
    /// </summary>
    /// <param name="identifier">The candidate identifier; may be <c>null</c>.</param>
    /// <returns>
    /// <c>true</c> if the normalized string is either a non-keyword identifier or an
    /// <c>@</c>-prefixed (verbatim) identifier or keyword; otherwise <c>false</c>.
    /// <c>null</c>, empty, whitespace-only and invalid-Unicode strings yield <c>false</c>.
    /// </returns>
    public static bool IsValidIdentifier(this string identifier)
    {
        if (String.IsNullOrWhiteSpace(identifier))
        {
            return false;
        }

        string normalizedIdentifier;
        try
        {
            normalizedIdentifier = identifier.Normalize();
        }
        catch (ArgumentException)
        {
            // Normalize() rejects invalid Unicode (e.g. a lone surrogate); such a string
            // can never be an identifier, so answer "no" instead of throwing.
            return false;
        }

        // 1. a plain identifier is valid as long as it is not a reserved C# keyword
        if (_validIdentifierRegex.IsMatch(normalizedIdentifier))
        {
            return !_keywords.Contains(normalizedIdentifier);
        }

        // 2. a verbatim identifier: '@' followed by an identifier or a keyword
        //    (ordinal char comparison; no culture-sensitive prefix matching)
        // 3. anything else is not a valid identifier
        return normalizedIdentifier.Length > 1
            && normalizedIdentifier[0] == '@'
            && _validIdentifierRegex.IsMatch(normalizedIdentifier.Substring(1));
    }
}
I agree with you. So I updated the gist to move all the constants and the regex out of the method.
License?
It would be worthwhile adding contextual keywords to the list. There have been no new reserved keywords from C# 1.0 to avoid breaking older code, but they are reserved in their context.
https://gist.github.com/nuno-andre/ae449bbfa8d5d4c98746050a5d10793a
License?
It's free to use.
Please replace "unchekeced" with "unchecked" in the keywords array.
@SolidAlloy I fixed the typo you reported. Thanks
You could probably make this faster by replacing the `string[] _keywords` with `HashSet<string> _keywords`. In line 60, where you are using `Contains(...)`, you are "secretly" using LINQ, which implicitly converts your `string[]` to `IEnumerable<string>`. A good optimization for things that should run fast is to see if you can get rid of `using System.Linq` at the top of your code.
Here is a test that highlights the difference. It can be quite dramatic with larger inputs:
/// <summary>
/// Micro-benchmark comparing keyword lookup through <c>Contains</c> on a
/// <c>string[]</c> (resolved via the System.Linq extension method) against
/// <c>HashSet&lt;string&gt;.Contains</c>.
/// </summary>
public class LINQ_VS_HashSet
{
    // Single source of truth for the keyword list used by both collections.
    private static readonly string[] _array = new[]
    {
        "abstract", "event", "new", "struct",
        "as", "explicit", "null", "switch",
        "base", "extern", "object", "this",
        "bool", "false", "operator", "throw",
        "break", "finally", "out", "true",
        "byte", "fixed", "override", "try",
        "case", "float", "params", "typeof",
        "catch", "for", "private", "uint",
        "char", "foreach", "protected", "ulong",
        "checked", "goto", "public", "unchecked",
        "class", "if", "readonly", "unsafe",
        "const", "implicit", "ref", "ushort",
        "continue", "in", "return", "using",
        "decimal", "int", "sbyte", "virtual",
        "default", "interface", "sealed", "volatile",
        "delegate", "internal", "short", "void",
        "do", "is", "sizeof", "while",
        "double", "lock", "stackalloc",
        "else", "long", "static",
        "enum", "namespace", "string"
    };

    // Built from _array (declared above, so it is already initialized) rather than from a
    // second copy of the literal list: both collections are guaranteed to hold the same
    // keywords, which keeps the comparison fair.
    private static readonly HashSet<string> _hashset = new HashSet<string>( _array );

    /// <summary>
    /// Runs both lookups over the words of a fixed sentence and prints, for each
    /// collection, the iteration count, the number of matches and the elapsed milliseconds.
    /// </summary>
    /// <param name="iterations">
    /// How many times the whole sentence is scanned per collection in the timed phase.
    /// Defaults to 500000, the value that was previously hard-coded.
    /// </param>
    public static void Test( int iterations = 500000 )
    {
        const string tester = "This is a short test string that will show the typeof behavior expected";
        var testArray = tester.Split( ' ' );
        var arrayMatches = 0;
        var hashSetMatches = 0;

        //
        // Warmup: exercise both code paths before anything is timed
        //
        for( int i = 0; i < 100; i++ )
        {
            // HashSet
            for( int j = 0; j < testArray.Length; j++ )
            {
                _ = _hashset.Contains( testArray[j] );
            }

            // Array
            for( int j = 0; j < testArray.Length; j++ )
            {
                _ = _array.Contains( testArray[j] );
            }
        }

        //
        // Timed run: Array
        //
        var arrayTimer = Stopwatch.StartNew();
        for( int i = 0; i < iterations; i++ )
        {
            for( int j = 0; j < testArray.Length; j++ )
            {
                if( _array.Contains( testArray[j] ) )
                {
                    arrayMatches++;
                }
            }
        }
        arrayTimer.Stop();

        //
        // Timed run: HashSet
        //
        var hashsetTimer = Stopwatch.StartNew();
        for( int i = 0; i < iterations; i++ )
        {
            for( int j = 0; j < testArray.Length; j++ )
            {
                if( _hashset.Contains( testArray[j] ) )
                {
                    hashSetMatches++;
                }
            }
        }
        hashsetTimer.Stop();

        Console.WriteLine( $"Array: {iterations} iterations, {arrayMatches} matches, {arrayTimer.ElapsedMilliseconds} ms" );
        Console.WriteLine( $"HashSet: {iterations} iterations, {hashSetMatches} matches, {hashsetTimer.ElapsedMilliseconds} ms" );
    }
}
This gives the following results on my machine:
Array: 500000 iterations, 2000000 matches, 1528 ms
HashSet: 500000 iterations, 2000000 matches, 104 ms
Press any key to continue . . .
Thanks @bpierson. I've updated the gist.
Of course, the list of keywords should also be updated, but I currently don't have the free time to do it.
Your hashset is missing four keywords: __arglist, __makeref, __reftype and __refvalue. Though they are not mentioned in the documentation, they exist and are not valid identifiers.
Thanks @piotrstenke. I've updated the list of identifiers.
I also took the opportunity to rearrange the list of identifiers and to format the code using the dotnet-format tool.
For performance purposes, it's probably better to move the keyword array and identifier regex strings as well as the Regex itself out as static fields instead of keeping them in-method.