Last active
January 24, 2019 15:16
-
-
Save ctigeek/4950c4b07a50a0791f012858e3bb2214 to your computer and use it in GitHub Desktop.
Split a string on a character, but don't split based on start-end characters.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Split a string while preserving sections of it.
/// Similar to string.Split, but you can define start-end characters (e.g. quotes, brackets, braces) inside of which it will NOT split.
/// Preservers can also be "recursive", which means nested occurrences of the same start-end pair (brackets, parens, etc.) are counted.
/// If the start and end characters are different, there's a good chance you want to set Recursive to true.
/// See the associated unit tests for an example that can parse JSON.
/// Also supports escape characters so that an escaped separator or start-end character is ignored.
/// </summary>
public class Splitter
{
    /// <summary>
    /// Splits <paramref name="splitThis"/> on every unescaped <paramref name="separator"/> that is not inside a preserved section.
    /// </summary>
    /// <param name="splitThis">The text to split.</param>
    /// <param name="separator">The character to split on.</param>
    /// <param name="separatorEscape">A separator is ignored when the character immediately before it equals this character.</param>
    /// <param name="preservers">
    /// The start-end pairs inside of which no splitting happens. Their parse state
    /// (<see cref="Preserver.IsInside"/>, <see cref="Preserver.RecursiveCount"/>) is reset at the start of the call and mutated while scanning.
    /// </param>
    /// <returns>
    /// The pieces between separators (empty pieces are kept as empty strings), or <c>null</c> when
    /// <paramref name="splitThis"/> is null or empty.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// A preserver is null, starts with the separator character, or is recursive with identical start and end characters.
    /// </exception>
    public static string[] Split(string splitThis, char separator, char separatorEscape, params Preserver[] preservers)
    {
        if (string.IsNullOrEmpty(splitThis))
        {
            // Kept as null (rather than an empty array) so existing callers that test for null keep working.
            return null;
        }

        // A params array can still be passed as an explicit null.
        if (preservers == null)
        {
            preservers = new Preserver[0];
        }

        foreach (var preserver in preservers)
        {
            if (preserver == null)
            {
                throw new ArgumentException("Preservers cannot contain null entries.");
            }
            if (preserver.Start == separator)
            {
                throw new ArgumentException("The separator character `" + separator + "` cannot be the same as the start character of a preserver.");
            }
            if (preserver.Recursive && preserver.Start == preserver.End)
            {
                throw new ArgumentException("Error. Preserver cannot be recursive if start and end are the same character: " + preserver.Start);
            }
        }

        // Preservers carry parse state; clear all of it so a previous call that ended
        // on unbalanced input cannot influence this one.
        foreach (var preserver in preservers)
        {
            ResetState(preserver);
        }

        var pieces = new List<string>();
        var pieceStart = 0;
        for (var position = 0; position < splitThis.Length; position++)
        {
            var currChar = splitThis[position];
            // null means "there is no previous character", so it can never compare equal to an
            // escape character, not even '\0'.
            char? prevChar = position == 0 ? (char?)null : splitThis[position - 1];

            var insidePreserver = FindInside(preservers);
            if (insidePreserver != null)
            {
                ShouldWeExitAPreserver(currChar, prevChar, insidePreserver);
                continue;
            }

            if (TryEnterAny(currChar, prevChar, preservers))
            {
                continue;
            }

            if (currChar == separator && prevChar != separatorEscape)
            {
                pieces.Add(splitThis.Substring(pieceStart, position - pieceStart));
                pieceStart = position + 1;
            }
        }

        // Whatever follows the last separator (possibly nothing) is the final piece.
        pieces.Add(splitThis.Substring(pieceStart));
        return pieces.ToArray();
    }

    /// <summary>Clears the parse state of a preserver and of every preserver reachable through SubPreserver.</summary>
    private static void ResetState(Preserver preserver)
    {
        // The visited set stops the walk if the SubPreserver chain loops back on itself.
        var visited = new HashSet<Preserver>();
        for (var current = preserver; current != null && visited.Add(current); current = current.SubPreserver)
        {
            current.IsInside = false;
            current.RecursiveCount = 0;
        }
    }

    /// <summary>Returns the first preserver currently marked as inside, or null when there is none.</summary>
    private static Preserver FindInside(Preserver[] preservers)
    {
        foreach (var preserver in preservers)
        {
            if (preserver.IsInside)
            {
                return preserver;
            }
        }
        return null;
    }

    /// <summary>Offers the character to each preserver in order and stops at the first one that it opens.</summary>
    private static bool TryEnterAny(char currChar, char? prevChar, Preserver[] preservers)
    {
        foreach (var preserver in preservers)
        {
            if (ShouldWeEnterAPreserver(currChar, prevChar, preserver))
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Marks <paramref name="preserver"/> as inside when <paramref name="currChar"/> is its unescaped start character.
    /// </summary>
    /// <returns>true only when this call opened <paramref name="preserver"/> itself.</returns>
    private static bool ShouldWeEnterAPreserver(char currChar, char? prevChar, Preserver preserver)
    {
        if (preserver.IsInside)
        {
            // Already open: the character can only open the nested preserver, which does not
            // count as entering this one.
            if (preserver.SubPreserver != null)
            {
                ShouldWeEnterAPreserver(currChar, prevChar, preserver.SubPreserver);
            }
            return false;
        }

        if (preserver.Start == currChar && preserver.Escape != prevChar)
        {
            preserver.IsInside = true;
            return true;
        }
        return false;
    }

    /// <summary>
    /// Feeds one character to a preserver that is currently open, updating nesting state and closing it when appropriate.
    /// </summary>
    /// <returns>
    /// true when <paramref name="currChar"/> was an unescaped end character of <paramref name="preserver"/>
    /// (either closing it or unwinding one recursive level); otherwise false.
    /// </returns>
    private static bool ShouldWeExitAPreserver(char currChar, char? prevChar, Preserver preserver)
    {
        if (!preserver.IsInside)
        {
            return false;
        }

        var sub = preserver.SubPreserver;
        if (sub != null)
        {
            if (sub.IsInside)
            {
                // While the nested section is open it owns every character, including ones that
                // look like this preserver's end character.
                ShouldWeExitAPreserver(currChar, prevChar, sub);
                return false;
            }
            if (ShouldWeEnterAPreserver(currChar, prevChar, sub))
            {
                return false;
            }
        }

        var unescaped = preserver.Escape != prevChar;

        if (preserver.Recursive && unescaped && preserver.Start == currChar)
        {
            // Another start character while already inside: one more level to unwind.
            preserver.RecursiveCount++;
            return false;
        }

        if (unescaped && preserver.End == currChar)
        {
            if (preserver.RecursiveCount > 0)
            {
                preserver.RecursiveCount--;
            }
            else
            {
                preserver.IsInside = false;
            }
            return true;
        }
        return false;
    }

    /// <summary>
    /// Describes a start-end character pair inside of which <see cref="Split"/> does not split,
    /// together with the mutable state used while scanning.
    /// </summary>
    public class Preserver
    {
        /// <summary>Character that opens the preserved section.</summary>
        public char Start { get; set; }

        /// <summary>Character that closes the preserved section.</summary>
        public char End { get; set; }

        /// <summary>A start or end character is ignored when the character immediately before it equals this one. Defaults to U+0002.</summary>
        public char Escape { get; set; } = '\u0002';

        /// <summary>Scan state: true while the scanner is inside this section.</summary>
        public bool IsInside { get; set; }

        /// <summary>When true, additional start characters seen while inside are counted and each needs its own end character.</summary>
        public bool Recursive { get; set; }

        /// <summary>Scan state: number of additional start characters seen while inside that have not been closed yet.</summary>
        public int RecursiveCount { get; set; } = 0;

        /// <summary>Optional preserver that may open only while this one is open.</summary>
        public Preserver SubPreserver { get; set; }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Shared preservers for the tests below. Preserver instances hold mutable scan state
// (IsInside, RecursiveCount), so these are shared state between any tests that use them.
// Static field initializers run in textual order, so quotePreserver is already assigned
// when the two fields below reference it as their SubPreserver.
private static readonly Splitter.Preserver quotePreserver = new Splitter.Preserver { Start = '"', End = '"', Escape = '\\' };
private static readonly Splitter.Preserver curlyBoiPreserver = new Splitter.Preserver { Start = '{', End = '}', Escape = '\\', Recursive = true, SubPreserver = quotePreserver };
private static readonly Splitter.Preserver bracketPreserver = new Splitter.Preserver { Start = '[', End = ']', Escape = '\\', Recursive = true, SubPreserver = quotePreserver };
// Feeds deliberately unbalanced brackets, braces and parens through the shared
// (recursive) preservers and checks how many pieces come back.
// NOTE(review): the input contains "[[" followed by a single "]"; with a recursive
// bracket preserver that looks like it leaves the bracket section open for the rest
// of the string - confirm that 5 is really the intended count.
[Test]
public void TryToBreakStuff()
{
    const string input = @"as,a{}{}[]{[]}[[}}}}(])()()s,as,[as,""a]s"",as],as ";
    var preservers = new[] { quotePreserver, curlyBoiPreserver, bracketPreserver };

    var pieces = Splitter.Split(input, ',', '\0', preservers);

    Assert.That(pieces.Length, Is.EqualTo(5));
}
// Backslash escapes both the separator and the preserver start/end characters:
// an escaped comma must not split and an escaped quote must not open or close a section.
[Test]
public void TestEscapes()
{
    const string input = @"as\,as,as, [as,""a]s\"""",as,as,as],,\""as,as ";

    var quotes = new Splitter.Preserver { Start = '"', End = '"', Escape = '\\' };
    var preservers = new[]
    {
        quotes,
        new Splitter.Preserver { Start = '{', End = '}', Escape = '\\', SubPreserver = quotes },
        new Splitter.Preserver { Start = '[', End = ']', Escape = '\\', SubPreserver = quotes },
    };

    var pieces = Splitter.Split(input, ',', '\\', preservers);

    Assert.That(pieces.Length, Is.EqualTo(6));
}
// Walks a JSON-like fragment: first split the members on ',', then split individual
// members on ':' and finally split an array value on ',' with and without the
// bracket preserver.
[Test]
public void SplitTestJson()
{
    const string json = @"
""blah"":""value"",
""blah2"":""va,lue2"",
""blah3"":{
""blah4"":""value"",
""blah5"":[""v,alue"",""va,lue"",""value""]
},
""blah4"":[""value"",""va,lue"",""value""]";

    var quotes = new Splitter.Preserver { Start = '"', End = '"', Escape = '\\' };
    var braces = new Splitter.Preserver { Start = '{', End = '}', Escape = '\\' };
    var brackets = new Splitter.Preserver { Start = '[', End = ']', Escape = '\\' };

    // Top level: four members, commas inside quotes/braces/brackets are ignored.
    var members = Splitter.Split(json, ',', '\0', quotes, braces, brackets);
    Assert.That(members.Length, Is.EqualTo(4));

    // "blah3": { ... } -> name and object value.
    var objectMember = Splitter.Split(members[2].Trim(), ':', '\0', quotes, braces, brackets);
    Assert.That(objectMember.Length, Is.EqualTo(2));

    // "blah4": [ ... ] -> name and array value.
    var arrayMember = Splitter.Split(members[3].Trim(), ':', '\0', quotes, braces, brackets);
    Assert.That(arrayMember.Length, Is.EqualTo(2));

    var arrayText = arrayMember[1].Trim();

    // With the bracket preserver the whole array stays in one piece.
    var keptWhole = Splitter.Split(arrayText, ',', '\0', quotes, braces, brackets);
    Assert.That(keptWhole.Length, Is.EqualTo(1));

    // Without it the array is split into its three elements; quoted commas still hold.
    var elements = Splitter.Split(arrayText, ',', '\0', quotes, braces);
    Assert.That(elements.Length, Is.EqualTo(3));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment