Last active
August 22, 2018 00:23
-
-
Save chanibal/c9d77585f8bcca1ea8894e25352764c5 to your computer and use it in GitHub Desktop.
A simple utility for normalizing CSV input
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define debuglog | |
using System; | |
using System.Diagnostics; | |
using System.Text; | |
using Microsoft.VisualStudio.TestTools.UnitTesting; | |
/// <summary>
/// Normalizes CSV lines to a fully escaped form - for use in database imports.
/// Every field is wrapped in single quotes, and a single quote inside a field is
/// doubled so the result stays a valid quoted literal.
/// </summary>
/// <example>
/// a,b,c             -> 'a','b','c'
/// a,"b,c",d         -> 'a','b,c','d'
/// a,it's smth,c     -> 'a','it''s smth','c'
/// a,"say ""hi""",c  -> 'a','say "hi"','c'
/// </example>
/// <remarks>
/// The parser state lives in instance fields that <see cref="Escape"/> resets and
/// mutates, so a single instance must not be used from several threads at once.
/// </remarks>
sealed class CSVEscaper
{
    /// <summary>Parser states of the per-character state machine.</summary>
    private enum State
    {
        start,     // at the beginning of a field, nothing emitted for it yet
        intext,    // inside an unquoted field
        inquote,   // inside a double-quoted field
        endquote   // just read a double quote while inside a quoted field
    }

    const char inputquote = '"';   // quote character recognized in the input
    const char fieldquote = '\'';  // quote character written around every output field
    const char separator = ',';    // field separator, identical in input and output

    private readonly StringBuilder _output = new StringBuilder();
    private State _st;

    private static bool IsQuote(char c) { return c == inputquote; }

    private static bool IsSep(char c) { return c == separator; }

    /// <summary>Appends one character to the result verbatim.</summary>
    private void Output(char c)
    {
#if debuglog
        Log("Outputting " + c);
#endif
        _output.Append(c);
    }

    /// <summary>
    /// Appends one character of field content, doubling it when it is the output
    /// quote character so it cannot terminate the surrounding literal.
    /// </summary>
    private void OutputEscaped(char c)
    {
        if (c == fieldquote)
        {
            Output(fieldquote);
        }
        Output(c);
    }

    /// <summary>Switches the state machine to <paramref name="st"/>.</summary>
    private void St(State st)
    {
#if debuglog
        Log("State " + _st + " -> " + st);
#endif
        _st = st;
    }

#if debuglog
    /// <summary>Writes a diagnostic message to the trace listeners.</summary>
    private void Log(string msg)
    {
        Trace.WriteLine(msg);
    }
#endif

    /// <summary>
    /// Converts one CSV line into its fully quoted and escaped form.
    /// </summary>
    /// <param name="input">A single CSV line; fields may be wrapped in double quotes.</param>
    /// <returns>The fields, each wrapped in single quotes and joined with commas.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    /// <exception cref="FormatException">
    /// A closing double quote is followed by something other than a separator,
    /// another double quote or the end of the line.
    /// </exception>
    public string Escape(string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        _output.Clear();
        _st = State.start;
#if debuglog
        Log("Input: " + input);
#endif
        foreach (char c in input)
        {
#if debuglog
            Log("Char: " + c);
#endif
            switch (_st)
            {
                case State.start:
                    if (IsQuote(c))
                    {
                        // quoted field
                        St(State.inquote);
                        Output(fieldquote);
                    }
                    else if (IsSep(c))
                    {
                        // empty field
                        Output(fieldquote);
                        Output(fieldquote);
                        Output(separator);
                        St(State.start);
                    }
                    else
                    {
                        // unquoted field; its first character needs escaping too
                        St(State.intext);
                        Output(fieldquote);
                        OutputEscaped(c);
                    }
                    break;

                case State.inquote:
                    if (IsQuote(c))
                    {
                        // either the closing quote or the first half of an escaped quote
                        St(State.endquote);
                    }
                    else
                    {
                        OutputEscaped(c);
                    }
                    break;

                case State.endquote:
                    if (IsQuote(c))
                    {
                        // "" inside a quoted field is one literal quote; the field continues
                        Output(c);
                        St(State.inquote);
                    }
                    else if (IsSep(c))
                    {
                        // the previous quote really closed the field
                        Output(fieldquote);
                        Output(separator);
                        St(State.start);
                    }
                    else
                    {
                        throw new FormatException("Illegal quote end sequence");
                    }
                    break;

                case State.intext:
                    if (IsSep(c))
                    {
                        Output(fieldquote);
                        Output(separator);
                        St(State.start);
                    }
                    else
                    {
                        OutputEscaped(c);
                    }
                    break;
            }
        }

        // Ending in the start state means the line ended with an empty field
        // (or was empty), so that field still needs its opening quote.
        if (_st == State.start)
        {
            Output(fieldquote);
        }

        // Close the current field. An unterminated quoted field is closed here as well.
        Output(fieldquote);
#if debuglog
        Log("Return: " + _output);
#endif
        return _output.ToString();
    }
}
/// Unit tests for CSVEscaper. To run them:
/// $ dotnet new mstest
/// $ dotnet test
[TestClass]
public class CSVEscaperUnitTest
{
    // Recreated before every test by TestInitialize.
    private CSVEscaper _sut;

    [TestInitialize]
    public void TestInitialize()
    {
        _sut = new CSVEscaper();
    }

    // Escapes the raw CSV line and compares the result with the expected text.
    void Check(string expected, string unescaped)
    {
        string actual = _sut.Escape(unescaped);
        Assert.AreEqual(expected, actual);
    }

    // Runs Check for each { expected, input } pair, in order.
    void CheckAll(string[][] cases)
    {
        foreach (string[] pair in cases)
        {
            Check(pair[0], pair[1]);
        }
    }

    [TestMethod]
    public void TestCSV()
    {
        // plain unquoted fields
        Check("'xyz','a','b','c'", "xyz,a,b,c");
    }

    [TestMethod]
    public void TestCSVQuote()
    {
        // double quotes around a field are dropped
        Check("'a','b','c'", "a,\"b\",c");
    }

    [TestMethod]
    public void TestCSVQuoteWithComma()
    {
        // a comma inside a quoted field is part of the field
        Check("'a','b,x','c'", "a,\"b,x\",c");
    }

    [TestMethod]
    public void TestCSVSingleQuote()
    {
        // a single quote in the data is doubled
        Check("'a','it''s smth','c'", "a,it's smth,c");
    }

    [TestMethod]
    public void TestCSVEmpty()
    {
        // empty fields, unquoted and quoted
        CheckAll(new[]
        {
            new[] { "'','',''", ",," },
            new[] { "'','',''", "\"\",\"\",\"\"" },
        });
    }

    [TestMethod]
    public void TestCSVEmptyVals()
    {
        // empty fields mixed with non-empty ones
        CheckAll(new[]
        {
            new[] { "'v',''", "v," },
            new[] { "'','v'", ",v" },
            new[] { "'v','','v'", "v,,v" },
            new[] { "'','v',''", ",v," },
        });
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment