Created
January 20, 2018 13:56
-
-
Save leidegre/eb197e71bdfaba95907ea949b86e5ce9 to your computer and use it in GitHub Desktop.
CSV parser compliant with RFC 4180. It can be customized to normalize new lines, and it handles new lines inside quoted string literals without reading all of the text up front.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Line-oriented CSV parser. Input is consumed one line at a time, so a quoted
/// field may span several lines without the whole text being read up front.
/// </summary>
/// <remarks>
/// Every field value is passed through <c>Trim()</c> before it is stored. That is
/// more lenient than RFC 4180, which treats surrounding spaces as part of the field.
/// </remarks>
static class CSV
{
    /// <summary>One parsed CSV record: the field values of a single row.</summary>
    public struct Record
    {
        /// <summary>Field values in column order.</summary>
        public readonly string[] Row;

        /// <summary>Returns the field at the given zero-based column index.</summary>
        public string this[int index] => Row[index];

        public Record(string[] row)
        {
            Row = row;
        }
    }

    /// <summary>Parses CSV held in a string.</summary>
    /// <param name="text">The CSV text.</param>
    /// <returns>The parsed records, in input order.</returns>
    public static List<Record> ParseText(string text)
    {
        return Parse(new StringReader(text));
    }

    /// <summary>Opens the file <paramref name="fn"/> as text and parses it as CSV.</summary>
    /// <param name="fn">Path of the file to read.</param>
    /// <returns>The parsed records, in input order.</returns>
    public static List<Record> ParseFile(string fn)
    {
        using (var reader = File.OpenText(fn))
        {
            return Parse(reader);
        }
    }

    /// <summary>Reads <paramref name="reader"/> line by line and parses it as CSV.</summary>
    /// <param name="reader">Source of the CSV text; it is read until it returns no more lines.</param>
    /// <returns>
    /// The parsed records, in input order. An empty input line produces a record with
    /// zero fields. If the input ends inside a quoted field, the text collected so far
    /// is kept as the last field of a final record instead of being discarded.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="reader"/> is null.</exception>
    public static List<Record> Parse(TextReader reader)
    {
        if (reader == null)
        {
            throw new System.ArgumentNullException(nameof(reader));
        }

        var data = new List<Record>();
        var col = new StringBuilder();
        var row = new List<string>();

        // Tracked explicitly: a quoted field can be open while `col` is still empty
        // (for example when the opening quote is the last character on the line).
        bool inQuote = false;

        for (; ; )
        {
            var ln = reader.ReadLine();
            if (ln == null) break;
            if (TokenizeCore(ln, col, row, ref inQuote))
            {
                data.Add(new Record(row.ToArray()));
                row.Clear();
            }
        }

        if (inQuote)
        {
            // Input ended before the closing quote: keep what was read rather than
            // silently dropping the whole record.
            row.Add(col.ToString().Trim());
            col.Length = 0;
            data.Add(new Record(row.ToArray()));
            row.Clear();
        }

        return data;
    }

    /// <summary>
    /// Tokenizes one line of CSV, appending completed fields to <paramref name="row"/>.
    /// </summary>
    /// <param name="s">The line to tokenize, without its line terminator.</param>
    /// <param name="col">
    /// Scratch buffer for the field being built. If it is non-empty on entry, the line is
    /// treated as the continuation of a quoted field left open by the previous call.
    /// </param>
    /// <param name="row">Receives the fields of the current record.</param>
    /// <returns>
    /// <c>true</c> when the record is complete; <c>false</c> when the line ended inside a
    /// quoted field and the next line must be passed in to continue it.
    /// </returns>
    /// <remarks>
    /// Because the open-quote state is inferred from <paramref name="col"/> alone, this
    /// overload cannot recognise a continuation whose quoted content is still empty.
    /// <see cref="Parse"/> tracks the state explicitly and does not have that limitation.
    /// </remarks>
    public static bool Tokenize(string s, StringBuilder col, List<string> row)
    {
        bool inQuote = col.Length > 0;
        return TokenizeCore(s, col, row, ref inQuote);
    }

    /// <summary>
    /// Consumes the body of a quoted field starting at <paramref name="i"/> (just after the
    /// opening quote, or at the start of a continuation line), appending it to
    /// <paramref name="col"/>. A doubled quote is appended as a single quote character.
    /// </summary>
    /// <param name="s">The line being tokenized.</param>
    /// <param name="i">Current position in <paramref name="s"/>; advanced past what was consumed.</param>
    /// <param name="col">Buffer receiving the field content.</param>
    /// <param name="row">Not used by this method.</param>
    /// <returns>
    /// <c>true</c> if the closing quote was found (<paramref name="i"/> then points just past
    /// it); <c>false</c> if the line ended while the quoted field was still open.
    /// </returns>
    public static bool TokenizeQuote(string s, ref int i, StringBuilder col, List<string> row)
    {
        while (i < s.Length)
        {
            var ch = s[i];
            if (ch != '"')
            {
                col.Append(ch);
                i++;
                continue;
            }

            // Two consecutive quotes are the escape sequence for one literal quote.
            if (i + 1 < s.Length && s[i + 1] == '"')
            {
                col.Append('"');
                i += 2;
                continue;
            }

            i++; // step over the closing quote
            return true;
        }
        return false;
    }

    // Tokenizes one line; `inQuote` carries the "quoted field still open" state between lines.
    private static bool TokenizeCore(string s, StringBuilder col, List<string> row, ref bool inQuote)
    {
        int i = 0;

        // A non-empty line, or the continuation of an open quoted field, always ends with
        // one final field, even when that field is empty ("a,b," or "a,\"\"").
        bool hasField = inQuote || s.Length > 0;

        if (inQuote)
        {
            col.AppendLine(); // the line break belongs to the quoted field
            if (!TokenizeQuote(s, ref i, col, row))
            {
                return false; // still inside the quoted field
            }
            inQuote = false;
        }

        while (i < s.Length)
        {
            var ch = s[i];
            if (ch == ',')
            {
                row.Add(col.ToString().Trim());
                col.Length = 0;
                i++;
            }
            else if (ch == '"')
            {
                i++;
                if (!TokenizeQuote(s, ref i, col, row))
                {
                    inQuote = true;
                    return false;
                }
            }
            else
            {
                col.Append(ch);
                i++;
            }
        }

        if (hasField)
        {
            row.Add(col.ToString().Trim());
            col.Length = 0;
        }
        return true;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment