// AngryCSVparser.cs
// Gist: Estecka/604934df6b6fc441fe934026f665fc59 (last active September 2, 2020 14:28).
// Note: this file contains bidirectional Unicode text that may be interpreted or compiled
// differently than what appears below. To review, open the file in an editor that reveals
// hidden Unicode characters.
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
namespace Estecka {
    /// <summary>
    /// A parser for those CSV with quoted newlines that tend to mess up other parsers.
    /// </summary>
    public static class AngryCSVParser {
        /// <summary>Character that escapes the next string delimiter or escape character.</summary>
        private const char EscapeCharacter = '\\';
        /// <summary>Character that opens and closes a quoted value.</summary>
        private const char StringDelimiter = '\"';
        /// <summary>Character that separates two rows of the CSV.</summary>
        private const char RowSeparator = '\n';
        /// <summary>Character that separates two cells of a row.</summary>
        private const char CellSeparator = ',';

        // "\r\n" is listed first so that a Windows newline is consumed as a single match;
        // any '\r' or '\n' that remains is a lone newline, so no lookaround is needed.
        private static readonly Regex NewlineRegex = new Regex(
            @"\r\n|\r|\n",
            RegexOptions.CultureInvariant
        );

        /// <summary>
        /// Replaces all newlines in a string with the provided format.
        /// </summary>
        /// <param name="input">The string that needs formatting</param>
        /// <param name="lineFormat">Should be either "\n", "\r", or "\r\n"</param>
        /// <returns>A copy of <paramref name="input"/> where every "\r\n", "\r" and "\n" is replaced with <paramref name="lineFormat"/>.</returns>
        public static string FormatNewlines(string input, string lineFormat) {
            return NewlineRegex.Replace(input, lineFormat);
        }

        /// <summary>
        /// Parses a string by breaking it into pieces, then gluing back together those that should not have been broken.
        /// <br/> Although it was originally written to break a csv into rows, it turns out the same logic can be used to break rows into cells.
        /// </summary>
        /// <param name="whole">The string to break down.</param>
        /// <param name="separator">The character to use as a separator.</param>
        /// <param name="discardEmpty">If true, empty pieces will be discarded.</param>
        /// <returns>The components, with string delimiters and escape characters left untouched.</returns>
        private static List<string> ParseComponents(string whole, char separator, bool discardEmpty) {
            string[] pieces = whole.Split(separator);
            List<string> components = new List<string>(pieces.Length);
            StringBuilder current = new StringBuilder();

            // When true, it means we're parsing a quoted string data within the csv.
            // As such, separators are ignored and treated as data.
            // This stays true until another non-escaped quote is met.
            bool isInline = false;

            for (int i = 0; i < pieces.Length; i++) {
                string piece = pieces[i];

                if (isInline) {
                    // The previous separator was discarded by Split, but is actually part of
                    // a quoted value, so we restore it and keep filling the same component.
                    current.Append(separator);
                }
                else if (i > 0) {
                    // The previous component is complete; start a new one.
                    components.Add(current.ToString());
                    current.Length = 0;
                }

                // When true, the next character is escaped and cannot toggle the quoted state.
                // It is reset for every piece: an escape character right before a separator
                // does not escape anything.
                bool isEscaped = false;
                foreach (char c in piece) {
                    if (isEscaped) {
                        isEscaped = false;
                    }
                    else if (c == EscapeCharacter) {
                        isEscaped = true;
                    }
                    else if (c == StringDelimiter) {
                        isInline = !isInline;
                    }
                }

                current.Append(piece);
            }

            // Split always yields at least one piece, so there is always a pending component.
            components.Add(current.ToString());

            if (discardEmpty) {
                components.RemoveAll(c => string.IsNullOrEmpty(c));
            }
            return components;
        }

        /// <summary>
        /// Breaks a CSV into rows, keeping quoted newlines inside their row and dropping empty rows.
        /// </summary>
        private static List<string> BreakToRows(string csv) {
            List<string> rows = ParseComponents(csv, RowSeparator, discardEmpty: false);

            // Rows are only split on '\n', so "\r\n" files leave a trailing '\r' on each row.
            // Strip it before discarding empties, otherwise a blank line ("\r") survives as a
            // bogus single-cell row. Cells are whitespace-trimmed anyway, so values are unaffected.
            for (int i = 0; i < rows.Count; i++) {
                rows[i] = rows[i].TrimEnd('\r');
            }
            rows.RemoveAll(r => string.IsNullOrEmpty(r));
            return rows;
        }

        /// <summary>
        /// Breaks a single row into cells, keeping quoted cell separators inside their cell.
        /// </summary>
        private static List<string> BreakToCells(string line) {
            return ParseComponents(line, CellSeparator, discardEmpty: false);
        }

        /// <summary>
        /// Trims whitespaces and the enclosing string delimiters from a cell's value.
        /// <br/>At most one delimiter is removed from each end, so that an escaped or doubled
        /// delimiter at the edge of a quoted value is not eaten. Escape characters are left as they are.
        /// </summary>
        private static string CleanCell(string cell) {
            string trimmed = cell.Trim();
            int start = 0;
            int length = trimmed.Length;

            if (length > 0 && trimmed[0] == StringDelimiter) {
                start = 1;
                length--;
            }
            if (length > 0 && trimmed[start + length - 1] == StringDelimiter) {
                length--;
            }
            return trimmed.Substring(start, length);
        }

        /// <summary>
        /// Turn every row from a CSV into a list.
        /// </summary>
        /// <param name="csv">The CSV content. Null is treated like an empty string.</param>
        /// <returns>One list of cleaned cell values per non-empty row.</returns>
        public static List<List<string>> CsvToLists(string csv) {
            if (string.IsNullOrEmpty(csv)) {
                return new List<List<string>>();
            }

            List<string> rows = BreakToRows(csv);
            var results = new List<List<string>>(rows.Count);
            foreach (string row in rows) {
                List<string> cells = BreakToCells(row);
                for (int i = 0; i < cells.Count; i++) {
                    cells[i] = CleanCell(cells[i]);
                }
                results.Add(cells);
            }
            return results;
        }

        /// <summary>
        /// Turn every row from a CSV into a dictionary.
        /// <br/>The first row is actually discarded, and its values used as keys for every other dictionary.
        /// <br/>Cells beyond the number of keys are ignored. When keys are duplicated, the last cell wins.
        /// </summary>
        /// <param name="csv">The CSV content. Null is treated like an empty string.</param>
        /// <returns>One dictionary per non-empty row after the header; an empty array if there is no header row.</returns>
        public static Dictionary<string, string>[] CsvToHashtables(string csv) {
            if (string.IsNullOrEmpty(csv)) {
                return new Dictionary<string, string>[0];
            }

            List<string> rows = BreakToRows(csv);
            if (rows.Count == 0) {
                // Nothing but blank lines: there is no header row to read keys from.
                return new Dictionary<string, string>[0];
            }

            List<string> keys = BreakToCells(rows[0]);
            for (int i = 0; i < keys.Count; i++) {
                keys[i] = CleanCell(keys[i]);
            }

            var results = new Dictionary<string, string>[rows.Count - 1];
            for (int line = 1; line < rows.Count; line++) {
                List<string> row = BreakToCells(rows[line]);
                var entry = new Dictionary<string, string>(row.Count);
                for (int cell = 0; cell < row.Count && cell < keys.Count; cell++) {
                    string label = keys[cell] ?? string.Empty;
                    entry[label] = CleanCell(row[cell]);
                }
                results[line - 1] = entry;
            }
            return results;
        }
    }
}
// Sign up for free to join this conversation on GitHub.
// Already have an account? Sign in to comment.