Last active
November 2, 2020 07:16
-
-
Save jbtule/83e330335b1afc924d508ce49b0927fc to your computer and use it in GitHub Desktop.
My Shortest CSV Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(*
 * This work (My Shortest CSV Parser:CsvBasic.fs by James Tuley),
 * identified by James Tuley, is free of known copyright restrictions
 * Source: https://gist.github.com/jbtule/83e330335b1afc924d508ce49b0927fc
 * http://creativecommons.org/publicdomain/mark/1.0/
 *)
/// Eager CSV parser: consumes a whole TextReader and returns every row at once.
module CsvBasic

open System.IO

type SB = System.Text.StringBuilder

/// Quoting state of the cell currently being read.
/// NOTE: the `None` case shadows `Option.None` for the rest of this module.
[<Struct>]
type internal Quote =
    /// The cell is unquoted (or nothing has been read for it yet).
    | None
    /// Inside a quoted cell; every character except '"' is taken literally.
    | Open
    /// A '"' was read while Open: either the cell just ended, or a second
    /// '"' follows and the pair is an escaped quote.
    | Closed

/// Accumulator threaded through the character fold.
[<Struct>]
type internal CsvState =
    { /// Completed rows, most recent first.
      Table: string list list
      /// Completed cells of the current row, most recent first.
      Row: string list
      /// Characters of the cell being read.
      Cell: SB
      /// Quoting state of the cell being read.
      Quote: Quote }

/// Reads all remaining characters from `read` and parses them as CSV.
///
/// Returns the rows in input order, each row being its cells in input order.
/// Cells are separated by ',', rows by '\n'; '\r' outside quotes is ignored.
/// A cell may be wrapped in double quotes, inside which ',' and line breaks
/// are literal and `""` stands for one '"'. Empty cells are kept (including
/// leading ones); completely blank lines produce no row. The final row does
/// not need a trailing newline.
///
/// Raises (via `failwithf`) when a '"' appears inside an unquoted cell, or
/// when a character other than ',' / '\r' / '\n' / '"' follows a closing quote.
/// An unterminated quoted cell at end of input is accepted as-is.
let parseCsv (read: TextReader) =
    let empty () = { Table = []; Row = []; Cell = SB(); Quote = None }

    // A separator always terminates a cell, even an empty one. (Dropping empty
    // cells here would shift every following column of the row to the left.)
    let finalizeCell (cs: CsvState) =
        { empty () with Table = cs.Table; Row = cs.Cell.ToString() :: cs.Row }

    // A line is blank when nothing at all was read for it. A lone quoted empty
    // cell ("") leaves Quote = Closed and therefore still counts as a row.
    let isBlankLine (cs: CsvState) =
        match cs.Row, cs.Quote with
        | [], Closed -> false
        | [], _ -> cs.Cell.Length = 0
        | _ -> false

    let finalizeRow (cs: CsvState) =
        if isBlankLine cs then
            { empty () with Table = cs.Table }
        else
            let withLastCell = finalizeCell cs
            { empty () with Table = List.rev withLastCell.Row :: withLastCell.Table }

    // Flushes a last row that has no trailing newline, then restores input order.
    let finalizeTable (cs: CsvState) = (finalizeRow cs).Table |> List.rev

    let step (cs: CsvState) (c: char) =
        // 1-based (row, column) of the current cell; only computed for error messages.
        let pos = lazy (cs.Table.Length + 1, cs.Row.Length + 1)
        let includeWhileQuoteIs q = { cs with Cell = cs.Cell.Append(c); Quote = q }

        match struct (c, cs.Quote) with
        | struct ('\"', None) when cs.Cell.Length = 0 -> { cs with Quote = Open }
        | struct ('\"', None) -> failwithf "Unescaped quote in cell %A" pos.Value
        | struct ('\"', Open) -> { cs with Quote = Closed }
        // `""` inside quotes is an escaped quote; anything else inside quotes is literal.
        | struct ('\"', Closed)
        | struct (_, Open) -> includeWhileQuoteIs Open
        | struct (',', (Closed | None)) -> finalizeCell cs
        | struct ('\r', (Closed | None)) -> cs
        | struct ('\n', (Closed | None)) -> finalizeRow cs
        | struct (_, Closed) -> failwithf "Extra char '%c' outside of cell %A" c pos.Value
        | struct (_, None) -> includeWhileQuoteIs None

    seq { while read.Peek() >= 0 do yield read.Read() |> char }
    |> Seq.fold step (empty ())
    |> finalizeTable

/// Opens the file at `path` read-only (other handles may keep it open for
/// reading and writing), decodes it with `enc` without byte-order-mark
/// detection, and parses the whole file with `parseCsv`.
let parseCsvFile enc path =
    // Two `use` bindings so the FileStream is released even if constructing
    // the StreamReader throws.
    use file = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)
    use reader = new StreamReader(file, encoding = enc, detectEncodingFromByteOrderMarks = false)
    parseCsv reader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(*
 * This work (My Shortest CSV Parser:CsvLazy.fs by James Tuley),
 * identified by James Tuley, is free of known copyright restrictions
 * Source: https://gist.github.com/jbtule/83e330335b1afc924d508ce49b0927fc
 * http://creativecommons.org/publicdomain/mark/1.0/
 *)
/// Lazy CSV parser: rows are produced one at a time while the reader is consumed.
module CsvLazy

open System.IO

type SB = System.Text.StringBuilder

/// Quoting state of the cell currently being read.
/// NOTE: the `None` case shadows `Option.None` for the rest of this module.
[<Struct>]
type internal Quote =
    /// The cell is unquoted (or nothing has been read for it yet).
    | None
    /// Inside a quoted cell; every character except '"' is taken literally.
    | Open
    /// A '"' was read while Open: either the cell just ended, or a second
    /// '"' follows and the pair is an escaped quote.
    | Closed

/// Parser state carried from one character to the next.
[<Struct>]
type internal CsvState =
    { /// The row completed by the last row break, in input order; [] if none.
      Yield: string list
      /// Number of rows completed so far (blank lines are not counted).
      Len: int
      /// Completed cells of the current row, most recent first.
      Row: string list
      /// Characters of the cell being read.
      Cell: SB
      /// Quoting state of the cell being read.
      Quote: Quote }

/// Lazily parses CSV text from `read`, yielding each row (its cells in input
/// order) as soon as the row is complete.
///
/// Cells are separated by ',', rows by '\n'; '\r' outside quotes is ignored.
/// A cell may be wrapped in double quotes, inside which ',' and line breaks
/// are literal and `""` stands for one '"'. Empty cells are kept (including
/// leading ones); completely blank lines yield nothing. A final row without a
/// trailing newline is yielded when the input ends.
///
/// The reader is only consumed while the sequence is enumerated, so it must
/// stay open until enumeration finishes. Errors surface during enumeration:
/// `failwithf` is raised when a '"' appears inside an unquoted cell, or when a
/// character other than ',' / '\r' / '\n' / '"' follows a closing quote.
let parseCsv (read: TextReader) =
    let empty len = { Yield = []; Len = len; Row = []; Cell = SB(); Quote = None }

    // A separator always terminates a cell, even an empty one. (Dropping empty
    // cells here would shift every following column of the row to the left.)
    let finalizeCell (cs: CsvState) =
        { empty cs.Len with Row = cs.Cell.ToString() :: cs.Row }

    // A line is blank when nothing at all was read for it. A lone quoted empty
    // cell ("") leaves Quote = Closed and therefore still counts as a row.
    let isBlankLine (cs: CsvState) =
        match cs.Row, cs.Quote with
        | [], Closed -> false
        | [], _ -> cs.Cell.Length = 0
        | _ -> false

    // Non-blank rows always contain at least one cell, so an empty Yield
    // unambiguously means "no row was completed".
    let finalizeRow (cs: CsvState) =
        if isBlankLine cs then
            empty cs.Len
        else
            { empty (cs.Len + 1) with Yield = List.rev (finalizeCell cs).Row }

    seq {
        let mutable ms = empty 0

        for c in seq { while read.Peek() >= 0 do yield read.Read() |> char } do
            // Immutable snapshot so the closures below never capture the mutable local.
            let cs = ms
            // 1-based (row, column) of the current cell; only computed for error messages.
            let pos = lazy (cs.Len + 1, cs.Row.Length + 1)
            let includeWhileQuoteIs q = { cs with Cell = cs.Cell.Append(c); Quote = q }

            match struct (c, cs.Quote) with
            | struct ('\"', None) when cs.Cell.Length = 0 -> ms <- { cs with Quote = Open }
            | struct ('\"', None) -> failwithf "Unescaped quote in cell %A" pos.Value
            | struct ('\"', Open) -> ms <- { cs with Quote = Closed }
            // `""` inside quotes is an escaped quote; anything else inside quotes is literal.
            | struct ('\"', Closed)
            | struct (_, Open) -> ms <- includeWhileQuoteIs Open
            | struct (',', (Closed | None)) -> ms <- finalizeCell cs
            | struct ('\r', (Closed | None)) -> ()
            | struct ('\n', (Closed | None)) ->
                ms <- finalizeRow cs
                if not (List.isEmpty ms.Yield) then yield ms.Yield
            | struct (_, Closed) -> failwithf "Extra char '%c' outside of cell %A" c pos.Value
            | struct (_, None) -> ms <- includeWhileQuoteIs None

        // Flush a last row that is not terminated by a newline.
        let last = finalizeRow ms
        if not (List.isEmpty last.Yield) then yield last.Yield
    }

/// Lazily parses the file at `path`: the file is opened read-only (other
/// handles may keep it open for reading and writing) when enumeration starts,
/// decoded with `enc` without byte-order-mark detection, and closed when
/// enumeration finishes or the enumerator is disposed.
let parseCsvFile enc path =
    seq {
        // Two `use` bindings so the FileStream is released even if constructing
        // the StreamReader throws.
        use file = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)
        use reader = new StreamReader(file, encoding = enc, detectEncodingFromByteOrderMarks = false)
        yield! parseCsv reader
    }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment