Last active
October 12, 2016 08:00
-
-
Save pirrmann/9ed0981c33158111939ba2550dc566de to your computer and use it in GitHub Desktop.
Titanic CSV loading
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Load the training CSV that sits next to this script as an array holding one
// string per line of the file.
// ReadAllLines is used instead of ReadAllText because `file` is consumed below
// with `Array.map parseLine`: Array.map needs an array, and parseLine handles
// a single line at a time.
let file = System.IO.File.ReadAllLines(__SOURCE_DIRECTORY__ + "/training.csv")
/// Splits one CSV line into its fields.
///
/// A field that starts with a double quote is read in quoted mode: a doubled
/// quote ("") stands for one literal quote, and the field ends at a quote that
/// is followed by a comma or by the end of the line. Any other field runs up to
/// the next comma or the end of the line. Raises (via failwith) with the
/// message "The file is malformed!" when a quoted field is never closed.
let parseLine line =
    // Characters of the field in progress are collected newest-first, so they
    // are reversed once when the field is complete.
    let toField (reversed: char list) =
        System.String(reversed |> List.rev |> List.toArray)

    // `fields` holds the completed fields, newest first.
    let rec startField fields remaining =
        match remaining with
        | '\"' :: rest -> quoted fields [] rest
        | _ -> bare fields [] remaining

    // Inside a quoted field.
    and quoted fields current remaining =
        match remaining with
        | '\"' :: '\"' :: rest ->
            // Doubled quote: keep a single literal quote.
            quoted fields ('\"' :: current) rest
        | '\"' :: ',' :: rest ->
            // Closing quote followed by a separator: next field begins.
            startField (toField current :: fields) rest
        | [ '\"' ] ->
            // Closing quote at the very end of the line.
            toField current :: fields
        | ch :: rest ->
            quoted fields (ch :: current) rest
        | [] ->
            failwith "The file is malformed!"

    // Inside an unquoted field.
    and bare fields current remaining =
        match remaining with
        | [] -> toField current :: fields
        | ',' :: rest -> startField (toField current :: fields) rest
        | ch :: rest -> bare fields (ch :: current) rest

    line
    |> List.ofSeq
    |> startField []
    |> List.rev
    |> Array.ofList
// Run parseLine over every element of `file`, giving one array of fields per element.
Array.map parseLine file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Regex-based CSV line splitter: takes one line and returns its fields.
/// The regex is built once and captured by the returned function, so it is
/// not reconstructed on every call.
let parseLine2 =
    // Each match is one field: either a quoted field (group "escaped", which
    // may contain doubled quotes) or an unquoted one (group "unescaped").
    let fieldPattern =
        System.Text.RegularExpressions.Regex("(^|,)((\"(?<escaped>((\"\"|[^\"])*))\")|(?<unescaped>([^\",]*)))(?=($|,))")

    // Extracts the field text from a single match; for quoted fields every
    // doubled quote is collapsed back into one quote character.
    let fieldText (m: System.Text.RegularExpressions.Match) =
        let bare = m.Groups.["unescaped"]
        if bare.Success then
            bare.Value
        else
            m.Groups.["escaped"].Value.Replace("\"\"", "\"")

    fun (line: string) ->
        fieldPattern.Matches(line)
        |> Seq.cast<System.Text.RegularExpressions.Match>
        |> Seq.map fieldText
        |> Seq.toArray
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#r "../packages/FSharp.Data/lib/net40/FSharp.Data.dll" | |
open FSharp.Data | |
// Whole contents of the training CSV next to this script, as a single string.
let file =
    __SOURCE_DIRECTORY__ + "/training.csv"
    |> System.IO.File.ReadAllText
// CSV type provider instantiated with "training.csv" as its sample and with
// the header-row option switched on.
type Titanic = CsvProvider<Sample = "training.csv", HasHeaders = true>
// The Name column of every parsed row, collected into an array.
let lines =
    [| for row in Titanic.Parse(file).Rows -> row.Name |]
// Print the collected names using F# structural formatting.
lines |> printf "%A"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment