Last active
December 28, 2015 04:29
-
-
Save bohdanszymanik/7442499 to your computer and use it in GitHub Desktop.
Kaggle numeric character recognition in F# using:
1. CsvFile typeprovider to get the data
2. vectors with cosine similarity to determine k nearest neighbours and 3. the ability to display characters with wpf using wpf Rectangles laid out onto a canvas. Based on the machine learning example from Mathias Brandewinder and the coding dojo here: http…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open System | |
#r @"../packages/FSharp.Data.1.1.10/lib/net40/FSharp.Data.dll" | |
#r @"c:\wd\MathDemo\packages\MathNet.Numerics.2.6.2\lib\net40\MathNet.Numerics.dll" | |
#r "../packages/MathNet.Numerics.FSharp.2.6.0/lib/net40/MathNet.Numerics.FSharp.dll" | |
open MathNet.Numerics.LinearAlgebra | |
open MathNet.Numerics.LinearAlgebra.Double | |
(* CsvProvider nice but you can't enumerate the row columns | |
//type trainingChars = FSharp.Data.CsvProvider<"http://brandewinder.blob.core.windows.net/public/trainingsample.csv"> | |
type trainingChars = FSharp.Data.CsvProvider<"c://temp//trainingsample.csv"> | |
//let tChars = trainingChars.Load("http://brandewinder.blob.core.windows.net/public/trainingsample.csv") | |
let tChars = trainingChars.Load(@"c:/temp/trainingsample.csv") | |
tChars.Data |> Seq.head | |
(tChars.Data |> Seq.head).label | |
(tChars.Data |> Seq.head).pixel0 | |
*) | |
// For files as large as these character sets can be, the CsvFile reader seems more sensible than
// the CsvProvider: CsvFile represents each row as a string array and doesn't cache by default.
open FSharp.Data.Csv | |
open FSharp.Data.Csv.Extensions | |
// Alternative source (remote copy of the same sample):
//let tChars = CsvFile.Load("http://brandewinder.blob.core.windows.net/public/trainingsample.csv")
// Open the local training sample with the row-oriented CsvFile reader.
let rawTrainCases =
    let trainingPath = @"c:/temp/trainingsample.csv"
    CsvFile.Load(trainingPath)
(* some sample usage | |
(rawTrainCases.Data |> Seq.head).Columns | |
(rawTrainCases.Data |> Seq.head).GetColumn("pixel1") | |
(rawTrainCases.Data |> Seq.head).Columns.[0] | |
(rawTrainCases.Data |> Seq.head).["pixel1"].AsInteger() | |
*) | |
// Record types representing one labelled data point.

/// A labelled case whose pixel values are kept as a plain int array.
type Number =
    { Label: int
      Pixels: int array }

/// A labelled case whose pixel values are kept as a Math.NET DenseVector,
/// so vector operations such as DotProduct can be applied directly.
type Number1 =
    { Label: int
      Pixels1: DenseVector }
/// Converts one raw CSV row into a Number1 record.
/// raw.[0] is parsed as the integer label; every remaining column is
/// converted to a double and stored as the pixel vector.
let rawCaseToNumber1 (raw:string[]) =
    // Convert the pixel columns once and reuse the result (the previous version
    // converted them twice and discarded the first array).
    let pixels = raw.[1..] |> Array.map Convert.ToDouble
    { Label = int raw.[0]; Pixels1 = DenseVector pixels }
// Training rows as Number1 records. Seq.cache keeps the converted records
// so repeated enumeration does not re-read and re-convert the CSV rows.
let trainCases1 =
    rawTrainCases.Data
    |> Seq.map (fun row -> rawCaseToNumber1 row.Columns)
    |> Seq.cache
// distance measures
// Euclidean approach: returns the squared difference of two values. The square
// root is deliberately skipped to avoid the extra computation.
let euclideanDistance (a:int) (b:int) =
    let delta = a - b
    delta * delta
// another approach - cosine similarity that nicely ranges from (-1 to) 0 to 1
/// Cosine similarity of two vectors: (x . y) / (|x| * |y|).
/// Returns 0.0 when either vector has zero norm; the bare formula would
/// evaluate to NaN there (infinity * 0).
let cosineSimilarity (x:DenseVector) (y:DenseVector) =
    let xx = x.DotProduct(x)
    let yy = y.DotProduct(y)
    if xx = 0.0 || yy = 0.0 then
        0.0
    else
        sqrt (1.0 / xx / yy) * x.DotProduct(y)
// quick check: the two vectors are parallel, so the result should be (approximately) 1.0
cosineSimilarity (DenseVector [|1.0; 2.; 3.|]) (DenseVector [|2.0; 4.; 6.|])
// for any unknown record in test set, find similarity to all records in training set | |
// return n nearest neighbours | |
// Validation sample, read with the same CsvFile reader as the training data.
let rawTestCases = CsvFile.Load(@"c:/temp/validationsample.csv")
// Validation rows as Number1 records, cached after the first enumeration.
let testCases1 =
    seq { for row in rawTestCases.Data -> rawCaseToNumber1 row.Columns }
    |> Seq.cache
/// Finds the (at most) k known cases most similar to unknownCase.
/// Returns (label, cosine similarity) pairs ordered from most to least similar.
/// If knownCases holds fewer than k items, all of them are returned.
let findkNNCases k (knownCases:seq<Number1>) (unknownCase:Number1) =
    knownCases
    |> Seq.map (fun known -> (known.Label, cosineSimilarity known.Pixels1 unknownCase.Pixels1))
    // ascending sort followed by a reversal puts the highest similarity first
    |> Seq.sortBy snd
    |> List.ofSeq
    |> List.rev
    // Seq.truncate (unlike Seq.take) does not throw when fewer than k cases exist
    |> Seq.truncate k
// Classifies every test case by majority vote among its 10 nearest training cases.
// Each element is (test case, nearest neighbours, (predicted label, vote count)).
// To try a smaller run, insert `|> Seq.take 10` after testCases1.
// The result is cached: it is enumerated more than once below (accuracy count and
// failed-case drawing) and each enumeration would otherwise redo the whole search.
let testOurTestCases =
    testCases1
    |> Seq.map (fun n ->
        let closestCases = findkNNCases 10 trainCases1 n
        // count the neighbours per label and keep the label with the most votes
        let majority =
            closestCases
            |> Seq.countBy fst
            |> Seq.maxBy snd
        (n, closestCases, majority))
    |> Seq.cache
// How accurate were we? Tally the cases by whether the predicted label matches the true one.
testOurTestCases
|> Seq.countBy (fun (actual, _, prediction) -> actual.Label = fst prediction)
// Observed result: 94% accurate - 471 of 500 predicted correctly, 29 incorrectly.
// what do the failed test cases look like? | |
#r "WindowsBase" | |
#r "PresentationCore" | |
#r "PresentationFramework" | |
#r "System.Xaml" | |
open System | |
open System.Windows | |
open System.Windows.Controls | |
open System.Windows.Shapes | |
open System.Windows.Media | |
open System.Xaml | |
/// Opens a topmost WPF window titled `title` and paints the pixels of someCase
/// onto a canvas as 10x10 rectangles laid out 28 per row. Each pixel value
/// becomes the blue channel of its rectangle's fill colour.
let drawPixels (someCase:Number1) title =
    let window = new Window(Topmost = true, Width = 280., Height = 308.)
    window.Title <- title
    window.Show()
    let canvas = new Canvas()
    window.Content <- canvas
    someCase.Pixels1
    |> Seq.iteri (fun index intensity ->
        let cell =
            new Rectangle(
                Width = 10.,
                Height = 10.,
                Fill = new SolidColorBrush(Color.FromRgb(0uy, 0uy, byte intensity)))
        canvas.Children.Add(cell) |> ignore
        // column = index mod 28, row = index div 28; each cell is 10 units wide/high
        Canvas.SetLeft(cell, float ((index % 28) * 10))
        Canvas.SetTop(cell, float (index / 28 * 10)))
// draw up the first of our cases
drawPixels (testCases1 |> Seq.head) "first case"
// Draw up to five of the misclassified cases, each window titled with the true label.
// Seq.truncate is used so that fewer than five failures does not raise an exception.
testOurTestCases
|> Seq.filter (fun (n, _, (predicted, _)) -> n.Label <> predicted)
|> Seq.truncate 5
|> Seq.iter (fun (n, _, _) -> drawPixels n (Convert.ToString(n.Label)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment