Skip to content

Instantly share code, notes, and snippets.

@mathias-brandewinder
Created October 28, 2014 05:01
Show Gist options
  • Save mathias-brandewinder/07a94a584973f5155cf1 to your computer and use it in GitHub Desktop.
Save mathias-brandewinder/07a94a584973f5155cf1 to your computer and use it in GitHub Desktop.
Digits & Streams (experimenting)
#r @"../packages/Streams.0.2.0/lib/Streams.Core.dll"
open Nessos.Streams.Core
open System.IO
(*
Original file here: 50,000 scans of digits
Col 1 is the actual number, rest is 28x28 pixels, grayscale encoded
http://1drv.ms/ZWkcX6
*)
// Location of the digits CSV on disk. This is a hard-coded absolute path:
// change it to wherever train.csv was saved before running the script.
let path = @"c:/users/mathias brandewinder/desktop/train.csv"
// Basic read: eager, array-based pipeline.
// Loads all lines of the file at `path`, drops the first line (presumably the
// CSV header), and converts every remaining line into a (label, pixels) pair:
// the first comma-separated value is the label, the rest are the pixels.
// Every stage below materializes a complete intermediate array.
let reader path =
    // all lines except the first one
    let rows = (File.ReadAllLines path).[1..]
    // split each row into its comma-separated fields
    let cells = rows |> Array.map (fun row -> row.Split(','))
    // parse every field as an int
    let numbers = cells |> Array.map (fun fields -> fields |> Array.map int)
    // first value is the label, the remainder are the pixel values
    numbers |> Array.map (fun values -> values.[0], values.[1..])
// Loads the data set with the array-based `reader`.
// Timing noted by the author for this call (the format looks like
// F# Interactive's #time output -- TODO confirm):
// Real: 00:00:09.046, CPU: 00:00:09.531, GC gen0: 275, gen1: 262, gen2: 14
let data = reader path
// Stream-based read: produces the same (label, pixels) pairs as `reader`,
// but pushes the lines through a Nessos Stream pipeline and only builds
// the final array at the end.
let streamer path =
    // split a CSV line and parse every field as an int
    let toNumbers (line: string) = line.Split(',') |> Array.map int
    // first value is the label, the remainder are the pixel values
    let toExample (values: int[]) = values.[0], values.[1..]
    let lines = File.ReadAllLines path
    lines
    |> Stream.ofArray
    |> Stream.skip 1 // drop the first line (presumably the CSV header)
    |> Stream.map toNumbers
    |> Stream.map toExample
    |> Stream.toArray
// Loads the data set with the stream-based `streamer`; this is the copy
// the rest of the script slices into training and validation sets.
// Timing noted by the author for this call:
// Real: 00:00:05.001, CPU: 00:00:04.781, GC gen0: 265, gen1: 261, gen2: 4
let streamed = streamer path
// Separating training & validation set:
// the first 40,000 examples (indices 0..39999) are used for training,
// everything from index 40000 onwards is held out for validation.
let training = streamed.[..39999]
let validation = streamed.[40000..]
// Squared Euclidean distance between two sequences of ints:
// the sum over paired elements of (x - y)^2.
// Elements are paired positionally; if the sequences differ in length,
// the extra elements of the longer one are ignored.
let distance X Y =
    // squared difference of one pair of values
    let squaredGap a b =
        let gap = a - b
        gap * gap
    Seq.map2 squaredGap X Y |> Seq.sum
// A classifier takes an int array (in this script: the pixel values of
// one image) and returns an int (the predicted label).
type Classifier = int [] -> int
(*
KNN classifier: for a given image, find the
k closest, look up their label, and predict
the most frequent one.
*)
// Sequence-based KNN. `classicKnn n` returns a classifier that, for an
// image x, predicts the label seen most often among the n training
// examples closest to x.
let classicKnn n =
    let predict (x: int[]) =
        // order the whole training set by distance to x, nearest first
        let ranked =
            training |> Seq.sortBy (fun (_, pixels) -> distance pixels x)
        // labels of the n nearest neighbors
        let nearestLabels = ranked |> Seq.take n |> Seq.map fst
        // majority vote: count each label and keep the most frequent one
        let winner, _ = nearestLabels |> Seq.countBy id |> Seq.maxBy snd
        winner
    predict
// Stream-based KNN. Same prediction rule as `classicKnn` (most frequent
// label among the n nearest training examples), but the sort / take / map
// steps run through a sequential Nessos Stream pipeline.
let streamKnn n =
    let predict (x: int[]) =
        // labels of the n training examples nearest to x
        let nearestLabels =
            training
            |> Stream.ofArray
            |> Stream.sortBy (fun (_, pixels) -> distance pixels x)
            |> Stream.take n
            |> Stream.map fst
            |> Stream.toArray
        // majority vote: count each label and keep the most frequent one
        let winner, _ = nearestLabels |> Seq.countBy id |> Seq.maxBy snd
        winner
    predict
// Parallel KNN. Same prediction rule as `classicKnn` (most frequent label
// among the n nearest training examples); only the sort by distance runs
// through a ParStream, the remaining steps use ordinary Seq functions.
let parallelKnn n =
    let predict (x: int[]) =
        // whole training set ordered by distance to x, sorted via ParStream
        let ranked =
            training
            |> ParStream.ofArray
            |> ParStream.sortBy (fun (_, pixels) -> distance pixels x)
            |> ParStream.toArray
        // keep the n nearest, then majority vote over their labels
        let winner, _ =
            ranked
            |> Seq.take n
            |> Seq.map fst
            |> Seq.countBy id
            |> Seq.maxBy snd
        winner
    predict
(*
Evaluate a classifier: take n images with known true value
(here 100 for the sake of speed), compare the predicted
value with the actual, and compute the % correctly predicted.
Ideally I would like to run all 10,000.
*)
// Scores a classifier on (at most) the first 100 validation examples.
//   c       : the classifier under test.
//   returns : the fraction, between 0.0 and 1.0, of sampled examples
//             whose predicted label equals the known label.
//   raises  : ArgumentException (from Seq.averageBy) if the validation
//             set is empty.
// Seq.truncate is used instead of Seq.take so that a validation set with
// fewer than 100 examples is scored on what is available rather than
// failing with an InvalidOperationException.
let evaluate (c:Classifier) =
    validation
    |> Seq.truncate 100
    |> Seq.averageBy (fun (label, pixels) ->
        // 1.0 for a correct prediction, 0.0 otherwise
        if c pixels = label then 1. else 0.)
// Sequential evaluation of the three classifiers with k = 10.
// Each timing comment is the author's measurement for the call below it.
// Sequence-based KNN:
// Real: 00:02:54.521, CPU: 00:02:54.421, GC gen0: 9667, gen1: 19, gen2: 2
evaluate (classicKnn 10)
// Stream-based KNN: practically the same timing as the sequence version.
// Real: 00:02:55.599, CPU: 00:02:55.500, GC gen0: 9672, gen1: 21, gen2: 2
evaluate (streamKnn 10)
// Parallel KNN: similar CPU time, but much lower wall-clock (Real) time.
// Real: 00:00:51.649, CPU: 00:02:56.812, GC gen0: 9718, gen1: 2867, gen2: 2
evaluate (parallelKnn 10)
// Parallel counterpart of `evaluate`: scores a classifier on (at most) the
// first 100 validation examples, classifying them through a ParStream.
//   c       : the classifier under test.
//   returns : the fraction, between 0.0 and 1.0, of sampled examples
//             whose predicted label equals the known label.
//   raises  : InvalidOperationException if the validation set is empty.
let parallelEvaluate (c:Classifier) =
    // Take up to 100 examples. The divisor below is the actual sample size,
    // so the score stays correct when fewer than 100 examples are available
    // (previously both the slice and the divisor hard-coded 100).
    let sample = validation |> Seq.truncate 100 |> Seq.toArray
    if sample.Length = 0 then
        invalidOp "parallelEvaluate: the validation set is empty"
    // number of correct predictions, counted as floats
    let hits =
        sample
        |> ParStream.ofArray
        |> ParStream.map (fun (label, pixels) ->
            if c pixels = label then 1. else 0.)
        |> ParStream.sum
    hits / float sample.Length
// Parallel evaluation of the three classifiers with k = 10.
// Each timing comment is the author's measurement for the call below it.
// Sequence-based KNN inside the parallel evaluation:
// Real: 00:00:53.795, CPU: 00:02:56.468, GC gen0: 9689, gen1: 9688, gen2: 1
parallelEvaluate (classicKnn 10)
// Stream-based KNN inside the parallel evaluation:
// Real: 00:00:50.524, CPU: 00:02:53.593, GC gen0: 9695, gen1: 9694, gen2: 1
parallelEvaluate (streamKnn 10)
// Parallel KNN nested inside the parallel evaluation: the slowest of the
// three runs in this group, in both wall-clock and CPU time.
// Real: 00:01:17.829, CPU: 00:03:08.718, GC gen0: 9729, gen1: 9718, gen2: 10
parallelEvaluate (parallelKnn 10)
@palladin
Copy link

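The performance degradation of "parallelEvaluate (parallelKnn 10)" is somewhat expected,
because you are oversubscribing parallel work: the outer ParStream already spreads the validation images across the available cores, and each of those tasks then starts its own parallel sort over the training set, so the nested parallel work competes for the same cores.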

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment