Created
          October 28, 2014 05:01 
        
      - 
      
- 
        Save mathias-brandewinder/07a94a584973f5155cf1 to your computer and use it in GitHub Desktop. 
    Digits & Streams (experimenting)
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | #r @"../packages/Streams.0.2.0/lib/Streams.Core.dll" | |
| open Nessos.Streams.Core | |
| open System.IO | |
| (* | |
| Original file here: 50,000 scans of digits | |
| Col 1 is the actual number, rest is 28x28 pixels, grayscale encoded | |
| http://1drv.ms/ZWkcX6 | |
| *) | |
| let path = @"c:/users/mathias brandewinder/desktop/train.csv" | |
| // basic read | |
// Eagerly loads the CSV at `path` and turns every line after the
// first one into a (label, pixels) pair: the first column is the
// label, the remaining columns are the pixel values.
let reader path =
    // parse a single comma-separated line of integers
    let parseRow (line: string) =
        let values = line.Split(',') |> Array.map int
        values.[0], values.[1..]
    let lines = File.ReadAllLines path
    // the first line is dropped before parsing
    lines.[1..] |> Array.map parseRow
| // Real: 00:00:09.046, CPU: 00:00:09.531, GC gen0: 275, gen1: 262, gen2: 14 | |
| let data = reader path | |
| // read using streams | |
// Same contract as the array-based reader, but the per-line work
// runs through a Nessos Stream pipeline: skip the first line, split
// on ',', convert to int, and pair the first column (label) with
// the remaining columns (pixels).
let streamer path =
    // convert the split fields to ints and separate label from pixels
    let toExample (fields: string[]) =
        let numbers = Array.map int fields
        numbers.[0], numbers.[1..]
    File.ReadAllLines path
    |> Stream.ofArray
    |> Stream.skip 1
    |> Stream.map (fun (line: string) -> line.Split(','))
    |> Stream.map toExample
    |> Stream.toArray
| // Real: 00:00:05.001, CPU: 00:00:04.781, GC gen0: 265, gen1: 261, gen2: 4 | |
| let streamed = streamer path | |
| // separating training & validation set | |
// The first 40,000 rows are used for training; everything from
// row 40,000 onwards is held out for validation.
let training, validation =
    streamed.[..39999], streamed.[40000..]
// Sum of squared element-wise differences between X and Y.
// Elements are paired positionally; pairing stops at the end of
// the shorter sequence.
let distance X Y =
    Seq.map2 (fun x y -> pown (x - y) 2) X Y
    |> Seq.sum
| type Classifier = int [] -> int | |
| (* | |
| KNN classifier: for a given image, find the | |
| k closest, look up their label, and predict | |
| the most frequent one. | |
| *) | |
// Builds a KNN classifier on top of plain Seq functions.
// n: number of nearest neighbors to consult.
// Returns a function that, for an image x, predicts the most
// frequent label among the (up to) n training examples closest to x.
// Still fails if no neighbor is available at all (empty training
// set or n <= 0), because there is no label to pick.
let classicKnn n =
    let classifier (x:int[]) =
        training
        // rank every training example by its distance to x
        |> Seq.sortBy (fun ex -> distance (snd ex) x)
        // truncate rather than take: when n exceeds the number of
        // training examples, use all of them instead of throwing
        |> Seq.truncate n
        |> Seq.map fst
        // count how often each label occurs among the neighbors
        |> Seq.countBy id
        |> Seq.maxBy snd
        |> fst
    classifier
// KNN classifier whose neighbor search runs as a Nessos Stream
// pipeline. The label vote happens on the materialized array with
// Seq functions.
let streamKnn n =
    fun (x: int[]) ->
        // labels of the n training examples closest to x
        let nearestLabels =
            training
            |> Stream.ofArray
            |> Stream.sortBy (fun (_, pixels) -> distance pixels x)
            |> Stream.take n
            |> Stream.map fst
            |> Stream.toArray
        // most frequent label among those neighbors
        let label, _ =
            nearestLabels
            |> Seq.countBy id
            |> Seq.maxBy snd
        label
// KNN classifier whose distance sort runs on a parallel stream.
// n: number of nearest neighbors to consult.
// Returns a function that, for an image x, predicts the most
// frequent label among the (up to) n training examples closest to x.
// Still fails if no neighbor is available at all (empty training
// set or n <= 0), because there is no label to pick.
let parallelKnn n =
    let classifier (x:int[]) =
        training
        |> ParStream.ofArray
        |> ParStream.sortBy (fun ex -> distance (snd ex) x)
        // the pipeline is materialized here and the rest is done
        // with Seq functions
        |> ParStream.toArray
        // truncate rather than take: when n exceeds the number of
        // training examples, use all of them instead of throwing
        |> Seq.truncate n
        |> Seq.map fst
        |> Seq.countBy id
        |> Seq.maxBy snd
        |> fst
    classifier
| (* | |
| Evaluate a classifier: take n images with known true value | |
| (here 100 for the sake of speed), compare the predicted | |
| value with the actual, and compute the % correctly predicted. | |
| Ideally I would like to run all 10,000. | |
| *) | |
// Scores a classifier on the first (up to) 100 validation examples.
// c: the classifier under test.
// Returns the fraction of those examples whose predicted label
// matches the known label (1.0 = all correct).
// Seq.truncate is used so that a validation set with fewer than
// 100 examples is scored on what is available instead of throwing;
// an entirely empty validation set still fails in Seq.averageBy.
let evaluate (c:Classifier) =
    validation
    |> Seq.truncate 100
    |> Seq.averageBy (fun (label, pixels) ->
        if c pixels = label then 1. else 0.)
| // Real: 00:02:54.521, CPU: 00:02:54.421, GC gen0: 9667, gen1: 19, gen2: 2 | |
| evaluate (classicKnn 10) | |
| // Real: 00:02:55.599, CPU: 00:02:55.500, GC gen0: 9672, gen1: 21, gen2: 2 | |
| evaluate (streamKnn 10) | |
| // Real: 00:00:51.649, CPU: 00:02:56.812, GC gen0: 9718, gen1: 2867, gen2: 2 | |
| evaluate (parallelKnn 10) | |
// Parallel counterpart of `evaluate`: scores a classifier on the
// first (up to) 100 validation examples, running the predictions
// through a parallel stream.
// c: the classifier under test.
// Returns the fraction of sampled examples predicted correctly.
let parallelEvaluate (c:Classifier) =
    // Materialize the sample once so that the divisor below always
    // matches the number of examples actually scored, instead of a
    // separately hard-coded 100.
    let sample = validation |> Seq.truncate 100 |> Seq.toArray
    sample
    |> ParStream.ofArray
    |> ParStream.map (fun (label, pixels) ->
        if c pixels = label then 1. else 0.)
    |> ParStream.sum
    // NOTE(review): with an empty sample this is presumably 0.0 / 0.0 = NaN
    // (assumes ParStream.sum yields 0.0 for no elements) - confirm
    |> fun correct -> correct / float sample.Length
| // Real: 00:00:53.795, CPU: 00:02:56.468, GC gen0: 9689, gen1: 9688, gen2: 1 | |
| parallelEvaluate (classicKnn 10) | |
| // Real: 00:00:50.524, CPU: 00:02:53.593, GC gen0: 9695, gen1: 9694, gen2: 1 | |
| parallelEvaluate (streamKnn 10) | |
| // Real: 00:01:17.829, CPU: 00:03:08.718, GC gen0: 9729, gen1: 9718, gen2: 10 | |
| parallelEvaluate (parallelKnn 10) | 
One more thing that we need to address is the missing combinators (Stream/ParStream.countBy, Stream/ParStream.maxBy and ParStream.take),
because you lose performance when you are forced to leave the Stream composition.
The performance degradation of "parallelEvaluate (parallelKnn 10)" is somewhat expected,
because you are oversubscribing parallel work.
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment
  
            
You can try Streams 0.2.2, it addresses some ParStream perf issues.
https://www.nuget.org/packages/Streams/