Last active
August 29, 2015 14:06
-
-
Save amitkgupta/30d1f6c8f49ba8d23124 to your computer and use it in GitHub Desktop.
kNN Classifications on a large dataset with different packages.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"github.com/amitkgupta/goodlearn/classifier/knn" | |
"github.com/amitkgupta/goodlearn/csvparse" | |
"github.com/amitkgupta/goodlearn/data/row" | |
"runtime" | |
"flag" | |
"log" | |
"os" | |
"runtime/pprof" | |
) | |
var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file") | |
func main() { | |
flag.Parse() | |
if *cpuprofile != "" { | |
f, err := os.Create(*cpuprofile) | |
if err != nil { | |
log.Fatal(err) | |
} | |
pprof.StartCPUProfile(f) | |
defer pprof.StopCPUProfile() | |
} | |
runtime.GOMAXPROCS(runtime.NumCPU()) | |
println("parsing test") | |
validationSample, _ := csvparse.DatasetFromPath("many_features_test.csv", 0, 1) | |
println("parsed test") | |
println("parsing training") | |
trainingSample, _ := csvparse.DatasetFromPath("many_features_training.csv", 0, 1) | |
println("parsed training") | |
c, _ := knn.NewKNNClassifier(1) | |
c.Train(trainingSample) | |
var totalCorrect float32 = 0 | |
successChannel := make(chan float32, 10000) //len(validationSample)) | |
for i := 0; i < 10000; i++ { | |
test, _ := validationSample.Row(i) | |
go func(t row.Row, j int) { | |
if j%5 == 0 { | |
println("classifying", j) | |
} | |
cl, _ := c.Classify(test) | |
if cl.Equals(test.Target()) { | |
successChannel <- 1 | |
} else { | |
successChannel <- 0 | |
} | |
if j%5 == 0 { | |
println("classified", j) | |
} | |
}(test, i) | |
} | |
for i := 0; i < 10000; i++ { //len(validationSample); i++ { | |
totalCorrect += <-successChannel | |
} | |
fmt.Println(float32(totalCorrect)) // / float32(len(validationSample))) | |
} | |
// Takes about 50s on the beefy machine, *with* goodlearn optimized | |
// - replace math.Pow(x,2) with x*x in distance.go | |
// - SliceFromStrings was super slow, but assuming contiguous features and all of them are floats, much faster | |
// Outputs unreasonable answer: 10000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bytes" | |
"flag" | |
"fmt" | |
"io/ioutil" | |
"log" | |
"math" | |
"os" | |
"runtime" | |
"runtime/pprof" | |
"strconv" | |
) | |
// LabelWithFeatures is one parsed CSV record: the raw bytes of its label
// column paired with its numeric feature columns.
type LabelWithFeatures struct {
	// Label holds the bytes of the first CSV field.
	Label []byte
	// Features holds the remaining fields converted to floating point.
	Features []float64
}
func NewLabelWithFeatures(parsedLine [][]byte) LabelWithFeatures { | |
label := parsedLine[0] | |
features := make([]float64, len(parsedLine)-1) | |
for i, feature := range parsedLine { | |
// skip label | |
if i == 0 { | |
continue | |
} | |
features[i-1] = byteSliceTofloat64(feature) | |
} | |
return LabelWithFeatures{label, features} | |
} | |
// Separators used when splitting the raw CSV bytes into lines and fields.
var (
	newline = []byte("\n")
	comma   = []byte(",")
)
// byteSliceTofloat64 converts the textual number in b to a float64. The text is
// parsed at 32-bit precision, so the result is always exactly representable as
// a float32. The parse error is deliberately discarded: the caller receives
// whatever value strconv.ParseFloat produced alongside it.
func byteSliceTofloat64(b []byte) float64 {
	text := string(b)
	value, _ := strconv.ParseFloat(text, 32)
	return value
}
func parseCSVFile(filePath string) []LabelWithFeatures { | |
fileContent, _ := ioutil.ReadFile(filePath) | |
lines := bytes.Split(fileContent, newline) | |
numRows := len(lines) | |
labelsWithFeatures := make([]LabelWithFeatures, numRows-2) | |
for i, line := range lines { | |
// skip headers | |
if i == 0 || i == numRows-1 { | |
continue | |
} | |
labelsWithFeatures[i-1] = NewLabelWithFeatures(bytes.Split(line, comma)) | |
} | |
return labelsWithFeatures | |
} | |
// squareDistanceWithBailout accumulates the squared Euclidean distance between
// features1 and features2, one coordinate at a time, and stops as soon as the
// running total exceeds bailout. The value returned is the total at the point
// where accumulation stopped, so it may be a partial sum that is only known to
// be greater than bailout. features2 must be at least as long as features1.
func squareDistanceWithBailout(features1, features2 []float64, bailout float64) (d float64) {
	for i, a := range features1 {
		diff := a - features2[i]
		d += diff * diff
		if d > bailout {
			return d
		}
	}
	return d
}
var trainingSample = parseCSVFile("many_features_training.csv") | |
func classify(features []float64) (label []byte) { | |
label = trainingSample[0].Label | |
d := squareDistanceWithBailout(features, trainingSample[0].Features, math.MaxFloat32) | |
for _, row := range trainingSample { | |
dNew := squareDistanceWithBailout(features, row.Features, d) | |
if dNew < d { | |
label = row.Label | |
d = dNew | |
} | |
} | |
return | |
} | |
var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file") | |
func main() { | |
flag.Parse() | |
if *cpuprofile != "" { | |
f, err := os.Create(*cpuprofile) | |
if err != nil { | |
log.Fatal(err) | |
} | |
pprof.StartCPUProfile(f) | |
defer pprof.StopCPUProfile() | |
} | |
runtime.GOMAXPROCS(runtime.NumCPU()) | |
println("parsing test") | |
validationSample := parseCSVFile("many_features_test.csv") | |
println("parsed test") | |
var totalCorrect float64 = 0 | |
successChannel := make(chan float64, 10000) //len(validationSample)) | |
for i := 0; i < 10000; i++ { | |
test := validationSample[i] | |
go func(t LabelWithFeatures, j int) { | |
if j%5 == 0 { | |
println("classifying", j) | |
} | |
if string(t.Label) == string(classify(t.Features)) { | |
successChannel <- 1 | |
} else { | |
successChannel <- 0 | |
} | |
if j%5 == 0 { | |
println("classified", j) | |
} | |
}(test, i) | |
} | |
for i := 0; i < 10000; i++ { //len(validationSample); i++ { | |
totalCorrect += <-successChannel | |
} | |
fmt.Println(float64(totalCorrect)) // / float64(len(validationSample))) | |
} | |
// Runs in about 10s on the beefy machine.
// Produces a reasonable number of correct classifications, like 98k.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy | |
from sklearn.neighbors import KNeighborsClassifier | |
# Load both datasets, skipping the single header row. Passing the path (rather
# than an open file object) lets numpy open and close the file itself.
test = numpy.loadtxt("many_features_test.csv", delimiter=',', skiprows=1)
training = numpy.loadtxt("many_features_training.csv", delimiter=',', skiprows=1)

# Column 0 is the label; every remaining column is a feature.
# NOTE: this script previously indexed with [:, [1,128]], which is fancy
# indexing that selects only columns 1 and 128 (two features), not the range
# 1..128. The results quoted below were observed with that two-column selection.
training_features, training_labels = training[:, 1:], training[:, 0]
test_features, test_labels = test[:, 1:], test[:, 0]

# 1-nearest-neighbour with scikit-learn's default neighbour search.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(training_features, training_labels)
print(numpy.sum(knn.predict(test_features) == test_labels))
# Lightning fast, but...
# With the two-column selection: something poor, like 534 (out of ~43000)

# Same classifier, forcing the brute-force neighbour search.
knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
knn.fit(training_features, training_labels)
print(numpy.sum(knn.predict(test_features) == test_labels))
# Takes a while, and then...
# Previously observed: something worse, like MemoryError
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment