kNN Classifications on a large dataset with different packages.
package main

import (
	"flag"
	"fmt"
	"log"
	"os"
	"runtime"
	"runtime/pprof"

	"github.com/amitkgupta/goodlearn/classifier/knn"
	"github.com/amitkgupta/goodlearn/csvparse"
	"github.com/amitkgupta/goodlearn/data/row"
)

var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

func main() {
	flag.Parse()
	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.StartCPUProfile(f)
		defer pprof.StopCPUProfile()
	}

	runtime.GOMAXPROCS(runtime.NumCPU())

	println("parsing test")
	validationSample, _ := csvparse.DatasetFromPath("many_features_test.csv", 0, 1)
	println("parsed test")
	println("parsing training")
	trainingSample, _ := csvparse.DatasetFromPath("many_features_training.csv", 0, 1)
	println("parsed training")

	// Train a 1-nearest-neighbour classifier on the training sample.
	c, _ := knn.NewKNNClassifier(1)
	c.Train(trainingSample)

	var totalCorrect float32 = 0
	successChannel := make(chan float32, 10000) //len(validationSample))

	// Fan out: classify each validation row in its own goroutine,
	// sending 1 for a correct prediction and 0 otherwise.
	for i := 0; i < 10000; i++ {
		test, _ := validationSample.Row(i)
		go func(t row.Row, j int) {
			if j%5 == 0 {
				println("classifying", j)
			}
			cl, _ := c.Classify(t)
			if cl.Equals(t.Target()) {
				successChannel <- 1
			} else {
				successChannel <- 0
			}
			if j%5 == 0 {
				println("classified", j)
			}
		}(test, i)
	}

	// Fan in: drain one result per goroutine.
	for i := 0; i < 10000; i++ { //len(validationSample); i++ {
		totalCorrect += <-successChannel
	}

	fmt.Println(float32(totalCorrect)) // / float32(len(validationSample)))
}
// Takes about 50s on the beefy machine, *with* goodlearn optimized:
// - replace math.Pow(x, 2) with x*x in distance.go
// - SliceFromStrings was super slow; assuming the features are contiguous and all floats makes parsing much faster
// Outputs an unreasonable answer: 10000 (i.e. all 10,000 rows counted as correct)
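For reference, the distance.go change mentioned above amounts to swapping a math.Pow call for a plain multiply in the inner loop. A minimal sketch of the before/after (the function names here are illustrative, not goodlearn's actual API):

package main

import (
	"fmt"
	"math"
)

// squareDistancePow is the slow form: one math.Pow call per feature.
func squareDistancePow(a, b []float64) (d float64) {
	for i := range a {
		d += math.Pow(a[i]-b[i], 2)
	}
	return
}

// squareDistanceMul is the fast form: a subtraction and a multiply, nothing else.
func squareDistanceMul(a, b []float64) (d float64) {
	for i := range a {
		x := a[i] - b[i]
		d += x * x
	}
	return
}

func main() {
	a := []float64{1, 2, 3}
	b := []float64{4, 6, 3}
	fmt.Println(squareDistancePow(a, b), squareDistanceMul(a, b)) // 25 25
}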
package main

import (
	"bytes"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"os"
	"runtime"
	"runtime/pprof"
	"strconv"
)

// LabelWithFeatures is one CSV row: the class label followed by its feature vector.
type LabelWithFeatures struct {
	Label    []byte
	Features []float64
}

func NewLabelWithFeatures(parsedLine [][]byte) LabelWithFeatures {
	label := parsedLine[0]
	features := make([]float64, len(parsedLine)-1)

	for i, feature := range parsedLine {
		// skip the label in column 0
		if i == 0 {
			continue
		}
		features[i-1] = byteSliceTofloat64(feature)
	}

	return LabelWithFeatures{label, features}
}

var newline = []byte("\n")
var comma = []byte(",")

func byteSliceTofloat64(b []byte) float64 {
	x, _ := strconv.ParseFloat(string(b), 32)
	return float64(x)
}

func parseCSVFile(filePath string) []LabelWithFeatures {
	fileContent, _ := ioutil.ReadFile(filePath)
	lines := bytes.Split(fileContent, newline)
	numRows := len(lines)
	labelsWithFeatures := make([]LabelWithFeatures, numRows-2)

	for i, line := range lines {
		// skip the header row and the empty line after the trailing newline
		if i == 0 || i == numRows-1 {
			continue
		}
		labelsWithFeatures[i-1] = NewLabelWithFeatures(bytes.Split(line, comma))
	}

	return labelsWithFeatures
}
// squareDistanceWithBailout accumulates the squared Euclidean distance but
// stops early once it exceeds bailout, the best distance found so far.
func squareDistanceWithBailout(features1, features2 []float64, bailout float64) (d float64) {
	for i := 0; i < len(features1); i++ {
		x := features1[i] - features2[i]
		d += x * x

		if d > bailout {
			break
		}
	}

	return
}

var trainingSample = parseCSVFile("many_features_training.csv")

// classify is 1-nearest-neighbour: scan every training row and keep the label
// of the closest one, using the current best distance as the bailout.
func classify(features []float64) (label []byte) {
	label = trainingSample[0].Label
	d := squareDistanceWithBailout(features, trainingSample[0].Features, math.MaxFloat32)

	for _, row := range trainingSample {
		dNew := squareDistanceWithBailout(features, row.Features, d)
		if dNew < d {
			label = row.Label
			d = dNew
		}
	}

	return
}
var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

func main() {
	flag.Parse()
	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.StartCPUProfile(f)
		defer pprof.StopCPUProfile()
	}

	runtime.GOMAXPROCS(runtime.NumCPU())

	println("parsing test")
	validationSample := parseCSVFile("many_features_test.csv")
	println("parsed test")

	var totalCorrect float64 = 0
	successChannel := make(chan float64, 10000) //len(validationSample))

	// Fan out: classify each validation row in its own goroutine,
	// sending 1 for a correct prediction and 0 otherwise.
	for i := 0; i < 10000; i++ {
		test := validationSample[i]
		go func(t LabelWithFeatures, j int) {
			if j%5 == 0 {
				println("classifying", j)
			}
			if string(t.Label) == string(classify(t.Features)) {
				successChannel <- 1
			} else {
				successChannel <- 0
			}
			if j%5 == 0 {
				println("classified", j)
			}
		}(test, i)
	}

	// Fan in: drain one result per goroutine.
	for i := 0; i < 10000; i++ { //len(validationSample); i++ {
		totalCorrect += <-successChannel
	}

	fmt.Println(float64(totalCorrect)) // / float64(len(validationSample)))
}
// Runs in about 10s on the beefy machine
// Has some reasonable number of correct, like 98k
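Both Go programs take the same -cpuprofile flag, so either run can be profiled and then inspected with the standard pprof tooling; a typical session (the binary and profile names here are illustrative) looks like:

./knn_handrolled -cpuprofile=cpu.prof
go tool pprof knn_handrolled cpu.prof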
import numpy
from sklearn.neighbors import KNeighborsClassifier
f = open("many_features_test.csv")
f.readline() # ignore headers
test = numpy.loadtxt(f, delimiter=',')
ff = open("many_features_training.csv")
ff.readline() # ignore headers
training = numpy.loadtxt(ff, delimiter=',')
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(training[:, [1,128]], numpy.ravel(training[:, [0]]))
numpy.sum(knn.predict(test[:, [1,128]]) == numpy.ravel(test[:, [0]]))
# Lightning fast, but...
# Something shitty, like 534 (out of ~43000)
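# note: training[:, [1,128]] fancy-indexes just two columns (1 and 128), not the
# range 1..128, so the model only ever sees two features; a slice like
# training[:, 1:] would pass the full feature set (assuming column 0 is the label)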
knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
knn.fit(training[:, [1,128]], numpy.ravel(training[:, [0]]))
numpy.sum(knn.predict(test[:, [1,128]]) == numpy.ravel(test[:, [0]]))
# Takes a while, and then...
# Something shittier, like MemoryError
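# likely because brute-force kNN builds the full test-by-training distance
# matrix before voting, which at ~43k test rows can exceed available memory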