amitkgupta · August 29, 2015 14:06
diff --git a/goodlearn-knn-perf.svg b/goodlearn-knn-perf.svg
diff --git a/goodlearn-knn.go b/goodlearn-knn.go
 package main

 import (
 	"fmt"
 	"github.com/amitkgupta/goodlearn/classifier/knn"
 	"github.com/amitkgupta/goodlearn/csvparse"
 	"github.com/amitkgupta/goodlearn/data/row"
 	"runtime"

 	"flag"
 	"log"
 	"os"
 	"runtime/pprof"
 )

 // cpuprofile holds the value of the -cpuprofile command-line flag. When it is
 // non-empty, main creates that file and writes a CPU profile to it.
 var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

 // main optionally records a CPU profile, loads the validation and training
 // datasets, trains the classifier returned by knn.NewKNNClassifier(1) and
 // prints how many of the first 10000 validation rows were classified
 // correctly. Each row is classified in its own goroutine; results are
 // collected over a buffered channel.
 func main() {
 	flag.Parse()
 	if *cpuprofile != "" {
 		f, err := os.Create(*cpuprofile)
 		if err != nil {
 			log.Fatal(err)
 		}
 		// StartCPUProfile reports an error (for example when profiling is
 		// already enabled); do not silently run without a profile.
 		if err := pprof.StartCPUProfile(f); err != nil {
 			log.Fatal(err)
 		}
 		defer pprof.StopCPUProfile()
 	}
 	runtime.GOMAXPROCS(runtime.NumCPU())

 	// NOTE(review): the second result of every goodlearn call below is
 	// discarded; presumably these are errors — confirm and handle them.
 	println("parsing test")
 	validationSample, _ := csvparse.DatasetFromPath("many_features_test.csv", 0, 1)
 	println("parsed test")
 	println("parsing training")
 	trainingSample, _ := csvparse.DatasetFromPath("many_features_training.csv", 0, 1)
 	println("parsed training")

 	c, _ := knn.NewKNNClassifier(1)
 	c.Train(trainingSample)

 	var totalCorrect float32 = 0
 	successChannel := make(chan float32, 10000) //len(validationSample))

 	for i := 0; i < 10000; i++ {
 		test, _ := validationSample.Row(i)
 		go func(t row.Row, j int) {
 			if j%5 == 0 {
 				println("classifying", j)
 			}
 			// Use the row handed to this goroutine rather than the variable
 			// captured from the enclosing loop.
 			cl, _ := c.Classify(t)
 			if cl.Equals(t.Target()) {
 				successChannel <- 1
 			} else {
 				successChannel <- 0
 			}
 			if j%5 == 0 {
 				println("classified", j)
 			}
 		}(test, i)
 	}

 	// Every goroutine sends exactly one value, so receiving 10000 values
 	// waits for all of them.
 	for i := 0; i < 10000; i++ { //len(validationSample); i++ {
 		totalCorrect += <-successChannel
 	}

 	fmt.Println(float32(totalCorrect)) // / float32(len(validationSample)))
 }

 // Takes about 50s on the beefy machine, *with* goodlearn optimized
 // - replace math.Pow(x,2) with x*x in distance.go
 // - SliceFromStrings was super slow, but assuming contiguous features and all of them are floats, much faster
 // Outputs unreasonable answer: 10000
diff --git a/raw-go-knn-perf.svg b/raw-go-knn-perf.svg
diff --git a/raw-go-knn.go b/raw-go-knn.go
 package main

 import (
 	"bytes"
 	"flag"
 	"fmt"
 	"io/ioutil"
 	"log"
 	"math"
 	"os"
 	"runtime"
 	"runtime/pprof"
 	"strconv"
 )

 // LabelWithFeatures is one parsed CSV record as built by
 // NewLabelWithFeatures: the record's first field kept as raw bytes, plus its
 // remaining fields converted to float64.
 type LabelWithFeatures struct {
 	// Label is the raw bytes of the record's first field.
 	Label    []byte
 	// Features holds the record's remaining fields, in order, as numbers.
 	Features []float64
 }

 // NewLabelWithFeatures builds a LabelWithFeatures from one already-split CSV
 // record. Field 0 is kept verbatim as the label; every later field is
 // converted with byteSliceTofloat64. It panics if parsedLine is empty.
 func NewLabelWithFeatures(parsedLine [][]byte) LabelWithFeatures {
 	label, rawFeatures := parsedLine[0], parsedLine[1:]

 	values := make([]float64, len(rawFeatures))
 	for k := range rawFeatures {
 		values[k] = byteSliceTofloat64(rawFeatures[k])
 	}

 	return LabelWithFeatures{Label: label, Features: values}
 }

 // Separators used when splitting the raw CSV bytes.
 var (
 	newline = []byte{'\n'} // record separator
 	comma   = []byte{','}  // field separator
 )

 // byteSliceTofloat64 parses the text in b as a floating-point number.
 //
 // The value is parsed with bitSize 32, so it is rounded to float32 precision
 // even though it is returned as a float64. The parse error is discarded: for
 // malformed input the caller simply receives whatever value ParseFloat
 // returned alongside the error.
 func byteSliceTofloat64(b []byte) float64 {
 	parsed, _ := strconv.ParseFloat(string(b), 32)
 	return parsed
 }

 // parseCSVFile reads the CSV file at filePath and returns one
 // LabelWithFeatures per data row, in file order.
 //
 // The first line is treated as a header and skipped. Blank lines, including
 // the empty tail left by a trailing newline, are ignored, and a trailing
 // carriage return is stripped from each line so CRLF files parse the same as
 // LF files. If the file cannot be read the process exits via log.Fatalf.
 func parseCSVFile(filePath string) []LabelWithFeatures {
 	fileContent, err := ioutil.ReadFile(filePath)
 	if err != nil {
 		// The error used to be dropped, which surfaced later as an unrelated
 		// negative-length make() panic.
 		log.Fatalf("reading %q: %v", filePath, err)
 	}

 	// bytes.Split always returns at least one element, so lines[1:] is safe.
 	lines := bytes.Split(fileContent, newline)
 	labelsWithFeatures := make([]LabelWithFeatures, 0, len(lines)-1)

 	for _, line := range lines[1:] {
 		line = bytes.TrimSuffix(line, []byte("\r"))
 		if len(line) == 0 {
 			// Skipping by content rather than by position means a missing
 			// trailing newline no longer drops the last data row.
 			continue
 		}

 		labelsWithFeatures = append(labelsWithFeatures, NewLabelWithFeatures(bytes.Split(line, comma)))
 	}

 	return labelsWithFeatures
 }

 // squareDistanceWithBailout returns the squared Euclidean distance between
 // features1 and features2, summed coordinate by coordinate over
 // len(features1) entries. As soon as the running sum exceeds bailout the
 // function stops and returns that partial sum, which is already greater than
 // bailout. It panics if features2 is shorter than the coordinates visited.
 func squareDistanceWithBailout(features1, features2 []float64, bailout float64) (d float64) {
 	for idx, a := range features1 {
 		delta := a - features2[idx]
 		d += delta * delta

 		if d > bailout {
 			return d
 		}
 	}

 	return d
 }

 // trainingSample is the labelled training set that classify searches. It is
 // parsed from many_features_training.csv during package variable
 // initialization, i.e. before main starts running.
 var trainingSample = parseCSVFile("many_features_training.csv")

 func classify(features []float64) (label []byte) {
 	label = trainingSample[0].Label
 	d := squareDistanceWithBailout(features, trainingSample[0].Features, math.MaxFloat32)

 	for _, row := range trainingSample {
 		dNew := squareDistanceWithBailout(features, row.Features, d)

 		if dNew < d {
 			label = row.Label
 			d = dNew
 		}
 	}

 	return
 }

 // cpuprofile holds the value of the -cpuprofile command-line flag. When it is
 // non-empty, main creates that file and writes a CPU profile to it.
 var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

 // main optionally records a CPU profile, parses the validation set,
 // classifies up to the first 10000 validation rows concurrently (one
 // goroutine per row) and prints how many were classified correctly.
 func main() {
 	flag.Parse()
 	if *cpuprofile != "" {
 		f, err := os.Create(*cpuprofile)
 		if err != nil {
 			log.Fatal(err)
 		}
 		// StartCPUProfile reports an error (for example when profiling is
 		// already enabled); do not silently run without a profile.
 		if err := pprof.StartCPUProfile(f); err != nil {
 			log.Fatal(err)
 		}
 		defer pprof.StopCPUProfile()
 	}
 	runtime.GOMAXPROCS(runtime.NumCPU())

 	println("parsing test")
 	validationSample := parseCSVFile("many_features_test.csv")
 	println("parsed test")

 	// Keep the 10000-row cap, but never index past the end of a shorter
 	// validation file (the fixed bound used to panic in that case).
 	numSamples := 10000
 	if len(validationSample) < numSamples {
 		numSamples = len(validationSample)
 	}

 	var totalCorrect float64 = 0
 	successChannel := make(chan float64, numSamples)

 	for i := 0; i < numSamples; i++ {
 		go func(t LabelWithFeatures, j int) {
 			if j%5 == 0 {
 				println("classifying", j)
 			}
 			if string(t.Label) == string(classify(t.Features)) {
 				successChannel <- 1
 			} else {
 				successChannel <- 0
 			}
 			if j%5 == 0 {
 				println("classified", j)
 			}
 		}(validationSample[i], i)
 	}

 	// Every goroutine sends exactly one value, so receiving numSamples values
 	// waits for all of them.
 	for i := 0; i < numSamples; i++ {
 		totalCorrect += <-successChannel
 	}

 	fmt.Println(totalCorrect) // / float64(len(validationSample)))
 }

 // Runs in about 10s on the beefy machine
 // Has some reasonable number of correct, like 98k
diff --git a/scikit-knn.py b/scikit-knn.py
 import numpy
 from sklearn.neighbors import KNeighborsClassifier

 # Load both CSV files, skipping the header row. `with` closes the files once
 # they have been read.
 with open("many_features_test.csv") as f:
     f.readline() # ignore headers
     test = numpy.loadtxt(f, delimiter=',')

 with open("many_features_training.csv") as ff:
     ff.readline() # ignore headers
     training = numpy.loadtxt(ff, delimiter=',')

 # Column 0 is the label. The original `[:, [1,128]]` is integer-array
 # indexing, which selects ONLY columns 1 and 128; the slice `[:, 1:]` selects
 # the whole feature range.
 # NOTE(review): assumes every column after the label is a feature, as the Go
 # implementation does - confirm against the data files.
 training_features, training_labels = training[:, 1:], training[:, 0]
 test_features, test_labels = test[:, 1:], test[:, 0]

 knn = KNeighborsClassifier(n_neighbors=1)
 knn.fit(training_features, training_labels)
 # Print the number of correct predictions; the bare expression was discarded
 # when this file was run as a script.
 print(numpy.sum(knn.predict(test_features) == test_labels))
 # Lightning fast, but...
 # Something shitty, like 534 (out of ~43000)
 # (both remarks were observed with the old two-column indexing)

 knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
 knn.fit(training_features, training_labels)
 print(numpy.sum(knn.predict(test_features) == test_labels))
 # Takes a while, and then...
 # Something shittier, like MemoryError
 # (both remarks were observed with the old two-column indexing)
	package main

	import (
	"fmt"
	"github.com/amitkgupta/goodlearn/classifier/knn"
	"github.com/amitkgupta/goodlearn/csvparse"
	"github.com/amitkgupta/goodlearn/data/row"
	"runtime"

	"flag"
	"log"
	"os"
	"runtime/pprof"
	)

	// cpuprofile holds the value of the -cpuprofile command-line flag. When it is
	// non-empty, main creates that file and writes a CPU profile to it.
	var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

	// main optionally records a CPU profile, loads the validation and training
	// datasets, trains the classifier returned by knn.NewKNNClassifier(1) and
	// prints how many of the first 10000 validation rows were classified
	// correctly. Each row is classified in its own goroutine; results are
	// collected over a buffered channel.
	func main() {
		flag.Parse()
		if *cpuprofile != "" {
			f, err := os.Create(*cpuprofile)
			if err != nil {
				log.Fatal(err)
			}
			// StartCPUProfile reports an error (for example when profiling is
			// already enabled); do not silently run without a profile.
			if err := pprof.StartCPUProfile(f); err != nil {
				log.Fatal(err)
			}
			defer pprof.StopCPUProfile()
		}
		runtime.GOMAXPROCS(runtime.NumCPU())

		// NOTE(review): the second result of every goodlearn call below is
		// discarded; presumably these are errors — confirm and handle them.
		println("parsing test")
		validationSample, _ := csvparse.DatasetFromPath("many_features_test.csv", 0, 1)
		println("parsed test")
		println("parsing training")
		trainingSample, _ := csvparse.DatasetFromPath("many_features_training.csv", 0, 1)
		println("parsed training")

		c, _ := knn.NewKNNClassifier(1)
		c.Train(trainingSample)

		var totalCorrect float32 = 0
		successChannel := make(chan float32, 10000) //len(validationSample))

		for i := 0; i < 10000; i++ {
			test, _ := validationSample.Row(i)
			go func(t row.Row, j int) {
				if j%5 == 0 {
					println("classifying", j)
				}
				// Use the row handed to this goroutine rather than the variable
				// captured from the enclosing loop.
				cl, _ := c.Classify(t)
				if cl.Equals(t.Target()) {
					successChannel <- 1
				} else {
					successChannel <- 0
				}
				if j%5 == 0 {
					println("classified", j)
				}
			}(test, i)
		}

		// Every goroutine sends exactly one value, so receiving 10000 values
		// waits for all of them.
		for i := 0; i < 10000; i++ { //len(validationSample); i++ {
			totalCorrect += <-successChannel
		}

		fmt.Println(float32(totalCorrect)) // / float32(len(validationSample)))
	}

	// Takes about 50s on the beefy machine, with goodlearn optimized
	// - replace math.Pow(x,2) with x*x in distance.go
	// - SliceFromStrings was super slow, but assuming contiguous features and all of them are floats, much faster
	// Outputs unreasonable answer: 10000
	package main

	import (
	"bytes"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"os"
	"runtime"
	"runtime/pprof"
	"strconv"
	)

	// LabelWithFeatures is one parsed CSV record as built by
	// NewLabelWithFeatures: the record's first field kept as raw bytes, plus its
	// remaining fields converted to float64.
	type LabelWithFeatures struct {
	// Label is the raw bytes of the record's first field.
	Label []byte
	// Features holds the record's remaining fields, in order, as numbers.
	Features []float64
	}

	// NewLabelWithFeatures builds a LabelWithFeatures from one already-split CSV
	// record. Field 0 is kept verbatim as the label; every later field is
	// converted with byteSliceTofloat64. It panics if parsedLine is empty.
	func NewLabelWithFeatures(parsedLine [][]byte) LabelWithFeatures {
		label, rawFeatures := parsedLine[0], parsedLine[1:]

		values := make([]float64, len(rawFeatures))
		for k := range rawFeatures {
			values[k] = byteSliceTofloat64(rawFeatures[k])
		}

		return LabelWithFeatures{Label: label, Features: values}
	}

	// Separators used when splitting the raw CSV bytes.
	var (
		newline = []byte{'\n'} // record separator
		comma   = []byte{','}  // field separator
	)

	// byteSliceTofloat64 parses the text in b as a floating-point number.
	//
	// The value is parsed with bitSize 32, so it is rounded to float32 precision
	// even though it is returned as a float64. The parse error is discarded: for
	// malformed input the caller simply receives whatever value ParseFloat
	// returned alongside the error.
	func byteSliceTofloat64(b []byte) float64 {
		parsed, _ := strconv.ParseFloat(string(b), 32)
		return parsed
	}

	func parseCSVFile(filePath string) []LabelWithFeatures {
	fileContent, _ := ioutil.ReadFile(filePath)
	lines := bytes.Split(fileContent, newline)
	numRows := len(lines)

	labelsWithFeatures := make([]LabelWithFeatures, numRows-2)

	for i, line := range lines {
	// skip headers
	if i == 0 \|\| i == numRows-1 {
	continue
	}

	labelsWithFeatures[i-1] = NewLabelWithFeatures(bytes.Split(line, comma))
	}

	return labelsWithFeatures
	}

	// squareDistanceWithBailout returns the squared Euclidean distance between
	// features1 and features2, summed coordinate by coordinate over
	// len(features1) entries. As soon as the running sum exceeds bailout the
	// function stops and returns that partial sum, which is already greater than
	// bailout. It panics if features2 is shorter than the coordinates visited.
	func squareDistanceWithBailout(features1, features2 []float64, bailout float64) (d float64) {
		for idx, a := range features1 {
			delta := a - features2[idx]
			d += delta * delta

			if d > bailout {
				return d
			}
		}

		return d
	}

	// trainingSample is the labelled training set that classify searches. It is
	// parsed from many_features_training.csv during package variable
	// initialization, i.e. before main starts running.
	var trainingSample = parseCSVFile("many_features_training.csv")

	func classify(features []float64) (label []byte) {
	label = trainingSample[0].Label
	d := squareDistanceWithBailout(features, trainingSample[0].Features, math.MaxFloat32)

	for _, row := range trainingSample {
	dNew := squareDistanceWithBailout(features, row.Features, d)

	if dNew < d {
	label = row.Label
	d = dNew
	}
	}

	return
	}

	// cpuprofile holds the value of the -cpuprofile command-line flag. When it is
	// non-empty, main creates that file and writes a CPU profile to it.
	var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

	// main optionally records a CPU profile, parses the validation set,
	// classifies up to the first 10000 validation rows concurrently (one
	// goroutine per row) and prints how many were classified correctly.
	func main() {
		flag.Parse()
		if *cpuprofile != "" {
			f, err := os.Create(*cpuprofile)
			if err != nil {
				log.Fatal(err)
			}
			// StartCPUProfile reports an error (for example when profiling is
			// already enabled); do not silently run without a profile.
			if err := pprof.StartCPUProfile(f); err != nil {
				log.Fatal(err)
			}
			defer pprof.StopCPUProfile()
		}
		runtime.GOMAXPROCS(runtime.NumCPU())

		println("parsing test")
		validationSample := parseCSVFile("many_features_test.csv")
		println("parsed test")

		// Keep the 10000-row cap, but never index past the end of a shorter
		// validation file (the fixed bound used to panic in that case).
		numSamples := 10000
		if len(validationSample) < numSamples {
			numSamples = len(validationSample)
		}

		var totalCorrect float64 = 0
		successChannel := make(chan float64, numSamples)

		for i := 0; i < numSamples; i++ {
			go func(t LabelWithFeatures, j int) {
				if j%5 == 0 {
					println("classifying", j)
				}
				if string(t.Label) == string(classify(t.Features)) {
					successChannel <- 1
				} else {
					successChannel <- 0
				}
				if j%5 == 0 {
					println("classified", j)
				}
			}(validationSample[i], i)
		}

		// Every goroutine sends exactly one value, so receiving numSamples values
		// waits for all of them.
		for i := 0; i < numSamples; i++ {
			totalCorrect += <-successChannel
		}

		fmt.Println(totalCorrect) // / float64(len(validationSample)))
	}

	// Runs in about 10s on the beefy machine
	// Has some reasonable number of correct, like 98k
	import numpy
	from sklearn.neighbors import KNeighborsClassifier

	# Load both CSV files, skipping the header row. `with` closes the files once
	# they have been read.
	with open("many_features_test.csv") as f:
	    f.readline() # ignore headers
	    test = numpy.loadtxt(f, delimiter=',')

	with open("many_features_training.csv") as ff:
	    ff.readline() # ignore headers
	    training = numpy.loadtxt(ff, delimiter=',')

	# Column 0 is the label. The original `[:, [1,128]]` is integer-array
	# indexing, which selects ONLY columns 1 and 128; the slice `[:, 1:]` selects
	# the whole feature range.
	# NOTE(review): assumes every column after the label is a feature, as the Go
	# implementation does - confirm against the data files.
	training_features, training_labels = training[:, 1:], training[:, 0]
	test_features, test_labels = test[:, 1:], test[:, 0]

	knn = KNeighborsClassifier(n_neighbors=1)
	knn.fit(training_features, training_labels)
	# Print the number of correct predictions; the bare expression was discarded
	# when this file was run as a script.
	print(numpy.sum(knn.predict(test_features) == test_labels))
	# Lightning fast, but...
	# Something shitty, like 534 (out of ~43000)
	# (both remarks were observed with the old two-column indexing)

	knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
	knn.fit(training_features, training_labels)
	print(numpy.sum(knn.predict(test_features) == test_labels))
	# Takes a while, and then...
	# Something shittier, like MemoryError
	# (both remarks were observed with the old two-column indexing)