Skip to content

Instantly share code, notes, and snippets.

@bobvanluijt
Last active May 23, 2018 13:55
Show Gist options
  • Save bobvanluijt/e62fc208c31d4ac894e9a19e2e09f72c to your computer and use it in GitHub Desktop.
Save bobvanluijt/e62fc208c31d4ac894e9a19e2e09f72c to your computer and use it in GitHub Desktop.
VECTOR DRAFT IN GO
package main
import (
"bufio"
"bytes"
"compress/gzip"
"errors"
"fmt"
"io/ioutil"
"log"
"math"
"os"
"sort"
"strconv"
"strings"
"time"
"github.com/2tvenom/cbor"
"github.com/arbovm/levenshtein"
uuid "github.com/satori/go.uuid"
)
// Vectors maps a word to its vector of float64 components.
type Vectors map[string][]float64

// MapOfSimilarity maps a float64 similarity score to a word.
// NOTE(review): the visible code never references this named type; MostSimilar
// builds a plain map[float64]string instead — confirm whether it is still needed.
type MapOfSimilarity map[float64]string
// LoadVectors reads a whitespace-separated text file of word vectors (one
// "word v1 v2 ... vn" entry per line, as in GloVe TXT files) and returns the
// parsed vectors keyed by word.
//
// Every numeric field after the word is parsed, including the last one.
// Lines that carry no numeric fields are skipped. A line containing a field
// that is not a valid float is skipped with a logged warning instead of being
// stored with a silent zero. If the same word occurs on several lines, the
// components are appended to the existing vector for that word.
//
// The function terminates the program via log.Fatal when the file cannot be
// opened or when reading it fails.
func LoadVectors(inputFile string) Vectors {
	vectors := Vectors{}

	file, err := os.Open(inputFile)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	lineNo := 0
	for scanner.Scan() {
		lineNo++

		// Fields tolerates trailing or repeated whitespace, which a plain
		// Split on " " would turn into empty, unparsable tokens.
		fields := strings.Fields(scanner.Text())
		if len(fields) < 2 {
			continue
		}

		word := fields[0]
		values := make([]float64, 0, len(fields)-1)
		valid := true
		for _, raw := range fields[1:] {
			value, err := strconv.ParseFloat(raw, 64)
			if err != nil {
				log.Printf("%s:%d: skipping %q: %v", inputFile, lineNo, word, err)
				valid = false
				break
			}
			values = append(values, value)
		}
		if !valid {
			continue
		}

		vectors[word] = append(vectors[word], values...)
	}

	// Scanner errors are only reported once scanning has stopped.
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}

	return vectors
}
// cosineDistance returns the cosine of the angle between i1 and i2, i.e. their
// dot product divided by the product of their Euclidean norms (despite the
// name, this is a similarity: identical directions yield 1).
//
// When the slices differ in length, the dot product covers only the shared
// leading positions, while each norm covers the whole of its own slice; this
// is equivalent to padding the shorter slice with zeros.
//
// An error is returned when either slice has a zero norm (including the empty
// slice), because the quotient is undefined in that case.
func cosineDistance(i1 []float64, i2 []float64) (cosine float64, err error) {
	var normSq1, normSq2 float64
	for _, component := range i1 {
		normSq1 += math.Pow(component, 2)
	}
	for _, component := range i2 {
		normSq2 += math.Pow(component, 2)
	}
	if normSq1 == 0 || normSq2 == 0 {
		return 0.0, errors.New("Vectors as 0 does not work")
	}

	// Only positions present in both slices contribute to the dot product.
	shared := len(i1)
	if len(i2) < shared {
		shared = len(i2)
	}
	var dot float64
	for idx := 0; idx < shared; idx++ {
		dot += i1[idx] * i2[idx]
	}

	return dot / (math.Sqrt(normSq1) * math.Sqrt(normSq2)), nil
}
// findSimilarWordInVectorSpace returns the key of vectors whose Levenshtein
// distance to word is smallest, provided that distance does not exceed
// minDistance. When several keys share the smallest distance, the
// lexicographically smallest key wins, so the result does not depend on Go's
// randomized map iteration order.
//
// It returns "" when no key lies within minDistance.
func findSimilarWordInVectorSpace(word string, minDistance int, vectors Vectors) string {
	closest := ""
	closestDist := 0
	found := false

	for candidate := range vectors {
		dist := levenshtein.Distance(word, candidate)
		if dist > minDistance {
			continue
		}
		if !found || dist < closestDist || (dist == closestDist && candidate < closest) {
			closest, closestDist, found = candidate, dist, true
		}
	}

	// closest is still "" when nothing qualified.
	return closest
}
// CalcSimilarity returns the result of cosineDistance for the vectors stored
// under word1 and word2.
//
// A word that is not a key of vectors is replaced by whatever
// findSimilarWordInVectorSpace returns for it with a maximum distance of 1.
// If that lookup yields "", an error naming the offending position ("First" or
// "Second") is returned. word1 is resolved before word2.
func CalcSimilarity(word1 string, word2 string, vectors Vectors) (float64, error) {
	firstVec, known := vectors[word1]
	if !known {
		if word1 = findSimilarWordInVectorSpace(word1, 1, vectors); word1 == "" {
			return 0, errors.New("First word is not available")
		}
		firstVec = vectors[word1]
	}

	secondVec, known := vectors[word2]
	if !known {
		if word2 = findSimilarWordInVectorSpace(word2, 1, vectors); word2 == "" {
			return 0, errors.New("Second word is not available")
		}
		secondVec = vectors[word2]
	}

	return cosineDistance(firstVec, secondVec)
}
// MostSimilar returns up to 100 keys of vectors ordered by descending
// CalcSimilarity score against word. Keys with equal scores are ordered
// lexicographically so the output is deterministic.
//
// Keys for which CalcSimilarity reports an error, or whose score is NaN, are
// left out. When fewer than 100 keys qualify, the remaining slots of the
// returned array stay "".
func MostSimilar(word string, vectors Vectors) [100]string {
	type scoredWord struct {
		word       string
		similarity float64
	}

	// Keep (word, score) pairs rather than a score-keyed map, so that words
	// sharing a score do not overwrite each other.
	scored := make([]scoredWord, 0, len(vectors))
	for vectorWord := range vectors {
		similarity, err := CalcSimilarity(word, vectorWord, vectors)
		if err != nil || math.IsNaN(similarity) {
			continue
		}
		scored = append(scored, scoredWord{word: vectorWord, similarity: similarity})
	}

	sort.Slice(scored, func(i, j int) bool {
		if scored[i].similarity != scored[j].similarity {
			return scored[i].similarity > scored[j].similarity
		}
		return scored[i].word < scored[j].word
	})

	// Fill every available slot, bounded by the number of scored words.
	var returner [100]string
	for i := 0; i < len(returner) && i < len(scored); i++ {
		returner[i] = scored[i].word
	}
	return returner
}
// centroid computes the component-wise arithmetic mean of the vectors stored
// under words, stores it in vectors under a freshly generated UUID string, and
// returns that key together with the (mutated) vectors map.
//
// The number of components is taken from the vector of words[0]; words must
// therefore be non-empty, and every listed word must have at least that many
// components, otherwise indexing panics.
func centroid(words []string, vectors Vectors) (string, Vectors) {
	var mean []float64

	for dim := range vectors[words[0]] {
		// Sum this component over all words, in the order given.
		total := 0.0
		for _, w := range words {
			total += vectors[w][dim]
		}
		mean = append(mean, total/float64(len(words)))
	}

	// The error from NewV4 is discarded.
	id, _ := uuid.NewV4()
	key := id.String()
	vectors[key] = mean

	return key, vectors
}
// ReadGzFile opens filename, decompresses it as gzip and returns the complete
// decompressed contents.
//
// On any failure (opening the file, reading the gzip header, or reading the
// decompressed stream) it returns a nil slice together with the error.
func ReadGzFile(filename string) ([]byte, error) {
	compressed, openErr := os.Open(filename)
	if openErr != nil {
		return nil, openErr
	}
	defer compressed.Close()

	inflater, headerErr := gzip.NewReader(compressed)
	if headerErr != nil {
		return nil, headerErr
	}
	defer inflater.Close()

	payload, readErr := ioutil.ReadAll(inflater)
	if readErr != nil {
		// Discard any partial data so callers never see a half-read payload.
		return nil, readErr
	}
	return payload, nil
}
// main is a small timing demo: it reads ../vectors.cbor.gz, decodes it into a
// Vectors map, and then prints (with elapsed times) the similarity of "apple"
// and "fruit", the words most similar to "apple", the centroid of both words,
// and the words most similar to that centroid.
//
// The program stops early when the vector file cannot be read or decoded,
// since every later step needs the loaded vectors.
func main() {
	mainWord1 := "apple"
	mainWord2 := "fruit"

	start := time.Now()
	fmt.Println("LOAD VECTORS")
	vectors := Vectors{}
	var buffTest bytes.Buffer
	encoder := cbor.NewEncoder(&buffTest)
	b, err := ReadGzFile("../vectors.cbor.gz")
	if err != nil {
		fmt.Println("O NO!", err)
		// Without the file contents there is nothing to decode.
		return
	}
	ok, err := encoder.Unmarshal(b, &vectors)
	if !ok {
		// %v prints a nil error cleanly; the newline keeps the shell prompt tidy.
		fmt.Printf("Error Unmarshal %v\n", err)
		return
	}
	fmt.Println("DONE LOADING", time.Since(start))
	fmt.Println("-----")

	start = time.Now()
	fmt.Println("LOAD SIMILAR")
	fmt.Println("FIND SIMILAR TO ", mainWord1, " and ", mainWord2)
	fmt.Println(CalcSimilarity(mainWord1, mainWord2, vectors))
	fmt.Println(time.Since(start))
	fmt.Println("-----")

	start = time.Now()
	fmt.Println("MOST SIMILAR TO "+mainWord1+" IS: ", MostSimilar(mainWord1, vectors), time.Since(start))
	fmt.Println("-----")

	start = time.Now()
	fmt.Println("LOAD CENTROID for: ", "fruit, apple")
	wordsToCentroid := []string{mainWord1, mainWord2}
	// centroid stores the new vector in the map and hands the map back.
	centroidID, vectors := centroid(wordsToCentroid, vectors)
	fmt.Println(centroidID, vectors[centroidID])
	fmt.Println("CENTROID DONE", time.Since(start))
	fmt.Println("-----")

	start = time.Now()
	fmt.Println("FIND NEW MOST SIMILAR TO "+centroidID+" IS: ", MostSimilar(centroidID, vectors), time.Since(start))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment