Last active
May 23, 2018 13:55
-
-
Save bobvanluijt/e62fc208c31d4ac894e9a19e2e09f72c to your computer and use it in GitHub Desktop.
VECTOR DRAFT IN GO
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"bytes" | |
"compress/gzip" | |
"errors" | |
"fmt" | |
"io/ioutil" | |
"log" | |
"math" | |
"os" | |
"sort" | |
"strconv" | |
"strings" | |
"time" | |
"github.com/2tvenom/cbor" | |
"github.com/arbovm/levenshtein" | |
uuid "github.com/satori/go.uuid" | |
) | |
// Types shared by the vector-space helpers in this file.
type (
	// Vectors maps a word to its embedding, stored as a slice of float64
	// components.
	Vectors map[string][]float64

	// MapOfSimilarity maps a float64 key to a word. The name suggests the key
	// is a similarity score; nothing in the visible code references this type.
	MapOfSimilarity map[float64]string
)
func LoadVectors(inputFile string) Vectors { | |
vectors := Vectors{} | |
// load the Glove TXT file | |
file, err := os.Open(inputFile) | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer file.Close() | |
scanner := bufio.NewScanner(file) | |
if err := scanner.Err(); err != nil { | |
log.Fatal(err) | |
} | |
for scanner.Scan() { | |
stringSlice := strings.Split(scanner.Text(), " ") | |
// length is complete slice minus 1 | |
for i := 1; i < (len(stringSlice) - 1); i++ { | |
// parse the string to a float | |
float, _ := strconv.ParseFloat(stringSlice[i], 64) | |
// add the float to the vector | |
vectors[stringSlice[0]] = append(vectors[stringSlice[0]], float) | |
//vectors[stringSlice[0]][i] = float | |
} | |
} | |
return vectors | |
} | |
// cosineDistance returns the cosine of the angle between i1 and i2, i.e. their
// dot product divided by the product of their Euclidean norms. Despite the
// name, the value is a similarity: identical directions yield 1.
//
// The inputs may differ in length. Only the positions present in both
// contribute to the dot product, while every component of each input
// contributes to that input's norm, which is equivalent to zero-padding the
// shorter one.
//
// An error is returned when either input has a squared norm of zero, which
// includes empty inputs.
func cosineDistance(i1 []float64, i2 []float64) (cosine float64, err error) {
	shared := len(i1)
	if len(i2) < shared {
		shared = len(i2)
	}

	var dot, normSq1, normSq2 float64
	for k := 0; k < shared; k++ {
		dot += i1[k] * i2[k]
	}
	for _, component := range i1 {
		normSq1 += math.Pow(component, 2)
	}
	for _, component := range i2 {
		normSq2 += math.Pow(component, 2)
	}

	if normSq1 == 0 || normSq2 == 0 {
		return 0.0, errors.New("Vectors as 0 does not work")
	}
	return dot / (math.Sqrt(normSq1) * math.Sqrt(normSq2)), nil
}
func findSimilarWordInVectorSpace(word string, minDistance int, vectors Vectors) string { | |
for vectorWord := range vectors { | |
currentDist := levenshtein.Distance(word, vectorWord) | |
if currentDist <= minDistance { | |
return vectorWord | |
} | |
} | |
// nothing found | |
return "" | |
} | |
func CalcSimilarity(word1 string, word2 string, vectors Vectors) (float64, error) { | |
if _, ok := vectors[word1]; !ok { | |
word1 = findSimilarWordInVectorSpace(word1, 1, vectors) | |
if word1 == "" { | |
return 0, errors.New("First word is not available") | |
} | |
} | |
if _, ok := vectors[word2]; !ok { | |
word2 = findSimilarWordInVectorSpace(word2, 1, vectors) | |
if word2 == "" { | |
return 0, errors.New("Second word is not available") | |
} | |
} | |
return cosineDistance(vectors[word1], vectors[word2]) | |
} | |
func MostSimilar(word string, vectors Vectors) [100]string { | |
//similarity := 0.0 | |
//mostSimilarWord := "" | |
mapOfSimilarity := map[float64]string{} | |
// get all similaratis in a map | |
for vectorWord := range vectors { | |
result, _ := CalcSimilarity(word, vectorWord, vectors) | |
mapOfSimilarity[result] = vectorWord | |
} | |
// sort all the similaritys | |
sims := make([]float64, 0, len(mapOfSimilarity)) | |
for sim := range mapOfSimilarity { | |
sims = append(sims, sim) | |
} | |
sort.Sort(sort.Reverse(sort.Float64Slice(sims))) | |
// get the first 100 most similar words | |
var returner [100]string | |
for i := 0; i < 99; i++ { | |
returner[i] = mapOfSimilarity[sims[i]] | |
} | |
return returner | |
} | |
func centroid(words []string, vectors Vectors) (string, Vectors) { | |
wordsAndVectors := Vectors{} | |
for i := 0; i < len(words); i++ { | |
wordsAndVectors[words[i]] = vectors[words[i]] | |
} | |
var newVectors []float64 | |
// loops through vector length (length of float of the first vector) | |
for i := 0; i < len(wordsAndVectors[words[0]]); i++ { | |
var rangeVector []float64 | |
for i2 := 0; i2 < len(words); i2++ { | |
rangeVector = append(rangeVector, wordsAndVectors[words[i2]][i]) | |
} | |
newVector := 0.0 | |
for _, sumVector := range rangeVector { | |
newVector += sumVector | |
} | |
newVectors = append(newVectors, newVector/float64(len(rangeVector))) | |
} | |
wordUuid, _ := uuid.NewV4() | |
vectors[wordUuid.String()] = newVectors | |
return wordUuid.String(), vectors | |
} | |
// ReadGzFile opens the gzip-compressed file at filename and returns its fully
// decompressed contents.
//
// Any error from opening the file, from reading the gzip header, or from
// decompressing the data is returned unchanged, and the byte slice is nil in
// that case. The file and the gzip reader are both closed before returning.
func ReadGzFile(filename string) ([]byte, error) {
	compressed, openErr := os.Open(filename)
	if openErr != nil {
		return nil, openErr
	}
	defer compressed.Close()

	reader, gzErr := gzip.NewReader(compressed)
	if gzErr != nil {
		return nil, gzErr
	}
	defer reader.Close()

	contents, readErr := ioutil.ReadAll(reader)
	if readErr != nil {
		// Never hand back partially decompressed data alongside an error.
		contents = nil
	}
	return contents, readErr
}
func main() { | |
mainWord1 := "apple" | |
mainWord2 := "fruit" | |
start := time.Now() | |
fmt.Println("LOAD VECTORS") | |
vectors := Vectors{} | |
var buffTest bytes.Buffer | |
encoder := cbor.NewEncoder(&buffTest) | |
b, err := ReadGzFile("../vectors.cbor.gz") | |
if err != nil { | |
fmt.Println("O NO!", err) | |
} | |
ok, err := encoder.Unmarshal(b, &vectors) | |
if !ok { | |
fmt.Printf("Error Unmarshal %s", err) | |
return | |
} | |
fmt.Println("DONE LOADING", time.Since(start)) | |
fmt.Println("-----") | |
start = time.Now() | |
fmt.Println("LOAD SIMILAR") | |
fmt.Println("FIND SIMILAR TO ", mainWord1, " and ", mainWord2) | |
fmt.Println(CalcSimilarity(mainWord1, mainWord2, vectors)) | |
fmt.Println(time.Since(start)) | |
fmt.Println("-----") | |
start = time.Now() | |
fmt.Println("MOST SIMILAR TO "+mainWord1+" IS: ", MostSimilar(mainWord1, vectors), time.Since(start)) | |
fmt.Println("-----") | |
start = time.Now() | |
fmt.Println("LOAD CENTROID for: ", "fruit, apple") | |
wordsToCentroid := make([]string, 2) | |
wordsToCentroid[0] = mainWord1 | |
wordsToCentroid[1] = mainWord2 | |
newlyCreatedVectorId, vectors := centroid(wordsToCentroid, vectors) | |
fmt.Println(newlyCreatedVectorId, vectors[newlyCreatedVectorId]) | |
fmt.Println("CENTROID DONE", time.Since(start)) | |
fmt.Println("-----") | |
start = time.Now() | |
fmt.Println("FIND NEW MOST SIMILAR TO "+newlyCreatedVectorId+" IS: ", MostSimilar(newlyCreatedVectorId, vectors), time.Since(start)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment