Skip to content

Instantly share code, notes, and snippets.

@malemi
Last active February 3, 2017 16:35
Show Gist options
  • Save malemi/a8680673f58bb8bff596ac1bd33c2ee1 to your computer and use it in GitHub Desktop.
Save malemi/a8680673f58bb8bff596ac1bd33c2ee1 to your computer and use it in GitHub Desktop.
Tests on document similarity using word2vec and log-likelihood ratio. I used Italian because Italian language models are weaker than English ones, and I wanted to test in a deliberately suboptimal setting.
#python3
import numpy as np
from itertools import combinations_with_replacement
from pyemd import emd
from scipy import spatial
import gensim as gs
# Load the raw word2vec weight matrices saved as separate numpy arrays.
# syn0 holds the input (word) embeddings; syn1neg is presumably the
# negative-sampling output weights and table the unigram noise table
# (gensim naming convention) -- TODO confirm against the training script.
synzero = np.load('data/it/wiki_iter.m.syn0.npy')
synoneneg = np.load('data/it/wiki_iter.m.syn1neg.npy')
table = np.load('data/it/wiki_iter.m.table.npy')
# load the gensim word2vec model trained on the Italian Wikipedia dump
wiki_iter = gs.models.Word2Vec.load('data/it/wiki_iter.m')
# wiki_iter = np.load("data/it/wiki_iter.m")
# Re-attach the embedding matrix loaded above; gensim stores large
# arrays in separate .npy files next to the model file.
wiki_iter.syn0 = synzero
# Per-dimension statistics of the embedding matrix: element-wise
# minimum, maximum and mean over all word vectors.
#
# Fix: the original looped over rows and initialized mindim/maxdim to
# zeros, which clips the results to <= 0 / >= 0 (a dimension whose
# values are all positive would report a minimum of 0). Reducing
# directly over the data with numpy avoids that and the Python loop.
mindim = synzero.min(axis=0)
maxdim = synzero.max(axis=0)
# average of the per-dimension extremes (inspected interactively)
np.average(maxdim)
np.average(mindim)
# mean value of each dimension across the whole vocabulary
# (accumulate in float64 as the original zeros(300) buffer did)
avedim = synzero.mean(axis=0, dtype=np.float64)
avedim
min(avedim)
max(avedim)
# Min / max / mean of the L2 norms of all word vectors.
#
# Bug fix: the original accumulated into minnorm/maxnorm/avenorm inside
# the loop without ever initializing them, which raises NameError on the
# first iteration (it only worked in an interactive session where the
# names happened to exist already). Compute all row norms in one
# vectorized call instead.
norms = np.linalg.norm(synzero, axis=1)
minnorm = norms.min()
maxnorm = norms.max()
avenorm = norms.mean()
# Distance between "il/un PM italiano" (the/a Italian PM) and
# "renzi"/"andreotti": the query phrase is represented as the plain
# average of its word vectors.
np.linalg.norm((wiki_iter['il'] + wiki_iter['primo'] + wiki_iter['ministro'] + wiki_iter['italiano'] )/4. - (wiki_iter['renzi']))
np.linalg.norm((wiki_iter['il'] + wiki_iter['primo'] + wiki_iter['ministro'] + wiki_iter['italiano'] )/4. - (wiki_iter['andreotti']))
np.linalg.norm((wiki_iter['un'] + wiki_iter['primo'] + wiki_iter['ministro'] + wiki_iter['italiano'] )/4. - (wiki_iter['renzi']))
np.linalg.norm((wiki_iter['un'] + wiki_iter['primo'] + wiki_iter['ministro'] + wiki_iter['italiano'] )/4. - (wiki_iter['andreotti']))
# ^ conclusion: including the article (il/un) does not help
# Two sanity checks that averaging the vectors of a significant n-gram works:
# the distance between avg("sgabello", "schienale") (stool, backrest) and "sedia" (chair)...
np.linalg.norm((wiki_iter['sgabello'] + wiki_iter['schienale'])/2. - wiki_iter['sedia'])
# ...is similar to the distance between two actual synonyms (calcolatore/computer):
np.linalg.norm(wiki_iter['calcolatore'] - wiki_iter['computer'])
# Now, three sentences. The first and second have the same meaning.
# The third shares words with the second but has a completely different meaning.
# (they are stored as lists of tokens, without stopwords)
frase1 = ['renzi', 'parla', 'milano'] # "renzi parla a milano" (Renzi speaks in Milan)
frase2 = ['presidente', 'consiglio', 'italiano', 'tiene', 'discorso', 'capoluogo', 'lombardo'] # "il presidente del consiglio tiene un discorso nel capoluogo lombardo" (the PM gives a speech in the Lombard capital)
frase3 = ['consiglio', 'tenere', 'occupato', 'cervello'] # "consiglio di tenere occupato il cervello" (I advise keeping the brain busy)
# first, some tests: pairwise distances between words of frase3 and frase2,
# to see how far apart the "false friend" tokens really are
np.linalg.norm(wiki_iter['occupato'] - wiki_iter['discorso'])
np.linalg.norm(wiki_iter['occupato'] - wiki_iter['capoluogo'])
np.linalg.norm(wiki_iter['occupato'] - wiki_iter['lombardo'])
np.linalg.norm(wiki_iter['cervello'] - wiki_iter['discorso'])
np.linalg.norm(wiki_iter['cervello'] - wiki_iter['capoluogo'])
np.linalg.norm(wiki_iter['cervello'] - wiki_iter['lombardo'])
# Offline we discovered that "presidente consiglio italiano", "tiene discorso" and "capoluogo lombardo" have a very high LLR (log-likelihood ratio), so each must be treated as a single entity.
# Therefore frase2 becomes [average("presidente consiglio italiano"), average("tiene discorso"), average("capoluogo lombardo")]
# Roughly following "From Word Embeddings To Document Distances" (we should use the Wasserstein distance, but in this case we have 3-word sentences, and each word in a sentence has a minimum distance to the positionally correspondent one in the other sentence, so it is easily done), we measure the distance btw frase1 and frase2:
# Each sentence as a list of three vectors: single-word embeddings, or the
# average of the embeddings of an LLR-detected n-gram.
llr_frase1 = [wiki_iter['renzi'], wiki_iter['parla'], wiki_iter['milano']]
llr_frase2 = [(wiki_iter['presidente'] + wiki_iter['consiglio'] + wiki_iter['italiano'])/3., (wiki_iter['tiene'] + wiki_iter['discorso'])/2., (wiki_iter['capoluogo'] + wiki_iter['lombardo'])/2.]
llr_frase3 = [wiki_iter['consiglio'], (wiki_iter['tenere'] + wiki_iter['occupato'])/2., wiki_iter['cervello']]
# "Smart" sentence distances: sum of the Euclidean distances between
# positionally corresponding LLR units of the two sentences
# (a degenerate word-mover's distance for aligned 3-unit sentences).
smart12 = np.linalg.norm((wiki_iter['presidente'] + wiki_iter['consiglio'] + wiki_iter['italiano'])/3. - \
wiki_iter['renzi']) + \
np.linalg.norm((wiki_iter['tiene'] + wiki_iter['discorso'])/2. - wiki_iter['parla']) + \
np.linalg.norm((wiki_iter['capoluogo'] + wiki_iter['lombardo'])/2. - \
wiki_iter['milano'])
smart13 = np.linalg.norm(wiki_iter['renzi'] - wiki_iter['consiglio']) + \
np.linalg.norm(wiki_iter['parla'] - (wiki_iter['tenere'] + wiki_iter['occupato'])/2.) + \
np.linalg.norm(wiki_iter['milano'] - wiki_iter['cervello'])
smart23 = np.linalg.norm((wiki_iter['presidente'] + wiki_iter['consiglio'] + wiki_iter['italiano'])/3. - \
wiki_iter['consiglio']) + \
np.linalg.norm((wiki_iter['tenere'] + wiki_iter['occupato'])/2. - \
(wiki_iter['tiene'] + wiki_iter['discorso'])/2.) + \
np.linalg.norm(wiki_iter['cervello'] - (wiki_iter['capoluogo'] + wiki_iter['lombardo'])/2.)
def total(frase: list, normalize: bool, words: bool = True) -> np.ndarray:
    '''Sum a sentence's embedding vectors into a single float64 vector.

    frase: with words=True, a list of tokens, e.g. ["renzi", "parla"],
           each looked up in the global wiki_iter model; with
           words=False, a list of embedding vectors, e.g.
           [wiki_iter["renzi"], wiki_iter["parla"]].
    normalize: if True, scale the summed vector to unit L2 norm.
    Returns the (optionally normalized) sum as an ndarray.

    Fixes: the return annotation was np.array (a function, not a type);
    the builtin sum() over a stacked array is replaced by the idiomatic
    np.sum(..., axis=0).
    '''
    if words:
        vectors = [wiki_iter[w].astype(np.float64) for w in frase]
    else:
        vectors = [w.astype(np.float64) for w in frase]
    frase_array = np.sum(vectors, axis=0)
    if normalize:
        return frase_array / np.linalg.norm(frase_array)
    return frase_array
def make_distance_matrix(dim=300):
    '''Return the dim x dim "ground distance" matrix used by EMD over
    the embedding dimensions: 1.0 between any two distinct dimensions
    and 0.0 on the diagonal (all dimensions equally far apart).

    Replaces the original O(dim^2) Python loop over index pairs from
    combinations_with_replacement with a direct numpy construction;
    the resulting matrix is identical.
    '''
    distance_matrix = np.ones((dim, dim), dtype=np.float64)
    np.fill_diagonal(distance_matrix, 0.0)
    return distance_matrix
# Build the ground-distance matrix and the sentence representations,
# then compare the three sentences under every combination of
# representation (plain vector sum vs. LLR n-gram sum, normalized or
# not) and metric (EMD vs. cosine distance).
distance_matrix = make_distance_matrix(300)
# n-prefixed variables are L2-normalized sums; f* use raw tokens,
# llr* use the pre-averaged LLR n-gram vectors defined above.
nf1 = total(frase1, normalize=True)
nf2 = total(frase2, normalize=True)
nf3 = total(frase3, normalize=True)
f1 = total(frase1, normalize=False)
f2 = total(frase2, normalize=False)
f3 = total(frase3, normalize=False)
nllr1 = total(llr_frase1, normalize=True, words=False)
nllr2 = total(llr_frase2, normalize=True, words=False)
nllr3 = total(llr_frase3, normalize=True, words=False)
llr1 = total(llr_frase1, normalize=False, words=False)
llr2 = total(llr_frase2, normalize=False, words=False)
llr3 = total(llr_frase3, normalize=False, words=False)
# Positionally-aligned LLR distances computed earlier (smart12/13/23).
print("1-2 LLR distance:", smart12)
print("1-3 LLR distance:", smart13)
print("2-3 LLR distance:", smart23)
# Earth mover's distance between sentence vectors, treating each of the
# 300 dimensions as a "bin" with unit ground distance between bins.
print("1-2 EMD normalized vec sum", emd(nf1, nf2, distance_matrix))
print("1-3 EMD normalized vec sum", emd(nf1, nf3, distance_matrix))
print("2-3 EMD normalized vec sum", emd(nf2, nf3, distance_matrix))
print("1-2 EMD vec sum", emd(f1, f2, distance_matrix))
print("1-3 EMD vec sum", emd(f1, f3, distance_matrix))
print("2-3 EMD vec sum", emd(f2, f3, distance_matrix))
print("1-2 EMD LLR, normalized sum", emd(nllr1, nllr2, distance_matrix))
print("1-3 EMD LLR, normalized sum", emd(nllr1, nllr3, distance_matrix))
print("2-3 EMD LLR, normalized sum", emd(nllr2, nllr3, distance_matrix))
print("1-2 EMD LLR, vec sum", emd(llr1, llr2, distance_matrix))
print("1-3 EMD LLR, vec sum", emd(llr1, llr3, distance_matrix))
print("2-3 EMD LLR, vec sum", emd(llr2, llr3, distance_matrix))
# Cosine distances over the same representations for comparison.
print("1-2 COSINE normalized vec sum", spatial.distance.cosine(nf1, nf2))
print("1-3 COSINE normalized vec sum", spatial.distance.cosine(nf1, nf3))
print("2-3 COSINE normalized vec sum", spatial.distance.cosine(nf2, nf3))
print("1-2 COSINE vec sum", spatial.distance.cosine(f1, f2))
print("1-3 COSINE vec sum", spatial.distance.cosine(f1, f3))
print("2-3 COSINE vec sum", spatial.distance.cosine(f2, f3))
print("1-2 COSINE LLR, normalized sum", spatial.distance.cosine(nllr1, nllr2))
print("1-3 COSINE LLR, normalized sum", spatial.distance.cosine(nllr1, nllr3))
print("2-3 COSINE LLR, normalized sum", spatial.distance.cosine(nllr2, nllr3))
print("1-2 COSINE LLR, vec sum", spatial.distance.cosine(llr1, llr2))
print("1-3 COSINE LLR, vec sum", spatial.distance.cosine(llr1, llr3))
print("2-3 COSINE LLR, vec sum", spatial.distance.cosine(llr2, llr3))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment