Tests on document similarity using word2vec and log-likelihood ratio (LLR). I used Italian because Italian language models are worse than English ones, and I want to test with something sub-optimal.
# python3
import numpy as np
from itertools import combinations_with_replacement
from pyemd import emd
from scipy import spatial
import gensim as gs
# raw weight matrices saved alongside the trained gensim model
synzero = np.load('data/it/wiki_iter.m.syn0.npy')
synoneneg = np.load('data/it/wiki_iter.m.syn1neg.npy')
table = np.load('data/it/wiki_iter.m.table.npy')
# load the model
wiki_iter = gs.models.Word2Vec.load('data/it/wiki_iter.m')
# wiki_iter = np.load("data/it/wiki_iter.m")
wiki_iter.syn0 = synzero
# range of each of the 300 dimensions across the vocabulary
mindim = np.full(300, np.inf)
maxdim = np.full(300, -np.inf)
for v in synzero:
    mindim = np.minimum(mindim, v)
    maxdim = np.maximum(maxdim, v)
# average of the per-dimension mins and maxs
np.average(maxdim)
np.average(mindim)
# average value of each dimension
avedim = np.zeros(300)
for v in synzero:
    avedim += v
avedim /= len(synzero)
avedim
min(avedim)
max(avedim)
# norms of all word vectors
minnorm = np.inf
maxnorm = 0.0
avenorm = 0.0
for v in synzero:
    minnorm = min(np.linalg.norm(v), minnorm)
    maxnorm = max(np.linalg.norm(v), maxnorm)
    avenorm += np.linalg.norm(v)
avenorm /= len(synzero)
# Distance between "il/un PM italiano" and "renzi"/"andreotti"
np.linalg.norm((wiki_iter['il'] + wiki_iter['primo'] + wiki_iter['ministro'] + wiki_iter['italiano'])/4. - (wiki_iter['renzi']))
np.linalg.norm((wiki_iter['il'] + wiki_iter['primo'] + wiki_iter['ministro'] + wiki_iter['italiano'])/4. - (wiki_iter['andreotti']))
np.linalg.norm((wiki_iter['un'] + wiki_iter['primo'] + wiki_iter['ministro'] + wiki_iter['italiano'])/4. - (wiki_iter['renzi']))
np.linalg.norm((wiki_iter['un'] + wiki_iter['primo'] + wiki_iter['ministro'] + wiki_iter['italiano'])/4. - (wiki_iter['andreotti']))
# ^ conclusion: including the article ("il"/"un") does not help
# two examples to check that averaging the vectors of "significant n-grams" works:
# the distance between the average of "sgabello" + "schienale" (stool + backrest) and "sedia" (chair)...
np.linalg.norm((wiki_iter['sgabello'] + wiki_iter['schienale'])/2. - wiki_iter['sedia'])
# ...is similar to the distance between two actual synonyms:
np.linalg.norm(wiki_iter['calcolatore'] - wiki_iter['computer'])
# Now, three sentences. The first and second have the same meaning.
# The third shares words with the second but has a completely different meaning.
# (they are stored as lists of tokens, without stopwords)
frase1 = ['renzi', 'parla', 'milano']  # "renzi parla a milano" (Renzi speaks in Milan)
frase2 = ['presidente', 'consiglio', 'italiano', 'tiene', 'discorso', 'capoluogo', 'lombardo']  # "il presidente del consiglio tiene un discorso nel capoluogo lombardo" (the Italian prime minister gives a speech in the Lombard capital)
frase3 = ['consiglio', 'tenere', 'occupato', 'cervello']  # "consiglio di tenere occupato il cervello" (I advise keeping the brain busy)
# first, some tests:
np.linalg.norm(wiki_iter['occupato'] - wiki_iter['discorso'])
np.linalg.norm(wiki_iter['occupato'] - wiki_iter['capoluogo'])
np.linalg.norm(wiki_iter['occupato'] - wiki_iter['lombardo'])
np.linalg.norm(wiki_iter['cervello'] - wiki_iter['discorso'])
np.linalg.norm(wiki_iter['cervello'] - wiki_iter['capoluogo'])
np.linalg.norm(wiki_iter['cervello'] - wiki_iter['lombardo'])
# Offline we discovered that "presidente consiglio italiano", "tiene discorso" and "capoluogo lombardo"
# have a very high LLR, so each group must be treated as a single entity.
# frase2 therefore becomes [average("presidente consiglio italiano"), average("tiene discorso"),
# average("capoluogo lombardo")]. A sketch of how such an LLR score can be computed follows.
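# --- Sketch (not part of the original gist): Dunning's log-likelihood ratio (G^2)
# for a candidate bigram, computed from corpus counts. The helper name llr_bigram
# and the example counts below are hypothetical placeholders; in practice the counts
# would come from the same Wikipedia corpus used to train the model.
def llr_bigram(count_w1w2, count_w1, count_w2, n_total):
    '''G^2 statistic for the 2x2 contingency table of a bigram (w1, w2).'''
    k = np.array([[count_w1w2,            count_w1 - count_w1w2],
                  [count_w2 - count_w1w2, n_total - count_w1 - count_w2 + count_w1w2]],
                 dtype=np.float64)
    total_count = k.sum()
    row = k.sum(axis=1, keepdims=True)
    col = k.sum(axis=0, keepdims=True)
    expected = row * col / total_count
    # convention: 0 * log(0) counts as 0
    with np.errstate(divide='ignore', invalid='ignore'):
        terms = np.where(k > 0, k * np.log(k / expected), 0.0)
    return 2.0 * terms.sum()
# e.g. llr_bigram(count_w1w2=80, count_w1=1200, count_w2=950, n_total=10**7)
# a high value suggests the pair (e.g. "capoluogo lombardo") behaves as a single collocation.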
# Roughly following "From Word Embeddings To Document Distances" (Kusner et al., 2015):
# we should use the Wasserstein distance, but here each sentence reduces to three chunks and
# each chunk is closest to the positionally corresponding chunk of the other sentence,
# so the transport problem reduces to summing those three distances.
# We measure the distance between frase1 and frase2 (and the other pairs) this way:
llr_frase1 = [wiki_iter['renzi'], wiki_iter['parla'], wiki_iter['milano']]
llr_frase2 = [(wiki_iter['presidente'] + wiki_iter['consiglio'] + wiki_iter['italiano'])/3., (wiki_iter['tiene'] + wiki_iter['discorso'])/2., (wiki_iter['capoluogo'] + wiki_iter['lombardo'])/2.]
llr_frase3 = [wiki_iter['consiglio'], (wiki_iter['tenere'] + wiki_iter['occupato'])/2., wiki_iter['cervello']]
smart12 = np.linalg.norm((wiki_iter['presidente'] + wiki_iter['consiglio'] + wiki_iter['italiano'])/3. - wiki_iter['renzi']) + \
          np.linalg.norm((wiki_iter['tiene'] + wiki_iter['discorso'])/2. - wiki_iter['parla']) + \
          np.linalg.norm((wiki_iter['capoluogo'] + wiki_iter['lombardo'])/2. - wiki_iter['milano'])
smart13 = np.linalg.norm(wiki_iter['renzi'] - wiki_iter['consiglio']) + \
          np.linalg.norm(wiki_iter['parla'] - (wiki_iter['tenere'] + wiki_iter['occupato'])/2.) + \
          np.linalg.norm(wiki_iter['milano'] - wiki_iter['cervello'])
smart23 = np.linalg.norm((wiki_iter['presidente'] + wiki_iter['consiglio'] + wiki_iter['italiano'])/3. - wiki_iter['consiglio']) + \
          np.linalg.norm((wiki_iter['tenere'] + wiki_iter['occupato'])/2. - (wiki_iter['tiene'] + wiki_iter['discorso'])/2.) + \
          np.linalg.norm(wiki_iter['cervello'] - (wiki_iter['capoluogo'] + wiki_iter['lombardo'])/2.)
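# --- Sketch (not in the original gist): the three sums above hard-code the positional
# pairing described in the comment. The hypothetical helper chunk_distance generalises it
# by pairing each chunk with its nearest chunk in the other sentence; it coincides with
# smart12/smart13/smart23 whenever the positional match is also the nearest one.
def chunk_distance(chunks_a, chunks_b):
    '''Sum, over the chunks of the first sentence, of the Euclidean distance
    to the closest chunk of the second sentence (a relaxed, one-sided matching).'''
    return sum(min(np.linalg.norm(a - b) for b in chunks_b) for a in chunks_a)
# e.g. chunk_distance(llr_frase1, llr_frase2), chunk_distance(llr_frase1, llr_frase3), ...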
def total(frase: list, normalize: bool, words: bool = True) -> np.ndarray:
    '''Sum the word vectors of a sentence, optionally L2-normalised.
    words=True:  frase is a list of tokens, e.g. ["renzi", "parla"]
    words=False: frase is a list of vectors, e.g. [wiki_iter["renzi"], wiki_iter["parla"]]
    '''
    if words:
        frase_array = sum(np.array([wiki_iter[w].astype(np.float64) for w in frase]))
    else:
        frase_array = sum(np.array([w.astype(np.float64) for w in frase]))
    if normalize:
        return frase_array / np.linalg.norm(frase_array)
    else:
        return frase_array
def make_distance_matrix(dim=300):
    '''0/1 ground-distance matrix between the embedding dimensions:
    0 on the diagonal, 1 everywhere else (used as the EMD ground distance).'''
    distance_matrix = np.zeros((dim, dim))
    for i, j in combinations_with_replacement(range(dim), 2):
        d = np.float64(i != j)
        distance_matrix[i][j] = d
        distance_matrix[j][i] = d
    return distance_matrix
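# --- Sketch (not in the original gist): this matrix is just the discrete 0/1 ground
# distance over the 300 dimensions, so it can be written in one line as 1.0 - np.eye(300).
# With this ground distance, the EMD between two histograms of equal total mass reduces
# to half their L1 distance.
assert np.array_equal(make_distance_matrix(300), 1.0 - np.eye(300))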
distance_matrix = make_distance_matrix(300)
nf1 = total(frase1, normalize=True)
nf2 = total(frase2, normalize=True)
nf3 = total(frase3, normalize=True)
f1 = total(frase1, normalize=False)
f2 = total(frase2, normalize=False)
f3 = total(frase3, normalize=False)
nllr1 = total(llr_frase1, normalize=True, words=False)
nllr2 = total(llr_frase2, normalize=True, words=False)
nllr3 = total(llr_frase3, normalize=True, words=False)
llr1 = total(llr_frase1, normalize=False, words=False)
llr2 = total(llr_frase2, normalize=False, words=False)
llr3 = total(llr_frase3, normalize=False, words=False)
print("1-2 LLR distance:", smart12) | |
print("1-3 LLR distance:", smart13) | |
print("2-3 LLR distance:", smart23) | |
print("1-2 EMD normalized vec sum", emd(nf1, nf2, distance_matrix)) | |
print("1-3 EMD normalized vec sum", emd(nf1, nf3, distance_matrix)) | |
print("2-3 EMD normalized vec sum", emd(nf2, nf3, distance_matrix)) | |
print("1-2 EMD vec sum", emd(f1, f2, distance_matrix)) | |
print("1-3 EMD vec sum", emd(f1, f3, distance_matrix)) | |
print("2-3 EMD vec sum", emd(f2, f3, distance_matrix)) | |
print("1-2 EMD LLR, normalized sum", emd(nllr1, nllr2, distance_matrix)) | |
print("1-3 EMD LLR, normalized sum", emd(nllr1, nllr3, distance_matrix)) | |
print("2-3 EMD LLR, normalized sum", emd(nllr2, nllr3, distance_matrix)) | |
print("1-2 EMD LLR, vec sum", emd(llr1, llr2, distance_matrix)) | |
print("1-3 EMD LLR, vec sum", emd(llr1, llr3, distance_matrix)) | |
print("2-3 EMD LLR, vec sum", emd(llr2, llr3, distance_matrix)) | |
print("1-2 COSINE normalized vec sum", spatial.distance.cosine(nf1, nf2)) | |
print("1-3 COSINE normalized vec sum", spatial.distance.cosine(nf1, nf3)) | |
print("2-3 COSINE normalized vec sum", spatial.distance.cosine(nf2, nf3)) | |
print("1-2 COSINE vec sum", spatial.distance.cosine(f1, f2)) | |
print("1-3 COSINE vec sum", spatial.distance.cosine(f1, f3)) | |
print("2-3 COSINE vec sum", spatial.distance.cosine(f2, f3)) | |
print("1-2 COSINE LLR, normalized sum", spatial.distance.cosine(nllr1, nllr2)) | |
print("1-3 COSINE LLR, normalized sum", spatial.distance.cosine(nllr1, nllr3)) | |
print("2-3 COSINE LLR, normalized sum", spatial.distance.cosine(nllr2, nllr3)) | |
print("1-2 COSINE LLR, vec sum", spatial.distance.cosine(llr1, llr2)) | |
print("1-3 COSINE LLR, vec sum", spatial.distance.cosine(llr1, llr3)) | |
print("2-3 COSINE LLR, vec sum", spatial.distance.cosine(llr2, llr3)) | |