Multi-model translation & benchmarking
# Translate using multiple models and a translation matrix.
# The translation matrix maps a word vector from one language to the other,
# i.e. vec("koning") * matrix ~= vec("king")
import sys
import getopt
import collections
import random
import numpy as np
from utils import read_dict, train_tm, apply_tm, score, get_valid_data
from tsne import bh_sne
import csv
import codecs
import unicodedata
import re
import pandas as pd
from gensim.models import Word2Vec
# If you want to run with two separate models:
#model_nl = Word2Vec.load_word2vec_format('models/nlwiki_lowercase.model', binary=False) # C text format
#model_en = Word2Vec.load_word2vec_format('models/enwiki_lowercase.model', binary=False, encoding='latin1') # C text format
# We also ran the same experiments with other model files:
#model_nl = Word2Vec.load_word2vec_format('models/nlwiki_lowercase_400.model', binary=False) # C text format
#model_en = Word2Vec.load_word2vec_format('models/enwiki_lowercase_400.model', binary=False, encoding='latin1') # C text format
# If you want to run with a single model that's used for both input and output:
model_both = Word2Vec.load_word2vec_format('models/bothwiki_lowercase_400.model', binary=False, encoding='latin1') # C text format
#model_both = Word2Vec.load_word2vec_format('models/bothwiki_400.model', binary=False, encoding='latin1') # C text format
# Instead of loading two separate models, store this model in both variables
model_nl = model_both
model_en = model_both

# Find the words closest to the transformed vector
def top_translations(w, translation_matrix, topn=5):
    vec = model_nl[w].dot(translation_matrix)
    return [i[0] for i in model_en.most_similar([vec], topn=topn)]
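
# Example (assuming 'koning' is in the Dutch model's vocabulary):
#   top_translations('koning', translation_matrix, topn=3)
# might return something like ['king', 'queen', 'monarch'].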

# 1-based rank of the first candidate in nn that matches a gold translation
def get_rank(nn, gold):
    for idx, word in enumerate(nn):
        if word in gold:
            return idx + 1
    # No candidate matched: return a rank just past the end of the list
    return len(nn) + 1

# Decompose accented characters (NFD) and drop the combining marks
def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', unicode(s, "utf-8"))
                   if unicodedata.category(c) != 'Mn')

def clean(dirty):
    # Reduce accented characters to their plain base characters
    cleaned = strip_accents(dirty)
    # Remove all remaining non-alphanumeric characters (including underscores)
    return re.sub('[\W_]+', '', cleaned)
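
# Example: clean('caf\xc3\xa9!') (UTF-8 bytes for 'cafe' with an accented e)
# returns 'cafe': NFD splits the accent off as a combining mark (category
# 'Mn'), which strip_accents drops, and the regex then removes the '!'.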

# Build a list of (source word, possible translations) pairs from both data files
def get_data():
    # Maps a source word to a list of its possible translations (targets)
    translations = {}
    # Read the training data file
    train_data = read_dict('dutch2.txt')
    for source, target in train_data:
        # Only process pairs where both models know the (single-word) entries
        if source in model_nl and target in model_en and ' ' not in source and ' ' not in target:
            if source in translations:
                translations[source].append(target)
            else:
                translations[source] = [target]
    # Read the CSV file; only the first two columns (en, nl) are kept
    word_pairs = codecs.open('did_it_work.csv', 'r', 'utf-8')
    dic2 = pd.read_csv(word_pairs, sep=';', header=None,
                       names=['en', 'nl', 'd0', 'd1', 'd2', 'd3', 'd4'])[['en', 'nl']]
    for n in range(len(dic2['nl'])):
        source = dic2['nl'][n]
        target = dic2['en'][n]
        if source in model_nl and target in model_en and ' ' not in source and ' ' not in target:
            if source in translations:
                translations[source].append(target)
            else:
                translations[source] = [target]
    # Convert the dict to a list of (source, targets) pairs
    return [(source, translations[source]) for source in translations]
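
# get_data() yields pairs like ('koning', ['king']), or ('groot', ['big',
# 'large', 'great']) when a source word has several attested translations
# (illustrative examples; the actual pairs depend on the data files).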

def get_translation_matrix(translations, printing=True):
    matrix_train_source = []
    matrix_train_target = []
    for (source, targets) in translations:
        for target in targets:
            matrix_train_source.append(model_nl[source])
            matrix_train_target.append(model_en[target])
    # Fit W minimizing ||source * W - target||^2; matrix W is as given in
    # http://stackoverflow.com/questions/27980159/fit-a-linear-transformation-in-python
    # translation_matrix = np.linalg.pinv(matrix_train_source).dot(matrix_train_target).T
    translation_matrix = np.linalg.lstsq(matrix_train_source, matrix_train_target, -1)[0]
    return translation_matrix
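
# Sanity-check sketch (toy values): with X = np.identity(2) and
# Y = np.array([[0., 1.], [1., 0.]]), lstsq recovers W = [[0, 1], [1, 0]],
# and the commented pinv formula yields the same mapping (up to the transpose
# convention), since both solve the least-squares problem min ||X*W - Y||.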

def benchmark(translations, translation_matrix, topn, printing=True):
    correct = np.zeros(len(topn))
    test = 0
    topn.sort()
    for (source, targets) in translations:
        test = test + 1
        # Retrieve enough candidates to cover the largest cutoff in topn
        answers = top_translations(source, translation_matrix, max(topn))
        # 1-based rank of the best correct translation: for each target, find
        # its position in the answers (max(topn) + 1 if absent) and take the lowest
        best_position = min([(answers.index(target) + 1 if target in answers else max(topn) + 1) for target in targets])
        for i, n in enumerate(topn):
            if best_position <= n:
                correct[i] = correct[i] + 1
        if printing:
            print "Correct translations: %s" % targets
            print "Answered by algorithm: %s" % answers
            print "Best translation at position: %s" % best_position
    accuracy = [c / test for c in correct]
    return accuracy
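
# Example: benchmark(test_set, matrix, [1, 5, 10]) returns accuracies such as
# [0.21, 0.43, 0.52] -- the fraction of test words whose correct translation
# shows up in the top 1, 5 and 10 candidates (numbers purely illustrative).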

def display_translations(training_size=1500, topn=[1, 5, 10], printing=True):
    # Get an array of (source word, possible translations) pairs
    translations = get_data()
    if printing:
        print "Dataset size: %s" % len(translations)
    # Shuffle the pairs, so we don't always get the same training/test split
    random.shuffle(translations)
    if training_size > len(translations):
        print 'Training size cannot be larger than the total dataset size'
        sys.exit(1)
    training_set = translations[:training_size]
    test_set = translations[training_size:]
    matrix = get_translation_matrix(training_set, printing)
    # print "Finished computing translation matrix"
    return benchmark(test_set, matrix, topn, printing)

# Benchmark accuracy@1/5/10 over a range of training-set sizes; each size is
# run ten times with a fresh shuffle (the dict keeps only the last run per size)
accuracies = {}
for i in range(500, 3000, 100):
    for n in xrange(10):
        accuracies[i] = display_translations(training_size=i, printing=False)
        print "Training Size: %s, Accuracy @1, 5, 10: %s" % (i, accuracies[i])