Last active
January 4, 2016 23:51
-
-
Save bcleenders/f450621e3f8d828ef755 to your computer and use it in GitHub Desktop.
Multi model translation & benchmarking
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Translate using a multiple models and a translation matrix | |
# Uses a translation matrix to convert the word-vector from one language to the other | |
# i.e. vec("koning")*matrix = vec("king") | |
import sys | |
import getopt | |
import collections | |
import random | |
import numpy as np | |
from utils import read_dict, train_tm, apply_tm, score, get_valid_data | |
from tsne import bh_sne | |
import csv | |
import codecs | |
import unicodedata | |
import re | |
import pandas as pd | |
from gensim.models import Word2Vec | |
# If you want to run with two separate models (one per language):
#model_nl = Word2Vec.load_word2vec_format('models/nlwiki_lowercase.model', binary=False) # C text format
#model_en = Word2Vec.load_word2vec_format('models/enwiki_lowercase.model', binary=False, encoding='latin1') # C text format
# We also did it with different test sets
#model_nl = Word2Vec.load_word2vec_format('models/nlwiki_lowercase_400.model', binary=False) # C text format
# model_en = Word2Vec.load_word2vec_format('models/enwiki_lowercase_400.model', binary=False, encoding='latin1') # C text format
# If you want to run with a single model that's used for both input and output
# (presumably trained on the combined Dutch+English wikis — confirm against training setup)
model_both = Word2Vec.load_word2vec_format('models/bothwiki_lowercase_400.model', binary=False, encoding='latin1') # C text format
#model_both = Word2Vec.load_word2vec_format('models/bothwiki_400.model', binary=False, encoding='latin1') # C text format
# Instead of loading two separate models, store this model in both variables so the
# rest of the script can uniformly refer to model_nl (source) and model_en (target).
model_nl = model_both
model_en = model_both
# Find the words closest to the transformed vector | |
def top_translations(w, translation_matrix, topn=5):
    """Project the source-language word *w* into the target embedding space
    via the translation matrix and return the `topn` nearest target words."""
    projected = model_nl[w].dot(translation_matrix)
    neighbours = model_en.most_similar([projected], topn=topn)
    return [word for word, _similarity in neighbours]
def get_rank(nn, gold):
    """Return the 1-based rank of the first candidate in *nn* that appears
    in *gold* (the collection of acceptable answers).

    If no candidate matches, return len(nn) + 1 (one past the worst rank).
    The original fell through to `return idx + 1`, which returned len(nn)
    on a miss — indistinguishable from a hit at the last position — and
    raised UnboundLocalError when *nn* was empty.
    """
    for idx, word in enumerate(nn):
        if word in gold:
            return idx + 1
    return len(nn) + 1
def strip_accents(s):
    """Return *s* with all combining accent marks removed (e.g. 'café' -> 'cafe').

    Accepts either UTF-8 encoded bytes or already-decoded text. The original
    unconditionally called unicode(s, "utf-8"), which raises a TypeError when
    handed text that is already decoded — exactly what the codecs/pandas
    readers in this file produce.
    """
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    # NFD splits each accented character into base char + combining mark ('Mn'),
    # so dropping the 'Mn' codepoints leaves the bare base characters.
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')
def clean(dirty):
    """Normalise a token: strip accents, then drop every remaining
    non-alphanumeric character (underscores are removed as well)."""
    simplified = strip_accents(dirty)
    # Raw string so the backslash reaches the regex engine untouched
    # ('[\W_]+' relies on Python passing an unknown escape through,
    # which is a DeprecationWarning in modern interpreters).
    return re.sub(r'[\W_]+', '', simplified)
# print out source word and translation | |
def get_data():
    """Build the list of (source_word, [acceptable_translations]) pairs.

    Merges two data sources — the 'dutch2.txt' dictionary and the
    'did_it_work.csv' word-pair file. A pair is kept only when both words
    are in the models' vocabularies and neither contains a space
    (multi-word phrases have no single word vector).
    """
    # Maps a source word to the list of acceptable target translations.
    translations = {}

    def _add_pair(source, target):
        # Keep only pairs both models can embed; skip multi-word entries.
        if source in model_nl and target in model_en and ' ' not in source and ' ' not in target:
            translations.setdefault(source, []).append(target)

    # Source 1: plain-text training dictionary.
    for source, target in read_dict('dutch2.txt'):
        _add_pair(source, target)

    # Source 2: semicolon-separated CSV; only the 'en' and 'nl' columns matter.
    # (with-block closes the file, which the original leaked.)
    with codecs.open('did_it_work.csv', 'r', 'utf-8') as word_pairs:
        dic2 = pd.read_csv(word_pairs, sep=';', header=None, names=['en', 'nl', 'drop', 'drop', 'drop', 'drop', 'drop'])[['en', 'nl']]
    for n in range(len(dic2['nl'])):
        _add_pair(dic2['nl'][n], dic2['en'][n])

    # Flatten the dict into a list of (source, targets) pairs.
    return [(source, targets) for source, targets in translations.items()]
def get_translation_matrix(translations, printing=True):
    """Fit the linear map W such that source_vec . W ~= target_vec.

    `translations` is a list of (source, [targets...]) pairs; every
    (source, target) combination contributes one training row.
    `printing` is accepted for interface compatibility but unused here.
    """
    source_rows = []
    target_rows = []
    for source, targets in translations:
        source_vec = model_nl[source]
        for target in targets:
            source_rows.append(source_vec)
            target_rows.append(model_en[target])
    # W as derived in http://stackoverflow.com/questions/27980159/fit-a-linear-transformation-in-python
    # (the pseudo-inverse formulation below is the equivalent closed form):
    # translation_matrix = np.linalg.pinv(source_rows).dot(target_rows).T
    solution, _residuals, _rank, _singular_values = np.linalg.lstsq(source_rows, target_rows, -1)
    return solution
def benchmark(translations, translation_matrix, topn, printing=True): | |
correct = np.zeros(len(topn)) | |
test = 0 | |
topn.sort() | |
for (source, targets) in translations: | |
test = test + 1 | |
answers = top_translations(source, translation_matrix, max(topn)) # check for largest element in the list | |
# Now check at what position the best translation is | |
# (For each target in targets, check what position it's in in the answers, and take the one with the lowest index) | |
best_position = min([(answers.index(target) if target in answers else max(topn) + 1) for target in targets]) | |
for i, n in enumerate(topn): | |
if best_position <= n: | |
correct[i] = correct[i] + 1 | |
if printing: | |
print "Correct translations: %s" % targets | |
print "Answered by algorithm: %s" % answers | |
print "Best translation at position: %s" % best_position | |
accuracy = [c / test for c in correct] | |
return accuracy | |
def display_translations(training_size=1500, topn=[1, 5, 10], printing=True): | |
# Get an array of inputs with their | |
translations = get_data() | |
if printing: | |
print "Dataset size: %s" % len(translations) | |
# Let's shuffle the pairs, so we don't always have the same training/test set | |
random.shuffle(translations) | |
if training_size > len(translations): | |
print 'Training size cannot be larger than total dataset size (out of bounds exception)' | |
exit() | |
training_set = translations[:training_size] | |
test_set = translations[training_size:] | |
matrix = get_translation_matrix(training_set, printing) | |
# print "Finished computing translation matrix" | |
return benchmark(test_set, matrix, topn, printing) | |
# Sweep the training-set size and report accuracy at the @1/@5/@10 cutoffs.
accuracies = {}
for i in range(500, 3000, 100):
    for n in xrange(10):
        # NOTE(review): each of the 10 repetitions overwrites accuracies[i],
        # so only the last run per size survives and the same size is printed
        # 10 times — presumably the intent was to average over repetitions
        # to smooth out the random train/test split; confirm.
        accuracies[i] = display_translations(training_size=i, printing=False)
        print "Training Size: %s, Accuracy @1, 5, 10: %s" % (i, accuracies[i])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment