Last active
January 4, 2016 23:51
-
-
Save bcleenders/f450621e3f8d828ef755 to your computer and use it in GitHub Desktop.
Multi model translation & benchmarking
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Translate using a multiple models and a translation matrix | |
# Uses a translation matrix to convert the word-vector from one language to the other | |
# i.e. vec("koning")*matrix = vec("king") | |
import sys | |
import getopt | |
import collections | |
import random | |
import numpy as np | |
from utils import read_dict, train_tm, apply_tm, score, get_valid_data | |
from tsne import bh_sne | |
import csv | |
import codecs | |
import unicodedata | |
import re | |
import pandas as pd | |
from gensim.models import Word2Vec | |
# If you want to run with two separate models (one per language):
#model_nl = Word2Vec.load_word2vec_format('models/nlwiki_lowercase.model', binary=False) # C text format
#model_en = Word2Vec.load_word2vec_format('models/enwiki_lowercase.model', binary=False, encoding='latin1') # C text format
# We also did it with different test sets
#model_nl = Word2Vec.load_word2vec_format('models/nlwiki_lowercase_400.model', binary=False) # C text format
# model_en = Word2Vec.load_word2vec_format('models/enwiki_lowercase_400.model', binary=False, encoding='latin1') # C text format
# If you want to run with a single model that's used for both input and output
# (presumably trained on the combined Dutch+English wikis — confirm against training setup)
model_both = Word2Vec.load_word2vec_format('models/bothwiki_lowercase_400.model', binary=False, encoding='latin1') # C text format
#model_both = Word2Vec.load_word2vec_format('models/bothwiki_400.model', binary=False, encoding='latin1') # C text format
# Instead of loading two separate models, store this model in both variables so the
# rest of the script can uniformly refer to model_nl (source) and model_en (target).
model_nl = model_both
model_en = model_both
# Find the words closest to the transformed vector | |
def top_translations(w, translation_matrix, topn=5):
    """Project the source-language word *w* into the target embedding space
    via the translation matrix and return the `topn` nearest target words."""
    projected = model_nl[w].dot(translation_matrix)
    neighbours = model_en.most_similar([projected], topn=topn)
    return [word for word, _similarity in neighbours]
def get_rank(nn, gold):
    """Return the 1-based rank of the first candidate in *nn* that appears
    in *gold* (the collection of acceptable answers).

    If no candidate matches, return len(nn) + 1 (one past the worst rank).
    The original fell through to `return idx + 1`, which returned len(nn)
    on a miss — indistinguishable from a hit at the last position — and
    raised UnboundLocalError when *nn* was empty.
    """
    for idx, word in enumerate(nn):
        if word in gold:
            return idx + 1
    return len(nn) + 1
def strip_accents(s):
    """Return *s* with all combining accent marks removed (e.g. 'café' -> 'cafe').

    Accepts either UTF-8 encoded bytes or already-decoded text. The original
    unconditionally called unicode(s, "utf-8"), which raises a TypeError when
    handed text that is already decoded — exactly what the codecs/pandas
    readers in this file produce.
    """
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    # NFD splits each accented character into base char + combining mark ('Mn'),
    # so dropping the 'Mn' codepoints leaves the bare base characters.
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')
def clean(dirty):
    """Normalise a token: strip accents, then drop every remaining
    non-alphanumeric character (underscores are removed as well)."""
    simplified = strip_accents(dirty)
    # Raw string so the backslash reaches the regex engine untouched
    # ('[\W_]+' relies on Python passing an unknown escape through,
    # which is a DeprecationWarning in modern interpreters).
    return re.sub(r'[\W_]+', '', simplified)
# print out source word and translation | |
def get_data():
    """Build the list of (source_word, [acceptable_translations]) pairs.

    Merges two data sources — the 'dutch2.txt' dictionary and the
    'did_it_work.csv' word-pair file. A pair is kept only when both words
    are in the models' vocabularies and neither contains a space
    (multi-word phrases have no single word vector).
    """
    # Maps a source word to the list of acceptable target translations.
    translations = {}

    def _add_pair(source, target):
        # Keep only pairs both models can embed; skip multi-word entries.
        if source in model_nl and target in model_en and ' ' not in source and ' ' not in target:
            translations.setdefault(source, []).append(target)

    # Source 1: plain-text training dictionary.
    for source, target in read_dict('dutch2.txt'):
        _add_pair(source, target)

    # Source 2: semicolon-separated CSV; only the 'en' and 'nl' columns matter.
    # (with-block closes the file, which the original leaked.)
    with codecs.open('did_it_work.csv', 'r', 'utf-8') as word_pairs:
        dic2 = pd.read_csv(word_pairs, sep=';', header=None, names=['en', 'nl', 'drop', 'drop', 'drop', 'drop', 'drop'])[['en', 'nl']]
    for n in range(len(dic2['nl'])):
        _add_pair(dic2['nl'][n], dic2['en'][n])

    # Flatten the dict into a list of (source, targets) pairs.
    return [(source, targets) for source, targets in translations.items()]
def get_translation_matrix(translations, printing=True):
    """Fit the linear map W such that source_vec . W ~= target_vec.

    `translations` is a list of (source, [targets...]) pairs; every
    (source, target) combination contributes one training row.
    `printing` is accepted for interface compatibility but unused here.
    """
    source_rows = []
    target_rows = []
    for source, targets in translations:
        source_vec = model_nl[source]
        for target in targets:
            source_rows.append(source_vec)
            target_rows.append(model_en[target])
    # W as derived in http://stackoverflow.com/questions/27980159/fit-a-linear-transformation-in-python
    # (the pseudo-inverse formulation below is the equivalent closed form):
    # translation_matrix = np.linalg.pinv(source_rows).dot(target_rows).T
    solution, _residuals, _rank, _singular_values = np.linalg.lstsq(source_rows, target_rows, -1)
    return solution
def benchmark(translations, translation_matrix, topn, printing=True): | |
correct = np.zeros(len(topn)) | |
test = 0 | |
topn.sort() | |
for (source, targets) in translations: | |
test = test + 1 | |
answers = top_translations(source, translation_matrix, max(topn)) # check for largest element in the list | |
# Now check at what position the best translation is | |
# (For each target in targets, check what position it's in in the answers, and take the one with the lowest index) | |
best_position = min([(answers.index(target) if target in answers else max(topn) + 1) for target in targets]) | |
for i, n in enumerate(topn): | |
if best_position <= n: | |
correct[i] = correct[i] + 1 | |
if printing: | |
print "Correct translations: %s" % targets | |
print "Answered by algorithm: %s" % answers | |
print "Best translation at position: %s" % best_position | |
accuracy = [c / test for c in correct] | |
return accuracy | |
def display_translations(training_size=1500, topn=[1, 5, 10], printing=True): | |
# Get an array of inputs with their | |
translations = get_data() | |
if printing: | |
print "Dataset size: %s" % len(translations) | |
# Let's shuffle the pairs, so we don't always have the same training/test set | |
random.shuffle(translations) | |
if training_size > len(translations): | |
print 'Training size cannot be larger than total dataset size (out of bounds exception)' | |
exit() | |
training_set = translations[:training_size] | |
test_set = translations[training_size:] | |
matrix = get_translation_matrix(training_set, printing) | |
# print "Finished computing translation matrix" | |
return benchmark(test_set, matrix, topn, printing) | |
# Sweep the training-set size and report accuracy at the @1/@5/@10 cutoffs.
accuracies = {}
for i in range(500, 3000, 100):
    for n in xrange(10):
        # NOTE(review): each of the 10 repetitions overwrites accuracies[i],
        # so only the last run per size survives and the same size is printed
        # 10 times — presumably the intent was to average over repetitions
        # to smooth out the random train/test split; confirm.
        accuracies[i] = display_translations(training_size=i, printing=False)
        print "Training Size: %s, Accuracy @1, 5, 10: %s" % (i, accuracies[i])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment