Implementation of the IBM Model 1 expectation-maximization algorithm for learning word alignments.
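For orientation, this is the standard Model 1 picture (notation mine, not from the file below): the E-step weights each candidate alignment $a$ of a sentence pair $(\mathbf{s}, \mathbf{t})$ by the product of its link probabilities, and the M-step renormalizes the resulting expected counts per target word:

$$P(a \mid \mathbf{s}, \mathbf{t}) = \frac{\prod_{j} p(s_j \mid t_{a(j)})}{\sum_{a'} \prod_{j} p(s_j \mid t_{a'(j)})}
\qquad
p(s \mid t) \leftarrow \frac{\sum_{(\mathbf{s}, \mathbf{t})} \sum_{a} P(a \mid \mathbf{s}, \mathbf{t})\, \#_a(s, t)}{\sum_{s'} \sum_{(\mathbf{s}, \mathbf{t})} \sum_{a} P(a \mid \mathbf{s}, \mathbf{t})\, \#_a(s', t)}$$

where $\#_a(s, t)$ counts how many times alignment $a$ links source word $s$ to target word $t$. One caveat: the script enumerates only permutations of the target sentence as candidate alignments, a restriction of the full Model 1 alignment space that keeps the bookkeeping simple but grows factorially with sentence length.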
#!/usr/bin/env python3
"""An implementation of the IBM Model 1 expectation-maximization
algorithm for learning word alignments.
"""
from collections import defaultdict
import itertools
import math


def em_run(sentence_pairs):
    """Run expectation-maximization on a list of pairs of the form

        `(source_tokens, target_tokens)`

    where `source_tokens` is a list of tokens in the source language and
    `target_tokens` is a list of tokens for a translationally equivalent
    sentence in the target language.

    Returns a mapping `(t1, t2) => p` where `t1` is a source-language
    token, `t2` is a target-language token, and the value `p` represents
    $P(t1|t2)$.
    """
    source_sentences, target_sentences = zip(*sentence_pairs)

    source_vocabulary = set(itertools.chain.from_iterable(source_sentences))
    target_vocabulary = set(itertools.chain.from_iterable(target_sentences))

    # Value with which to initialize each conditional probability
    uniform_prob = 1.0 / len(source_vocabulary)

    conditional_probs_old = None
    conditional_probs = {(source_w, target_w): uniform_prob
                         for source_w in source_vocabulary
                         for target_w in target_vocabulary}

    # Enumerate every candidate alignment for each sentence pair up
    # front. Only one-to-one alignments (permutations of the target
    # sentence) are considered, so this is practical for toy inputs only.
    alignments = [[tuple(zip(source, target_perm))
                   for target_perm in itertools.permutations(target)]
                  for source, target in sentence_pairs]

    # Repeat until convergence, i.e., until the parameters reach a
    # fixed point
    while conditional_probs_old != conditional_probs:
        conditional_probs_old = conditional_probs.copy()

        # E-step, part 1: score each alignment by the product of its
        # link probabilities
        alignment_probs = {
            i: {alignment: math.prod(conditional_probs[pair]
                                     for pair in alignment)
                for alignment in sentence_alignments}
            for i, sentence_alignments in enumerate(alignments)
        }

        # E-step, part 2: normalize alignment scores within each
        # sentence so they form a probability distribution
        for sentence_idx, sentence_alignments in alignment_probs.items():
            total = sum(sentence_alignments.values())
            alignment_probs[sentence_idx] = {
                alignment: value / total
                for alignment, value in sentence_alignments.items()}

        # Now join all alignments and begin the maximization step: group
        # by target-language word and collect the corresponding
        # source-language probabilities as expected counts
        word_translations = defaultdict(lambda: defaultdict(float))
        for sentence_alignments in alignment_probs.values():
            for word_pairs, prob in sentence_alignments.items():
                for source_word, target_word in word_pairs:
                    word_translations[target_word][source_word] += prob

        # Now calculate the new conditional probability mapping,
        # ungrouping the `word_translations` tree and normalizing the
        # expected counts into conditional probabilities
        conditional_probs = {}
        for target_word, translations in word_translations.items():
            total = sum(translations.values())
            for source_word, score in translations.items():
                conditional_probs[source_word, target_word] = score / total

    return conditional_probs


def main():
    SENTENCES = [
        ('mi casa verde'.split(), 'my green house'.split()),
        ('casa verde'.split(), 'green house'.split()),
        ('la casa'.split(), 'the house'.split()),
    ]
    print(em_run(SENTENCES))


if __name__ == '__main__':
    main()
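A quick usage sketch (the module name ibm_model1 and the grouping code are mine, not part of the gist; it assumes the file above is saved as ibm_model1.py): extract, for each target word, the most probable source translation from the returned table.

from ibm_model1 import em_run

# Toy corpus from main(), repeated here so the snippet is self-contained
sentences = [
    ('mi casa verde'.split(), 'my green house'.split()),
    ('casa verde'.split(), 'green house'.split()),
    ('la casa'.split(), 'the house'.split()),
]

# Maps (source_word, target_word) -> P(source_word | target_word)
probs = em_run(sentences)

# For each target word, keep the source word with the highest probability
best = {}
for (source_word, target_word), p in probs.items():
    if p > best.get(target_word, (0.0, None))[0]:
        best[target_word] = (p, source_word)

for target_word, (p, source_word) in sorted(best.items()):
    print(f'{target_word} -> {source_word} (P = {p:.3f})')

On this corpus the probability mass should concentrate on pairs like ('casa', 'house'), since 'casa' co-occurs with 'house' in every sentence pair.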