Implementation of the IBM Model 1 expectation-maximization algorithm for learning word alignments.
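For orientation, this is the standard Model 1 picture (notation mine, not from the file below): the E-step weights each candidate alignment $a$ of a sentence pair $(\mathbf{s}, \mathbf{t})$ by the product of its link probabilities, and the M-step renormalizes the resulting expected counts per target word:

$$P(a \mid \mathbf{s}, \mathbf{t}) = \frac{\prod_{j} p(s_j \mid t_{a(j)})}{\sum_{a'} \prod_{j} p(s_j \mid t_{a'(j)})}
\qquad
p(s \mid t) \leftarrow \frac{\sum_{(\mathbf{s}, \mathbf{t})} \sum_{a} P(a \mid \mathbf{s}, \mathbf{t})\, \#_a(s, t)}{\sum_{s'} \sum_{(\mathbf{s}, \mathbf{t})} \sum_{a} P(a \mid \mathbf{s}, \mathbf{t})\, \#_a(s', t)}$$

where $\#_a(s, t)$ counts how many times alignment $a$ links source word $s$ to target word $t$. One caveat: the script enumerates only permutations of the target sentence as candidate alignments, a restriction of the full Model 1 alignment space that keeps the bookkeeping simple but grows factorially with sentence length.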
#!/usr/bin/env python3
"""An implementation of the IBM Model 1 expectation-maximization
algorithm for learning word alignments.
"""
from collections import defaultdict
import itertools
import math


def em_run(sentence_pairs):
    """Run expectation-maximization on a list of pairs of the form

        `(source_tokens, target_tokens)`

    where `source_tokens` is a list of tokens in the source language and
    `target_tokens` is a list of tokens for a translationally equivalent
    sentence in the target language.

    Returns a mapping `(t1, t2) => p` where `t1` is a source-language
    token, `t2` is a target-language token, and the value `p` represents
    $P(t1|t2)$.
    """
    source_sentences, target_sentences = zip(*sentence_pairs)

    source_vocabulary = set(itertools.chain.from_iterable(source_sentences))
    target_vocabulary = set(itertools.chain.from_iterable(target_sentences))

    # Value with which to initialize each conditional probability
    uniform_prob = 1.0 / len(source_vocabulary)

    conditional_probs_old = None
    conditional_probs = {(source_w, target_w): uniform_prob
                         for source_w in source_vocabulary
                         for target_w in target_vocabulary}

    # Enumerate every candidate alignment for each sentence pair up
    # front. Only one-to-one alignments (permutations of the target
    # sentence) are considered, so this is practical for toy inputs only.
    alignments = [[tuple(zip(source, target_perm))
                   for target_perm in itertools.permutations(target)]
                  for source, target in sentence_pairs]

    # Repeat until convergence, i.e., until the parameters reach a
    # fixed point
    while conditional_probs_old != conditional_probs:
        conditional_probs_old = conditional_probs.copy()

        # E-step, part 1: score each alignment by the product of its
        # link probabilities
        alignment_probs = {
            i: {alignment: math.prod(conditional_probs[pair]
                                     for pair in alignment)
                for alignment in sentence_alignments}
            for i, sentence_alignments in enumerate(alignments)
        }

        # E-step, part 2: normalize alignment scores within each
        # sentence so they form a probability distribution
        for sentence_idx, sentence_alignments in alignment_probs.items():
            total = sum(sentence_alignments.values())
            alignment_probs[sentence_idx] = {
                alignment: value / total
                for alignment, value in sentence_alignments.items()}

        # Now join all alignments and begin the maximization step: group
        # by target-language word and collect the corresponding
        # source-language probabilities as expected counts
        word_translations = defaultdict(lambda: defaultdict(float))
        for sentence_alignments in alignment_probs.values():
            for word_pairs, prob in sentence_alignments.items():
                for source_word, target_word in word_pairs:
                    word_translations[target_word][source_word] += prob

        # Now calculate the new conditional probability mapping,
        # ungrouping the `word_translations` tree and normalizing the
        # expected counts into conditional probabilities
        conditional_probs = {}
        for target_word, translations in word_translations.items():
            total = sum(translations.values())
            for source_word, score in translations.items():
                conditional_probs[source_word, target_word] = score / total

    return conditional_probs


def main():
    SENTENCES = [
        ('mi casa verde'.split(), 'my green house'.split()),
        ('casa verde'.split(), 'green house'.split()),
        ('la casa'.split(), 'the house'.split()),
    ]
    print(em_run(SENTENCES))


if __name__ == '__main__':
    main()
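A quick usage sketch (the module name ibm_model1 and the grouping code are mine, not part of the gist; it assumes the file above is saved as ibm_model1.py): extract, for each target word, the most probable source translation from the returned table.

from ibm_model1 import em_run

# Toy corpus from main(), repeated here so the snippet is self-contained
sentences = [
    ('mi casa verde'.split(), 'my green house'.split()),
    ('casa verde'.split(), 'green house'.split()),
    ('la casa'.split(), 'the house'.split()),
]

# Maps (source_word, target_word) -> P(source_word | target_word)
probs = em_run(sentences)

# For each target word, keep the source word with the highest probability
best = {}
for (source_word, target_word), p in probs.items():
    if p > best.get(target_word, (0.0, None))[0]:
        best[target_word] = (p, source_word)

for target_word, (p, source_word) in sorted(best.items()):
    print(f'{target_word} -> {source_word} (P = {p:.3f})')

On this corpus the probability mass should concentrate on pairs like ('casa', 'house'), since 'casa' co-occurs with 'house' in every sentence pair.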