DarkDimius · April 4, 2013 21:44
diff --git a/gistfile1.py b/gistfile1.py
 from nltk.tokenize import word_tokenize
 from math import erf, log
 import sys
 import numpy


 # Command-line arguments, read at import time: the English input file,
 # the Russian input file, and the file the alignment is written to.
 eng_filename = sys.argv[1]
 ru_filename = sys.argv[2]
 output_filename = sys.argv[3]

 # Large finite number standing in for "infinite" cost.
 signal_inf = 1000000000
 # Expected number of Russian characters per English character; score()
 # multiplies the English character count by this ratio.
 mean_number_of_symbols_eng_symbol_will_translate_to = 0.96

 # Prior probabilities returned by a_priory_probability(k, l), where k counts
 # English sentences and l counts Russian sentences:
 #   (1, 1)           -> prob_of_parallel_sentence
 #   (1, 0) or (0, 1) -> prob_of_loosing_or_creating_sentence
 #   (1, 2)           -> prob_of_translating_single_russian_sentence_as_two
 # NOTE(review): despite its name, the last constant is used for one English
 # sentence paired with two Russian sentences.
 prob_of_parallel_sentence = 0.9
 prob_of_loosing_or_creating_sentence = 0.01
 prob_of_translating_single_russian_sentence_as_two = 0.09


 def getBackPointers(ru_lines, eng_lines, score_fn=None):
     """Fill the alignment cost table and return its back-pointer tables.

     Cell (i, j) of the internal table holds the lowest cumulative cost of
     aligning the first i English lines with the first j Russian lines.
     A cell is reached by one of four steps that consume
     (English, Russian) = (1, 0), (1, 2), (1, 1) or (0, 1) lines.

     Args:
         ru_lines: sequence of Russian sentences.
         eng_lines: sequence of English sentences.
         score_fn: optional callable
             ``score_fn(eng_part, ru_part, n_eng, n_ru) -> cost`` used to
             price a single step.  Defaults to the module-level ``score``.

     Returns:
         A tuple ``(backsteps_en, backsteps_ru)`` of integer arrays with
         shape ``(len(eng_lines) + 1, len(ru_lines) + 1)``.  Entry [i][j]
         is the number of English / Russian lines consumed by the cheapest
         final step into cell (i, j); entry [0][0] is 0.

     Side effects:
         Prints a progress counter once for every 50 English lines.
     """
     if score_fn is None:
         # Looked up at call time so the module-level score() stays the default.
         score_fn = score

     n_eng = len(eng_lines)
     n_ru = len(ru_lines)
     shape = (n_eng + 1, n_ru + 1)

     # A true infinity (rather than a large finite sentinel) guarantees that
     # every reachable cell receives a back-pointer.
     D = numpy.full(shape, numpy.inf, dtype=float)
     D[0][0] = 0.0
     # Zero-initialised so untouched cells (notably [0][0]) are deterministic.
     backsteps_en = numpy.zeros(shape, dtype=int)
     backsteps_ru = numpy.zeros(shape, dtype=int)

     # Order matters: on a tie the earliest candidate wins (strict "<" below).
     steps = [(1, 0), (1, 2), (1, 1), (0, 1)]

     for i in range(n_eng + 1):
         if i % 50 == 0:
             # Integer division keeps the progress counter an int.
             print(i // 50)

         for j in range(n_ru + 1):
             if i == 0 and j == 0:
                 continue
             for step_en, step_ru in steps:
                 prev_i = i - step_en
                 prev_j = j - step_ru
                 if prev_i < 0 or prev_j < 0:
                     continue
                 candidate = D[prev_i][prev_j] + score_fn(
                     eng_lines[prev_i:i], ru_lines[prev_j:j], step_en, step_ru)
                 if candidate < D[i][j]:
                     D[i][j] = candidate
                     backsteps_en[i][j] = step_en
                     backsteps_ru[i][j] = step_ru

     return (backsteps_en, backsteps_ru)

 # cumulative distribution function for the standard normal distribution
 def norm_d(x):
     """Return the standard normal cumulative distribution function at x."""
     root_two = 2.0 ** 0.5
     erf_term = erf(x / root_two)
     return (1.0 + erf_term) / 2.0


 # A priori probability that k English sentences are translated into l Russian sentences.
 def a_priory_probability(k, l):
     """Return the prior probability of pairing k English with l Russian sentences.

     Recognised (k, l) pairs are (1, 1), (1, 2), (1, 0) and (0, 1); any other
     pair yields None.
     """
     pair = (k, l)
     if pair == (1, 1):
         return prob_of_parallel_sentence
     if pair == (1, 2):
         return prob_of_translating_single_russian_sentence_as_two
     if pair in ((1, 0), (0, 1)):
         return prob_of_loosing_or_creating_sentence
     # Unrecognised pairs have no prior.
     return None

 # Scale factor used by score() to normalise the length difference; it is the
 # square root of 7.1.
 # NOTE(review): 7.1 is presumably an empirically estimated variance - confirm its source.
 sigma = 7.1 ** 0.5

 # Score function for one piece of the alignment:
 # -log Prob{the piece is good} = -(log Prob{|N(0,1)| > |z|} + log prior_prob{n, m}).
 # The larger the score, the less likely the alignment.
 def score(eng_part, ru_part, n, m):
     """Return the cost of aligning the sentences in eng_part with those in ru_part.

     Both parts are tokenised with word_tokenize and measured by the total
     number of characters in their tokens.  The cost is the negative log of
     the prior for an (n, m) step minus the log of the two-sided normal tail
     probability of the length statistic, which is capped at 3.

     Args:
         eng_part: iterable of English sentences in this piece.
         ru_part: iterable of Russian sentences in this piece.
         n: number of English sentences, passed to a_priory_probability.
         m: number of Russian sentences, passed to a_priory_probability.
     """
     eng_chars = sum(
         len(tok) for sentence in eng_part for tok in word_tokenize(sentence))
     ru_chars = sum(
         len(tok) for sentence in ru_part for tok in word_tokenize(sentence))

     if eng_chars > 0:
         # Test statistic derived from the CLT, clamped so the tail
         # probability below never reaches zero.
         expected_ru_chars = mean_number_of_symbols_eng_symbol_will_translate_to * eng_chars
         deviation = abs(ru_chars - expected_ru_chars)
         spread = (eng_chars ** 0.5) * sigma
         statics = min(deviation / spread, 3)
     else:
         # No English text: the length statistic contributes nothing.
         statics = 0

     prior_cost = - log(a_priory_probability(n, m))
     tail_probability = 1 - norm_d(statics) + norm_d(-statics)
     return prior_cost - log(tail_probability)




 def main():
     """Align the two input files and write the result to the output file.

     Reads ru_filename and eng_filename line by line, computes the
     back-pointer tables with getBackPointers, walks them from the last cell
     back to (0, 0) to recover the sequence of alignment steps, and writes
     one line per step in the form ``<english> ||| <russian>``.

     Raises:
         RuntimeError: if the traceback reaches a cell whose back-pointer is
             (0, 0), which would otherwise loop forever.
     """
     # Context managers close the files even if reading or writing fails.
     with open(ru_filename, "r") as ru_file:
         ru_lines = ru_file.readlines()
     with open(eng_filename, "r") as eng_file:
         eng_lines = eng_file.readlines()

     backpointers_en, backpointers_ru = getBackPointers(ru_lines, eng_lines)

     # Recover the alignment steps by walking back from the final cell.
     alignment = []
     i, j = len(eng_lines), len(ru_lines)
     while (i, j) != (0, 0):
         step = (int(backpointers_en[i][j]), int(backpointers_ru[i][j]))
         if step == (0, 0):
             raise RuntimeError(
                 "alignment traceback stalled at cell (%d, %d)" % (i, j))
         alignment.append(step)
         i, j = i - step[0], j - step[1]
     alignment.reverse()

     with open(output_filename, 'w') as output:
         eng_pos, ru_pos = 0, 0
         for eng_count, ru_count in alignment:
             # Join with a space so merged sentences are not glued together.
             eng_text = ' '.join(
                 s.rstrip('\r\n') for s in eng_lines[eng_pos:eng_pos + eng_count])
             ru_text = ' '.join(
                 s.rstrip('\r\n') for s in ru_lines[ru_pos:ru_pos + ru_count])
             output.write(eng_text + ' ||| ' + ru_text + '\n')
             eng_pos += eng_count
             ru_pos += ru_count

 # Script entry point: run the aligner when this file is executed directly.
 if __name__ == "__main__":
    main()
	from nltk.tokenize import word_tokenize
	from math import erf, log
	import sys
	import numpy


	# Command-line arguments, read at import time: the English input file,
	# the Russian input file, and the file the alignment is written to.
	eng_filename = sys.argv[1]
	ru_filename = sys.argv[2]
	output_filename = sys.argv[3]

	# Large finite number standing in for "infinite" cost.
	signal_inf = 1000000000
	# Expected number of Russian characters per English character; score()
	# multiplies the English character count by this ratio.
	mean_number_of_symbols_eng_symbol_will_translate_to = 0.96

	# Prior probabilities returned by a_priory_probability(k, l), where k counts
	# English sentences and l counts Russian sentences:
	#   (1, 1)           -> prob_of_parallel_sentence
	#   (1, 0) or (0, 1) -> prob_of_loosing_or_creating_sentence
	#   (1, 2)           -> prob_of_translating_single_russian_sentence_as_two
	# NOTE(review): despite its name, the last constant is used for one English
	# sentence paired with two Russian sentences.
	prob_of_parallel_sentence = 0.9
	prob_of_loosing_or_creating_sentence = 0.01
	prob_of_translating_single_russian_sentence_as_two = 0.09


	def getBackPointers(ru_lines, eng_lines, score_fn=None):
		"""Fill the alignment cost table and return its back-pointer tables.

		Cell (i, j) of the internal table holds the lowest cumulative cost of
		aligning the first i English lines with the first j Russian lines.
		A cell is reached by one of four steps that consume
		(English, Russian) = (1, 0), (1, 2), (1, 1) or (0, 1) lines.

		Args:
			ru_lines: sequence of Russian sentences.
			eng_lines: sequence of English sentences.
			score_fn: optional callable
				``score_fn(eng_part, ru_part, n_eng, n_ru) -> cost`` used to
				price a single step.  Defaults to the module-level ``score``.

		Returns:
			A tuple ``(backsteps_en, backsteps_ru)`` of integer arrays with
			shape ``(len(eng_lines) + 1, len(ru_lines) + 1)``.  Entry [i][j]
			is the number of English / Russian lines consumed by the cheapest
			final step into cell (i, j); entry [0][0] is 0.

		Side effects:
			Prints a progress counter once for every 50 English lines.
		"""
		if score_fn is None:
			# Looked up at call time so the module-level score() stays the default.
			score_fn = score

		n_eng = len(eng_lines)
		n_ru = len(ru_lines)
		shape = (n_eng + 1, n_ru + 1)

		# A true infinity (rather than a large finite sentinel) guarantees that
		# every reachable cell receives a back-pointer.
		D = numpy.full(shape, numpy.inf, dtype=float)
		D[0][0] = 0.0
		# Zero-initialised so untouched cells (notably [0][0]) are deterministic.
		backsteps_en = numpy.zeros(shape, dtype=int)
		backsteps_ru = numpy.zeros(shape, dtype=int)

		# Order matters: on a tie the earliest candidate wins (strict "<" below).
		steps = [(1, 0), (1, 2), (1, 1), (0, 1)]

		for i in range(n_eng + 1):
			if i % 50 == 0:
				# Integer division keeps the progress counter an int.
				print(i // 50)

			for j in range(n_ru + 1):
				if i == 0 and j == 0:
					continue
				for step_en, step_ru in steps:
					prev_i = i - step_en
					prev_j = j - step_ru
					if prev_i < 0 or prev_j < 0:
						continue
					candidate = D[prev_i][prev_j] + score_fn(
						eng_lines[prev_i:i], ru_lines[prev_j:j], step_en, step_ru)
					if candidate < D[i][j]:
						D[i][j] = candidate
						backsteps_en[i][j] = step_en
						backsteps_ru[i][j] = step_ru

		return (backsteps_en, backsteps_ru)

	# cumulative distribution function for the standard normal distribution
	def norm_d(x):
		"""Return the standard normal cumulative distribution function at x."""
		return (1.0 + erf(x / (2.0 ** 0.5))) / 2.0


	# A priori probability that k English sentences are translated into l Russian sentences.
	def a_priory_probability(k, l):
		"""Return the prior probability of pairing k English with l Russian sentences.

		Recognised (k, l) pairs are (1, 1), (1, 2), (1, 0) and (0, 1); any other
		pair yields None.
		"""
		if (k, l) == (1, 1):
			return prob_of_parallel_sentence
		if (k, l) == (1, 2):
			return prob_of_translating_single_russian_sentence_as_two
		if (k, l) == (1, 0) or (k, l) == (0, 1):
			return prob_of_loosing_or_creating_sentence
		# Unrecognised pairs have no prior.
		return None

	# Scale factor used by score() to normalise the length difference; it is the
	# square root of 7.1.
	# NOTE(review): 7.1 is presumably an empirically estimated variance - confirm its source.
	sigma = 7.1 ** 0.5

	# Score function for one piece of the alignment:
	# -log Prob{the piece is good} = -(log Prob{|N(0,1)| > |z|} + log prior_prob{n, m}).
	# The larger the score, the less likely the alignment.
	def score(eng_part, ru_part, n, m):
		"""Return the cost of aligning the sentences in eng_part with those in ru_part.

		Both parts are tokenised with word_tokenize and measured by the total
		number of characters in their tokens.  The cost is the negative log of
		the prior for an (n, m) step minus the log of the two-sided normal tail
		probability of the length statistic, which is capped at 3.

		Args:
			eng_part: iterable of English sentences in this piece.
			ru_part: iterable of Russian sentences in this piece.
			n: number of English sentences, passed to a_priory_probability.
			m: number of Russian sentences, passed to a_priory_probability.
		"""
		eng_part_words = []
		ru_part_words = []
		# "sentence" rather than "str" so the builtin is not shadowed.
		for sentence in eng_part:
			eng_part_words += word_tokenize(sentence)
		for sentence in ru_part:
			ru_part_words += word_tokenize(sentence)

		eng_sum_len = sum(len(token) for token in eng_part_words)
		ru_sum_len = sum(len(token) for token in ru_part_words)

		if eng_sum_len > 0:
			# statics is a test statistic derived from the CLT, clamped so the
			# tail probability below never reaches zero.
			statics = min(abs(ru_sum_len - mean_number_of_symbols_eng_symbol_will_translate_to * eng_sum_len) / ((eng_sum_len ** 0.5) * sigma), 3)
		else:
			# No English text: the length statistic contributes nothing.
			statics = 0
		return - log(a_priory_probability(n, m)) - log(1 - norm_d(statics) + norm_d(-statics))




	def main():
		"""Align the two input files and write the result to the output file.

		Reads ru_filename and eng_filename line by line, computes the
		back-pointer tables with getBackPointers, walks them from the last cell
		back to (0, 0) to recover the sequence of alignment steps, and writes
		one line per step in the form ``<english> ||| <russian>``.

		Raises:
			RuntimeError: if the traceback reaches a cell whose back-pointer is
				(0, 0), which would otherwise loop forever.
		"""
		# Context managers close the files even if reading or writing fails.
		with open(ru_filename, "r") as ru_file:
			ru_lines = ru_file.readlines()
		with open(eng_filename, "r") as eng_file:
			eng_lines = eng_file.readlines()

		backpointers_en, backpointers_ru = getBackPointers(ru_lines, eng_lines)

		# Recover the alignment steps by walking back from the final cell.
		alignment = []
		i, j = len(eng_lines), len(ru_lines)
		while (i, j) != (0, 0):
			step = (int(backpointers_en[i][j]), int(backpointers_ru[i][j]))
			if step == (0, 0):
				raise RuntimeError(
					"alignment traceback stalled at cell (%d, %d)" % (i, j))
			alignment.append(step)
			i, j = i - step[0], j - step[1]
		alignment.reverse()

		with open(output_filename, 'w') as output:
			eng_pos, ru_pos = 0, 0
			for eng_count, ru_count in alignment:
				# Join with a space so merged sentences are not glued together.
				eng_text = ' '.join(
					s.rstrip('\r\n') for s in eng_lines[eng_pos:eng_pos + eng_count])
				ru_text = ' '.join(
					s.rstrip('\r\n') for s in ru_lines[ru_pos:ru_pos + ru_count])
				# The separator is three plain pipes, without backslash escapes.
				output.write(eng_text + ' ||| ' + ru_text + '\n')
				eng_pos += eng_count
				ru_pos += ru_count

	# Script entry point: run the aligner when this file is executed directly.
	if __name__ == "__main__":
		main()