Skip to content

Instantly share code, notes, and snippets.

@DarkDimius
Created April 4, 2013 21:44
Show Gist options
  • Save DarkDimius/5314650 to your computer and use it in GitHub Desktop.
Save DarkDimius/5314650 to your computer and use it in GitHub Desktop.
from nltk.tokenize import word_tokenize
from math import erf, log
import sys
import numpy
# Command-line arguments, read at import time (an IndexError is raised when
# fewer than three are given): English input file, Russian input file and the
# output file. All three are opened in main().
eng_filename = sys.argv[1]
ru_filename = sys.argv[2]
output_filename = sys.argv[3]
# Initial cost assigned to every non-origin cell of the dynamic-programming
# table in getBackPointers(); a candidate path only replaces it when its cost
# is strictly smaller.
signal_inf = 1000000000
# Factor applied to the English character count in score() to obtain the
# expected Russian character count.
mean_number_of_symbols_eng_symbol_will_translate_to = 0.96
# Prior probabilities returned by a_priory_probability(k, l), where k is the
# number of English and l the number of Russian sentences in one step:
#   (1, 1)           -> prob_of_parallel_sentence
#   (1, 0) or (0, 1) -> prob_of_loosing_or_creating_sentence
#   (1, 2)           -> prob_of_translating_single_russian_sentence_as_two
# NOTE(review): 0.9 + 0.09 + 0.01 + 0.01 = 1.01, so these do not form a
# normalised distribution -- confirm whether that is intended.
# NOTE(review): the (1, 2) case is one English sentence paired with two
# Russian ones, so the last constant's name looks inverted -- confirm.
prob_of_parallel_sentence = 0.9
prob_of_loosing_or_creating_sentence = 0.01
prob_of_translating_single_russian_sentence_as_two = 0.09
def getBackPointers(ru_lines, eng_lines):
    """Fill the alignment cost table and return the best last step per cell.

    Cell (i, j) of the internal cost table holds the smallest accumulated
    score() of aligning the first i English lines with the first j Russian
    lines. Each cell is reached from a predecessor by one of four steps,
    written as (english sentences consumed, russian sentences consumed):
    (1, 0), (1, 2), (1, 1) or (0, 1).

    Args:
        ru_lines: sequence of Russian sentences, one string per sentence.
        eng_lines: sequence of English sentences, one string per sentence.

    Returns:
        A tuple (backsteps_en, backsteps_ru) of integer arrays with shape
        (len(eng_lines) + 1, len(ru_lines) + 1). Entry [i][j] is the number
        of English / Russian sentences consumed by the last step of the
        cheapest path into cell (i, j). The origin cell [0][0] is (0, 0).

    Side effects:
        Prints a progress counter once every 50 English lines.
    """
    n_eng = len(eng_lines)
    n_ru = len(ru_lines)
    table_shape = (n_eng + 1, n_ru + 1)

    D = numpy.zeros(table_shape, dtype=float)
    # numpy.zeros instead of a bare numpy.ndarray: the latter leaves the
    # memory uninitialised, so cells that are never written (the origin in
    # particular) would otherwise contain arbitrary integers.
    backsteps_en = numpy.zeros(table_shape, dtype=int)
    backsteps_ru = numpy.zeros(table_shape, dtype=int)

    # The order matters: ties are resolved in favour of the earlier step
    # because a candidate must be strictly cheaper to replace the best one.
    steps = ((1, 0), (1, 2), (1, 1), (0, 1))

    for i in range(n_eng + 1):
        if i % 50 == 0:
            # Floor division keeps the counter an integer on Python 3 too.
            print(i // 50)
        for j in range(n_ru + 1):
            if i == 0 and j == 0:
                # The origin costs nothing and has no predecessor.
                continue
            best = signal_inf
            for d_en, d_ru in steps:
                prev_i = i - d_en
                prev_j = j - d_ru
                if prev_i < 0 or prev_j < 0:
                    continue  # this step would start outside the table
                candidate = D[prev_i, prev_j] + score(
                    eng_lines[prev_i:i], ru_lines[prev_j:j], d_en, d_ru)
                if candidate < best:
                    best = candidate
                    backsteps_en[i, j] = d_en
                    backsteps_ru[i, j] = d_ru
            D[i, j] = best
    return (backsteps_en, backsteps_ru)
# standard normal CDF, written in terms of the error function
def norm_d(x):
    """Return P(Z <= x) for a standard normal random variable Z."""
    root_two = 2.0 ** 0.5
    erf_term = erf(x / root_two)
    return (1.0 + erf_term) / 2.0
# a priori probability of translating k English sentences into l Russian ones
def a_priory_probability(k, l):
    """Return the prior probability of one alignment step.

    Args:
        k: number of English sentences consumed by the step.
        l: number of Russian sentences consumed by the step.

    Returns:
        The module-level prior configured for the (k, l) pattern.

    Raises:
        ValueError: if (k, l) is not one of the supported patterns (1, 1),
            (1, 2), (1, 0) or (0, 1). Previously the function fell through
            and returned None, which only failed later inside log().
    """
    pattern = (k, l)
    if pattern == (1, 1):
        return prob_of_parallel_sentence
    if pattern == (1, 2):
        return prob_of_translating_single_russian_sentence_as_two
    if pattern == (1, 0) or pattern == (0, 1):
        return prob_of_loosing_or_creating_sentence
    raise ValueError("unsupported alignment pattern: (%r, %r)" % (k, l))
# Scale factor used by score(): the length difference is divided by
# sqrt(english length) * sigma to form the test statistic.
# NOTE(review): 7.1 is presumably an empirically estimated variance of the
# Russian/English length ratio -- confirm where the value comes from.
sigma = 7.1 ** 0.5
# Cost of one piece of an alignment:
# -log Prob{the piece is good} = -(log Prob{|N(0,1)| > |z|} + log prior_prob{n, m})
# A larger value means a less likely alignment.
def score(eng_part, ru_part, n, m):
    """Return the cost of aligning eng_part with ru_part.

    Args:
        eng_part: list of English sentences taking part in the step.
        ru_part: list of Russian sentences taking part in the step.
        n: number of English sentences, passed to a_priory_probability().
        m: number of Russian sentences, passed to a_priory_probability().

    Returns:
        The negative log of the prior for (n, m) minus the log of the
        two-sided normal tail probability of the length statistic.
    """
    # Lengths are measured in characters of the tokens, so whitespace between
    # tokens does not count.
    eng_sum_len = sum(
        len(tok) for sentence in eng_part for tok in word_tokenize(sentence))
    ru_sum_len = sum(
        len(tok) for sentence in ru_part for tok in word_tokenize(sentence))

    if eng_sum_len > 0:
        # Test statistic derived from the CLT, capped at 3 so that the tail
        # probability below never reaches zero.
        deviation = abs(
            ru_sum_len
            - mean_number_of_symbols_eng_symbol_will_translate_to * eng_sum_len)
        spread = (eng_sum_len ** 0.5) * sigma
        statics = min(deviation / spread, 3)
    else:
        # No English text: the statistic is undefined, use a neutral value.
        statics = 0

    prior_cost = -log(a_priory_probability(n, m))
    tail_probability = 1 - norm_d(statics) + norm_d(-statics)
    return prior_cost - log(tail_probability)
def main():
    """Align the two input files and write the alignment to the output file.

    Reads the sentences from the files named by the module-level
    ``ru_filename`` and ``eng_filename`` (one sentence per line), computes
    the back-pointer tables with getBackPointers(), walks them from the last
    cell back to the origin and writes one line per alignment step to
    ``output_filename`` in the form ``ENGLISH ||| RUSSIAN``.

    Raises:
        RuntimeError: if a cell other than the origin has no recorded step,
            which would otherwise make the back-trace loop forever.
    """
    # Context managers make sure the handles are closed even on errors.
    with open(ru_filename, "r") as ru_file:
        ru_lines = ru_file.readlines()
    with open(eng_filename, "r") as eng_file:
        eng_lines = eng_file.readlines()

    (backpointers_en, backpointers_ru) = getBackPointers(ru_lines, eng_lines)

    # Obtain the alignment steps by walking back from the final cell.
    alignment = []
    (i, j) = (len(eng_lines), len(ru_lines))
    while (i, j) != (0, 0):
        step_en = int(backpointers_en[i][j])
        step_ru = int(backpointers_ru[i][j])
        if step_en == 0 and step_ru == 0:
            raise RuntimeError(
                "no alignment step recorded for cell (%d, %d)" % (i, j))
        alignment.append((step_en, step_ru))
        (i, j) = (i - step_en, j - step_ru)
    alignment.reverse()

    # The walk above ends at (0, 0), so i and j now serve as forward cursors
    # into eng_lines and ru_lines.
    with open(output_filename, 'w') as output:
        for (step_en, step_ru) in alignment:
            # Sentences merged by a single step are separated by a space so
            # that the words at the boundary do not run together.
            eng_text = ' '.join(
                s.rstrip('\r\n') for s in eng_lines[i:(i + step_en)])
            ru_text = ' '.join(
                s.rstrip('\r\n') for s in ru_lines[j:(j + step_ru)])
            output.write(eng_text + ' ||| ' + ru_text + '\n')
            i += step_en
            j += step_ru


# Run the aligner only when the file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment