Created
July 13, 2013 12:38
-
-
Save tuxedocat/5990618 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
def readfile(filepath):
    """
    Read a whitespace-delimited lexicon file (.e2f / .f2e).

    parameters
    ----------
    filepath: str
        path of the file to read

    returns
    -------
    rows: list of lists
        one entry per line of the file, holding that line's
        whitespace-separated fields (a blank line yields an empty list);
        for an .e2f file: ["english word", "f word", "probability"]
    """
    rows = []
    with open(filepath, "r") as handle:
        # Iterating the handle yields the same lines as readlines().
        for line in handle:
            rows.append(line.split())
    return rows
def get_vocabulary_from_txt(filepath):
    """
    Build the vocabulary list from a text file with one word per line.

    Surrounding whitespace (including the trailing newline) is stripped
    from every line, and lines that are empty after stripping are dropped,
    so the returned entries compare equal to plain tokens.

    parameters
    ----------
    filepath: str
        path of the vocabulary file

    returns
    -------
    vocabulary: list
        ["get", "take", ...]
    """
    with open(filepath, "r") as f:
        # Iterate the file object itself; the original referenced
        # `f.readlines` without calling it, which is not iterable.
        stripped = (line.strip() for line in f)
        vocabulary = [word for word in stripped if word]
    return vocabulary
def _index_translations(rows, vocabulary, vocab_column):
    """Group (row[1], float(row[2])) pairs by row[0], keeping only rows whose
    `vocab_column` field is in `vocabulary`; empty rows are skipped."""
    index = defaultdict(list)
    for row in rows:
        if not row:
            # Blank input lines split into an empty list; nothing to index.
            continue
        if row[vocab_column] in vocabulary:
            index[row[0]].append((row[1], float(row[2])))
    return index


def make_parallel_dic(e2f, f2e, vocabulary):
    """
    Pivot english words through the e2f and f2e tables.

    For every e2f row whose english word is in the vocabulary, each of its
    f words is looked up in the f2e table (restricted to rows whose second
    field is in the vocabulary). Every word reached this way is recorded
    together with the product of the two probabilities. Scores for a word
    reached through several f words are NOT merged here; each path adds
    its own tuple.

    parameters
    ----------
    e2f: list of lists
        rows of [english word, f word, probability]
    f2e: list of lists
        rows of [f word, translated word, probability]
    vocabulary: list
        words to keep (any iterable of hashable words works)

    returns
    -------
    pdic: dictionary of lists
        key: english word, value: list of (translated word, probability)
        tuples; words with no round-trip translation get no key
    """
    # Set membership is O(1); the original scanned the list for every row.
    vocab = set(vocabulary)
    e2fd = _index_translations(e2f, vocab, 0)
    f2ed = _index_translations(f2e, vocab, 1)

    pdic = defaultdict(list)
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for word, forward in e2fd.items():
        for f_word, e2f_prob in forward:
            # .get() avoids inserting empty entries into the defaultdict.
            for back_word, f2e_prob in f2ed.get(f_word, ()):
                pdic[word].append((back_word, f2e_prob * e2f_prob))
    return pdic
def expand(wt, w2wd):
    """
    Follow one weighted word through a translation table.

    parameters
    ----------
    wt: tuple
        (word, weight)
    w2wd: dictionary of lists
        key: word, value: list of (translated word, probability) tuples

    returns
    -------
    res: list
        (translated word, probability * weight) for every entry stored
        under wt[0]; empty when wt[0] is absent or has no entries
    """
    source = wt[0]
    # Membership test (not indexing) so a defaultdict gains no new key.
    if source not in w2wd:
        return []
    candidates = w2wd[source]
    if not candidates:
        return []
    return [(cand[0], cand[1] * wt[1]) for cand in candidates]
def postprocess(pd, voc):
    """
    Merge duplicate candidates and rank them for each in-vocabulary word.

    parameters
    ----------
    pd: dictionary of lists
        key: word, value: list of (candidate word, score) tuples
    voc: list
        vocabulary; keys and candidates outside it are dropped
        (any iterable of hashable words works)

    returns
    -------
    pd2: dictionary of lists
        key: word in voc, value: list of (candidate word, summed score)
        tuples sorted by score, highest first
    """
    # Set membership is O(1); the original scanned voc for every candidate.
    vocab = set(voc)
    pd2 = defaultdict(list)
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for word, candidates in pd.items():
        if word not in vocab:
            continue
        totals = defaultdict(float)
        for cand in candidates:
            if cand[0] in vocab:
                # The same candidate may appear several times; add scores up.
                totals[cand[0]] += cand[1]
        pd2[word] = sorted(totals.items(), key=lambda x: x[1], reverse=True)
    return pd2
def combine_mtcs(mtcs, lccs, lccs_bonus=2):
    """
    Combine two candidate dictionaries, giving `lccs` entries priority.

    Every `lccs` candidate is kept with `lccs_bonus` added to its score.
    NOTE(review): presumably the bonus ranks lccs candidates above mtcs
    ones, assuming mtcs scores are probabilities <= 1 -- confirm.
    `mtcs` candidates are then appended, skipping any word already listed
    for that key by `lccs`.

    parameters
    ----------
    mtcs: dictionary of lists
        key: word, value: list of (candidate word, score) tuples
    lccs: dictionary of lists
        key: word, value: list of (candidate word, score) tuples;
        scores may be anything float() accepts
    lccs_bonus: number
        constant added to every lccs score (default 2)

    returns
    -------
    ccs: dictionary of lists
        key: word, value: list of (candidate word, score) tuples; for keys
        present in lccs, the lccs candidates come first (in their given
        order) followed by the new mtcs candidates sorted by score,
        highest first; for keys only in mtcs, a copy of the mtcs list
    """
    ccs = defaultdict(list)
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for key, candidates in lccs.items():
        ccs[key] = [(t[0], lccs_bonus + float(t[1])) for t in candidates]
    for key, candidates in mtcs.items():
        if key not in ccs:
            # Copy so the result never shares a list with the caller's input.
            ccs[key] = list(candidates)
            continue
        # Build the set of known words once instead of once per candidate.
        known = {t[0] for t in ccs[key]}
        fresh = [t for t in candidates if t[0] not in known]
        ccs[key] += sorted(fresh, key=lambda x: x[1], reverse=True)
    return ccs
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment