Skip to content

Instantly share code, notes, and snippets.

@tuxedocat
Created July 13, 2013 12:38
Show Gist options
  • Save tuxedocat/5990618 to your computer and use it in GitHub Desktop.
Save tuxedocat/5990618 to your computer and use it in GitHub Desktop.
from collections import defaultdict
def readfile(filepath):
    """
    Read an .e2f / .f2e file and split every line into its fields.

    Parameters
    ----------
    filepath: str
        path of the file to read

    Returns
    -------
    rows: list of lists
        one entry per line of the file, holding that line's
        whitespace-separated fields; for an .e2f file a row looks like
        ["english word", "f word", "probability"].
        A blank line produces an empty list.
    """
    rows = []
    with open(filepath, "r") as handle:
        for line in handle:
            rows.append(line.split())
    return rows
def get_vocabulary_from_txt(filepath):
    """
    Create the vocabulary list from a text file with one word per line.

    Parameters
    ----------
    filepath: str
        path of the vocabulary file

    Returns
    -------
    vocabulary: list
        the words in file order, e.g. ["get", "take", ...], with
        surrounding whitespace (including the line terminator) removed;
        blank and whitespace-only lines are skipped.
    """
    with open(filepath, "r") as f:
        # Strip before filtering: a word that keeps its trailing "\n" would
        # never compare equal to a field produced by str.split().
        stripped = (line.strip() for line in f)
        vocabulary = [w for w in stripped if w]
    return vocabulary
def make_parallel_dic(e2f, f2e, vocabulary):
    """
    Pivot the rows of e2f through the rows of f2e.

    Parameters
    ----------
    e2f: list of lists
        rows of [word, pivot word, probability]; a row is used only when
        its first field is in the vocabulary
    f2e: list of lists
        rows of [pivot word, word, probability]; a row is used only when
        its second field is in the vocabulary
    vocabulary: list
        words to keep (any iterable of hashable words is accepted)

    Returns
    -------
    pdic: dictionary of lists
        key: english word, value: list of (translated word, probability)
        tuples, where the probability is the product of the two
        probabilities along the pivot path. The same translated word can
        appear several times, once per pivot word. Words for which no
        pivot path exists get no entry.
    """
    # Build the set once: membership is tested for every input row.
    vocab = set(vocabulary)

    e2fd = defaultdict(list)
    for row in e2f:
        if not row:
            # blank input lines show up as empty rows
            continue
        if row[0] in vocab:
            e2fd[row[0]].append((row[1], float(row[2])))

    f2ed = defaultdict(list)
    for row in f2e:
        if not row:
            continue
        if row[1] in vocab:
            f2ed[row[0]].append((row[1], float(row[2])))

    pdic = defaultdict(list)
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
    for word, pivots in e2fd.items():
        for pivot, p_forward in pivots:
            # .get() so that a missing pivot does not create an empty entry
            for target, p_backward in f2ed.get(pivot, ()):
                pdic[word].append((target, p_backward * p_forward))
    return pdic
def expand(wt, w2wd):
    """
    Look up the word of `wt` in `w2wd` and rescale what is found.

    Parameters
    ----------
    wt: tuple
        (word, probability)
    w2wd: dictionary of lists
        key: word, value: list of (translated word, probability) tuples

    Returns
    -------
    res: list
        one (translated word, probability * wt[1]) tuple per entry stored
        under wt[0]; an empty list when wt[0] is missing or has no entries.
    """
    word, weight = wt[0], wt[1]
    # .get() never inserts a key, even when w2wd is a defaultdict
    candidates = w2wd.get(word)
    if not candidates:
        return []
    return [(c[0], c[1] * weight) for c in candidates]
def postprocess(pd, voc):
    """
    Restrict a candidate dictionary to the vocabulary and merge duplicates.

    Parameters
    ----------
    pd: dictionary of lists
        key: word, value: list of (candidate word, probability) tuples;
        a candidate may occur more than once
    voc: list
        words to keep (any iterable of hashable words is accepted)

    Returns
    -------
    pd2: dictionary of lists
        only the keys of `pd` that are in `voc`; each value is a list of
        (candidate word, summed probability) tuples limited to candidates
        in `voc`, sorted by probability in descending order.
    """
    # Build the set once: membership is tested for every key and candidate.
    allowed = set(voc)
    pd2 = defaultdict(list)
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
    for v, cl in pd.items():
        if v not in allowed:
            continue
        totals = defaultdict(float)
        for t in cl:
            if t[0] in allowed:
                totals[t[0]] += t[1]
        pd2[v] = sorted(totals.items(), key=lambda x: x[1], reverse=True)
    return pd2
def combine_mtcs(mtcs, lccs):
    """
    Merge two candidate dictionaries, listing the `lccs` candidates first.

    Parameters
    ----------
    mtcs: dictionary of lists
        key: word, value: list of (candidate word, score) tuples
    lccs: dictionary of lists
        key: word, value: list of (candidate word, score) tuples; the
        score may be anything float() accepts

    Returns
    -------
    ccs: dictionary of lists
        for every key of `lccs`, its candidates in their original order
        with the score replaced by 2 + float(score), followed by the
        `mtcs` candidates for that key whose word is not already listed,
        sorted by score in descending order. Keys that occur only in
        `mtcs` keep their candidate list as given (order and scores
        unchanged).
    """
    ccs = defaultdict(list)
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
    for k, ls in lccs.items():
        ccs[k] = [(t[0], 2 + float(t[1])) for t in ls]
    for k, ls in mtcs.items():
        if k not in ccs:
            # copy so that later edits of the result cannot alter `mtcs`
            ccs[k] = list(ls)
            continue
        # Collect the words already present once instead of once per candidate.
        present = set(tc[0] for tc in ccs[k])
        extra = [t for t in ls if t[0] not in present]
        ccs[k] += sorted(extra, key=lambda x: x[1], reverse=True)
    return ccs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment