Last active
December 14, 2015 03:39
-
-
Save bhaskara/5022706 to your computer and use it in GitHub Desktop.
rewriting.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
def best_rewrites(q, n=10): | |
"""Return N best rewrites of query Q. | |
Q: string of space-separated words | |
Also depends on the two dictionaries frequencies and substitutions.""" | |
l = [(frequencies.get(r, 0), r) for r in joint_rewrites(q)] | |
return sorted(l, reverse=True)[:n] | |
def joint_rewrites(q, max_num_rewrites=2): | |
"""Given a query, return all ways to rewrite it using at most two substitutions. | |
Uses the global dictionary substitutions. Will only search over non-overlapping | |
substitutions.""" | |
if isinstance(q, str): | |
q = tuple(q.split()) | |
yield q | |
candidates = single_rewrites(q) | |
for i, l in candidates.iteritems(): | |
for n, p in l: | |
yield replace(q, [(i, n, p)]) | |
for i2 in range(i+n, len(q)): | |
for n2, p2 in candidates[i2]: | |
yield replace(q, [(i, n, p), (i2, n2, p2)]) | |
def replace(q, subs): | |
"""Each sub must be of the form (I, N, P), meaning replace words I, ... I+N-1 | |
with phrase P. Subs must be nonoverlapping and ordered by I.""" | |
out = list(q) | |
d = 0 | |
for i, n, p in subs: | |
phrase = p.split() | |
out[i+d:i+n+d] = phrase | |
d += len(phrase)-n | |
return tuple(out) | |
def single_rewrites(q): | |
"""Return dictionary mapping each integer I to a list of tuples of the form | |
(N, P). Each such tuple means that words I, ..., I+N-1 can be replaced by the | |
phrase P.""" | |
rewrites = defaultdict(list) | |
n = len(q) | |
if n>2: | |
full_rewrites = substitutions.get(q, []) | |
for sub in full_rewrites: | |
rewrites[0].append((n, sub)) | |
for l in range(1, 3): | |
for i in range(n+1-l): | |
subs = substitutions.get(tuple(q[i:i+l]), []) | |
for sub in subs: | |
rewrites[i].append((l, sub)) | |
return rewrites | |
############################################################ | |
# Test data | |
############################################################ | |
frequencies = { | |
('data', 'science', 'jobs'): 100, | |
('data', 'analyst', 'positions'): 200, | |
('information', 'science', 'jobs'): 10, | |
('interpretive', 'dance'): 44, | |
('data', 'science', 'positions'): 25, | |
('information', 'technology', 'jobs'): 15, | |
('information', 'technology', 'vacancies'): 33, | |
} | |
substitutions = { | |
('data', 'science'): ['data mining', 'information'], | |
('data',): ['information'], | |
('science',): ['technology', 'research and development'], | |
('jobs',): ['positions', 'vacancies'], | |
('temporary',): ['temp', 'contract'] | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment