bhaskara · December 14, 2015 03:39
diff --git a/rewriting.py b/rewriting.py
 from collections import defaultdict



 def best_rewrites(q, n=10):
  """Return N best rewrites of query Q.
  
  Q: string of space-separated words
  
  Also depends on the two dictionaries frequencies and substitutions."""
  l = [(frequencies.get(r, 0), r) for r in joint_rewrites(q)]
  return sorted(l, reverse=True)[:n]


 def joint_rewrites(q, max_num_rewrites=2):
  """Given a query, return all ways to rewrite it using at most two substitutions.
  
  Uses the global dictionary substitutions.  Will only search over non-overlapping 
  substitutions."""
  if isinstance(q, str):
    q = tuple(q.split())
  yield q
  candidates = single_rewrites(q)
  for i, l in candidates.iteritems():
    for n, p in l:
      yield replace(q, [(i, n, p)])
      for i2 in range(i+n, len(q)):
        for n2, p2 in candidates[i2]:
          yield replace(q, [(i, n, p), (i2, n2, p2)])


 def replace(q, subs):
  """Each sub must be of the form (I, N, P), meaning replace words I, ... I+N-1
  with phrase P.  Subs must be nonoverlapping and ordered by I."""
  out = list(q)
  d = 0
  for i, n, p in subs:
    phrase = p.split()
    out[i+d:i+n+d] = phrase
    d += len(phrase)-n
  return tuple(out)


 def single_rewrites(q):
  """Return dictionary mapping each integer I to a list of tuples of the form
  (N, P).  Each such tuple means that words I, ..., I+N-1 can be replaced by the
  phrase P."""
  rewrites = defaultdict(list)
  n = len(q)
  if n>2:
    full_rewrites = substitutions.get(q, [])
    for sub in full_rewrites:
      rewrites[0].append((n, sub))

  for l in range(1, 3):
    for i in range(n+1-l):
      subs = substitutions.get(tuple(q[i:i+l]), [])
      for sub in subs:
        rewrites[i].append((l, sub))

  return rewrites
  


 ############################################################
 # Test data
 ############################################################

 frequencies = {
  ('data', 'science', 'jobs'): 100,
  ('data', 'analyst', 'positions'): 200,
  ('information', 'science', 'jobs'): 10,
  ('interpretive', 'dance'): 44,
  ('data', 'science', 'positions'): 25,
  ('information', 'technology', 'jobs'): 15,
  ('information', 'technology', 'vacancies'): 33,
  }

 substitutions = {
  ('data', 'science'): ['data mining', 'information'],
  ('data',): ['information'],
  ('science',): ['technology', 'research and development'],
  ('jobs',): ['positions', 'vacancies'],
  ('temporary',): ['temp', 'contract']
  }
	from collections import defaultdict



	def best_rewrites(q, n=10):
	"""Return N best rewrites of query Q.

	Q: string of space-separated words

	Also depends on the two dictionaries frequencies and substitutions."""
	l = [(frequencies.get(r, 0), r) for r in joint_rewrites(q)]
	return sorted(l, reverse=True)[:n]


	def joint_rewrites(q, max_num_rewrites=2):
	"""Given a query, return all ways to rewrite it using at most two substitutions.

	Uses the global dictionary substitutions. Will only search over non-overlapping
	substitutions."""
	if isinstance(q, str):
	q = tuple(q.split())
	yield q
	candidates = single_rewrites(q)
	for i, l in candidates.iteritems():
	for n, p in l:
	yield replace(q, [(i, n, p)])
	for i2 in range(i+n, len(q)):
	for n2, p2 in candidates[i2]:
	yield replace(q, [(i, n, p), (i2, n2, p2)])


	def replace(q, subs):
	"""Each sub must be of the form (I, N, P), meaning replace words I, ... I+N-1
	with phrase P. Subs must be nonoverlapping and ordered by I."""
	out = list(q)
	d = 0
	for i, n, p in subs:
	phrase = p.split()
	out[i+d:i+n+d] = phrase
	d += len(phrase)-n
	return tuple(out)


	def single_rewrites(q):
	"""Return dictionary mapping each integer I to a list of tuples of the form
	(N, P). Each such tuple means that words I, ..., I+N-1 can be replaced by the
	phrase P."""
	rewrites = defaultdict(list)
	n = len(q)
	if n>2:
	full_rewrites = substitutions.get(q, [])
	for sub in full_rewrites:
	rewrites[0].append((n, sub))

	for l in range(1, 3):
	for i in range(n+1-l):
	subs = substitutions.get(tuple(q[i:i+l]), [])
	for sub in subs:
	rewrites[i].append((l, sub))

	return rewrites



	############################################################
	# Test data
	############################################################

	frequencies = {
	('data', 'science', 'jobs'): 100,
	('data', 'analyst', 'positions'): 200,
	('information', 'science', 'jobs'): 10,
	('interpretive', 'dance'): 44,
	('data', 'science', 'positions'): 25,
	('information', 'technology', 'jobs'): 15,
	('information', 'technology', 'vacancies'): 33,
	}

	substitutions = {
	('data', 'science'): ['data mining', 'information'],
	('data',): ['information'],
	('science',): ['technology', 'research and development'],
	('jobs',): ['positions', 'vacancies'],
	('temporary',): ['temp', 'contract']
	}