# Not-so-minimal anymore (check the early commits) beam search example
# Author: Kyle Kastner
# License: BSD 3-Clause
# See core implementations here
# Also includes a reduction of the post by Yoav Goldberg to a script
# These datasets can be a lot of fun...
# python <this_script>.py 2600_phrases_for_effective_performance_reviews.txt -o 5 -d 0
# Download kjv.txt from
# python <this_script>.py kjv.txt -o 5 -d 2 -r 2145
# Snippet:
# Queen ording found Raguel: I kill.
# NUN.
import numpy as np
import heapq
from collections import defaultdict, Counter
import collections
import os
import sys
import argparse
import cPickle as pickle
import time
from itertools import izip
class Beam(object):
    """Bounded min-heap of candidate prefixes used by the beam search.

    Every entry is a tuple ``(score, complete, prob, prefix)``.

    For comparison of prefixes, the tuple (prefix_probability, complete_sentence) is used.
    This is so that if two prefixes have equal probabilities then a complete sentence is preferred
    over an incomplete one since (0.5, False, whatever_prefix) < (0.5, True, some_other_prefix)
    """

    def __init__(self, beam_width, init_beam=None, use_log=True,
                 stochastic=False, temperature=1.0, random_state=None):
        """Create a beam holding at most ``beam_width`` entries.

        beam_width   -- maximum number of entries kept after each ``add``
        init_beam    -- optional iterable of ``(score, complete, prob, prefix)``
                        tuples used as the initial content
        use_log      -- stored but currently unused
        stochastic   -- if True, overflow is resolved by random eviction
        temperature  -- softmax temperature for the stochastic eviction
        random_state -- object providing ``multinomial`` (for example
                        ``np.random.RandomState``); needed when stochastic
        """
        if init_beam is None:
            self.heap = list()
        else:
            # Copy so the caller's list is never reordered, then make sure
            # the heap invariant holds whatever order was passed in.
            self.heap = list(init_beam)
            heapq.heapify(self.heap)
        self.stochastic = stochastic
        self.random_state = random_state
        self.temperature = temperature
        # use_log currently unused...
        self.use_log = use_log
        self.beam_width = beam_width

    def add(self, score, complete, prob, prefix):
        """Insert an entry, then evict entries until the width is respected."""
        heapq.heappush(self.heap, (score, complete, prob, prefix))
        while len(self.heap) > self.beam_width:
            if self.stochastic:
                completed = [n for n, h in enumerate(self.heap) if h[1]]
                if len(completed) == len(self.heap):
                    # Nothing but completed sentences is left: the random
                    # draw below could never evict anything, so drop the
                    # lowest score instead of retrying forever.
                    heapq.heappop(self.heap)
                    continue
                # same whether logspace or no?
                probs = np.array([h[0] for h in self.heap])
                probs = probs / self.temperature
                e_x = np.exp(probs - np.max(probs))
                s_x = e_x / e_x.sum()
                # invert the softmax so low scores are the likely evictions
                is_x = 1. - s_x
                is_x = is_x / is_x.sum()
                to_remove = self.random_state.multinomial(1, is_x).argmax()
                # Don't remove completed sentences by randomness; when one
                # is drawn nothing is evicted and the loop draws again.
                if to_remove not in completed:
                    # there must be a faster way...
                    self.heap.pop(to_remove)
                    heapq.heapify(self.heap)
            else:
                # remove lowest score from heap
                heapq.heappop(self.heap)

    def __iter__(self):
        """Iterate over the entries in heap order (not sorted order)."""
        return iter(self.heap)
def beamsearch(probabilities_function, beam_width=10, clip_len=-1,
               start_token="<START>", end_token="<EOS>", use_log=True,
               renormalize=True, length_score=True,
               stochastic=False, temperature=1.0,
               random_state=None, eps=1E-9, diversity_score=True):
    """
    returns a generator, which will yield beamsearched sequences in order of their probability

    Each yielded item is a tuple (tokens, score, prob): the sequence without
    the start token(s), the beam score and the accumulated (log-)probability.

    "probabilities_function" returns a list of (next_prob, next_word) pairs given a prefix.

    "beam_width" is the number of prefixes to keep (so that instead of keeping the top 10 prefixes you can keep the top 100 for example).
    By making the beam search bigger you can get closer to the actual most probable sentence but it would also take longer to process.

    "clip_len" is a maximum length to tolerate, beyond which the most probable prefix is returned as an incomplete sentence.
    Without a maximum length, a faulty probabilities function which does not return a highly probable end token
    will lead to an infinite loop or excessively long garbage sentences.
    The length is counted without the start token(s); the default of -1 never clips.

    "start_token" can be a single string (token), or a sequence of tokens

    "end_token" is a single string (token), or a sequence of tokens that signifies end of the sequence
    A single end token is stripped from completed results, an end sequence is kept.

    "use_log, renormalize, length_score" are all related to calculation of beams to keep
    and should improve results when True

    "diversity_score" adjusts each beam's score by how many new tokens its prefix
    contributes relative to the beams before it, minus its repeated tokens.
    It is a trailing keyword so the positional order of the other arguments is unchanged.

    "stochastic" uses a different sampling algorithm for reducing/aggregating beams
    it should result in more diverse and interesting outputs

    "temperature" is the softmax temperature for the underlying stochastic
    beamsearch - the default of 1.0 is usually fine

    "random_state" is a np.random.RandomState() object, passed when using the
    stochastic beamsearch in order to control randomness

    "eps" minimum probability for log-space calculations, to avoid numerical issues

    Raises ValueError if stochastic=True and no random_state is given.
    """
    if stochastic and random_state is None:
        raise ValueError("Must pass np.random.RandomState() object if stochastic=True")

    # collections.Sequence was removed in Python 3.10; prefer the abc module.
    try:
        from collections.abc import Sequence
    except ImportError:
        from collections import Sequence
    # Bind under a different local name: assigning to ``basestring`` inside
    # this function would make it a local and the lookup would always fail.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    def _as_token_list(token):
        # Return (list_of_tokens, was_a_sequence) for a start/end token.
        is_seq = isinstance(token, Sequence) and not isinstance(token, string_types)
        if is_seq:
            return list(token), True
        # make it a list with 1 entry
        return [token], False

    start_token, _ = _as_token_list(start_token)
    end_token, end_token_is_seq = _as_token_list(end_token)
    n_start = len(start_token)
    n_end = len(end_token)

    completed_beams = 0
    prev_beam = Beam(beam_width - completed_beams, None, use_log, stochastic,
                     temperature, random_state)
    if use_log:
        prev_beam.add(.0, False, .0, start_token)
    else:
        prev_beam.add(1.0, False, 1.0, start_token)

    while True:
        curr_beam = Beam(beam_width - completed_beams, None, use_log, stochastic,
                         temperature, random_state)
        if renormalize:
            sorted_prev_beam = sorted(prev_beam)
            # renormalize by the previous minimum value in the beam
            min_prob = sorted_prev_beam[0][0]
        elif use_log:
            min_prob = 0.
        else:
            min_prob = 1.

        if diversity_score:
            # get prefixes (same iteration order as the enumerate below)
            pre = [r[-1][n_start:] for r in prev_beam]
            base = set(pre[0])
            # score for first entry: the neutral element of the score update
            # NOTE(review): first-entry value and the linear-space mapping
            # below were reconstructed - confirm against the intended formula
            diversity_scores = [0. if use_log else 1.]
            for pre_i in pre[1:]:
                s = set(pre_i)
                union = base | s
                # number of new things + (- number of repetitions)
                sc = (len(union) - len(base)) - (len(pre_i) - len(s))
                # update it
                base = union
                if use_log:
                    diversity_scores.append(sc)
                else:
                    diversity_scores.append(np.exp(sc))

        # Add complete sentences that do not yet have the best probability to the current beam, the rest prepare to add more words to them.
        for ni, (prefix_score, complete, prefix_prob, prefix) in enumerate(prev_beam):
            if complete:
                curr_beam.add(prefix_score, True, prefix_prob, prefix)
                continue
            # Get probability of each possible next word for the incomplete prefix
            for (next_prob, next_word) in probabilities_function(prefix):
                # use eps tolerance to avoid log(0.) issues
                n = next_prob if next_prob > eps else eps

                # score is renormalized prob
                if use_log:
                    log_n = np.log(n)
                    score = prefix_prob + log_n - min_prob
                    if length_score:
                        score = score + np.log(len(prefix))
                    if diversity_score:
                        score = score + diversity_scores[ni]
                    prob = prefix_prob + log_n
                else:
                    if length_score:
                        score = (prefix_prob * n * len(prefix)) / min_prob
                    else:
                        score = (prefix_prob * n) / min_prob
                    if diversity_score:
                        score = score * diversity_scores[ni]
                    prob = prefix_prob * n

                candidate = prefix + [next_word]
                # Compare the tail of the candidate with the end token list;
                # this covers both a single end token and an end sequence.
                is_complete = candidate[-n_end:] == end_token
                # If next word is the end token then mark prefix as complete
                curr_beam.add(score, is_complete, prob, candidate)

        # Get all prefixes in beam sorted by probability
        sorted_beam = sorted(curr_beam)

        any_removals = False
        while sorted_beam:
            # Get highest probability prefix - heapq is sorted in ascending order
            (best_score, best_complete, best_prob, best_prefix) = sorted_beam[-1]
            clipped = (len(best_prefix) - n_start) == clip_len
            if not (best_complete or clipped):
                break
            # If most probable prefix is a complete sentence or has a length that
            # exceeds the clip length (ignoring the start token) then return it
            # yield best without start token, along with probability
            if best_complete and not end_token_is_seq:
                # strip the single end token
                stop = -1
            else:
                # keep an end sequence, and never drop a real token from a
                # clipped, incomplete prefix
                stop = None
            yield (best_prefix[n_start:stop], best_score, best_prob)
            sorted_beam.pop()
            completed_beams += 1
            any_removals = True

        # If there are no more sentences in the beam then stop checking
        if not sorted_beam:
            break
        if any_removals:
            prev_beam = Beam(beam_width - completed_beams, sorted_beam, use_log,
                             stochastic, temperature, random_state)
        else:
            prev_beam = curr_beam
# Reduce memory on python 2: make range lazy there, as it already is on
# Python 3. The assignment must stay inside the version check because
# xrange does not exist on Python 3.
if sys.version_info < (3, 0):
    range = xrange
def train_char_lm(fname, order=4, temperature=1.0):
    """Train a character-level Markov model from the text file ``fname``.

    The text is left-padded with ``order`` "~" characters so the first real
    characters also have a full history.

    fname       -- path of the training text
    order       -- number of history characters used as context
    temperature -- softmax temperature applied to the relative frequencies

    Returns a dict mapping each ``order``-character history to a list of
    ``(probability, next_char)`` pairs.
    """
    # ``with`` closes the handle; open() works on Python 2 and 3
    with open(fname) as f:
        data = f.read()
    lm = defaultdict(Counter)
    pad = "~" * order
    data = pad + data
    for i in range(len(data) - order):
        history, char = data[i:i + order], data[i + order]
        lm[history][char] += 1

    def normalize(counter):
        # Softmax, with temperature, over the relative frequencies.
        chars = list(counter.keys())
        counts = list(counter.values())
        total = float(sum(counts))
        # 0 to 1 to help numerical issues, then scale by the temperature
        scaled = [(c / total) / float(temperature) for c in counts]
        mx = max(scaled)
        # log sum exp
        log_z = mx + np.log(sum(np.exp(v - mx) for v in scaled))
        # Calculate softmax in a hopefully more stable way
        # with pi = xi / t and log_z = log sum exp ^ (xi / t)
        # log s(xi) = pi - log_z
        # s(xi) = np.exp(pi - log_z)
        return [(np.exp(v - log_z), ch) for ch, v in zip(chars, scaled)]

    # .items() instead of the Python-2-only .iteritems()
    outlm = {hist: normalize(chars) for hist, chars in lm.items()}
    return outlm
def generate_letter(lm, history, order, stochastic, random_state):
    """Pick the next character following ``history`` under the model ``lm``.

    lm           -- dict mapping a history string to (probability, char) pairs
    history      -- text generated so far; only the last ``order`` characters
                    are used as the lookup key
    order        -- context length of the model
    stochastic   -- True samples from the distribution, False takes the
                    most probable character
    random_state -- object providing ``rand`` and ``shuffle`` (for example
                    ``np.random.RandomState``); only used when stochastic

    Raises KeyError if the history is not a key of ``lm``.
    """
    history = history[-order:]
    dist = lm[history]
    if stochastic:
        # walk the distribution until the drawn mass is used up
        x = random_state.rand()
        for v, c in dist:
            x = x - v
            if x <= 0:
                return c
        # randomize choice if it all failed
        li = list(range(len(dist)))
        random_state.shuffle(li)
        _, c = dist[li[0]]
    else:
        probs = np.array([d[0] for d in dist])
        ii = np.argmax(probs)
        _, c = dist[ii]
    return c
def step_text(lm, order, stochastic, random_seed, history=None, end=None,
              beam_width=1, n_letters=1000, verbose=False):
    """Generate ``n_letters`` characters one step at a time.

    lm          -- model as returned by train_char_lm
    order       -- context length of the model
    stochastic  -- True samples each character, False takes the argmax
    random_seed -- seed for the np.random.RandomState used for sampling
    history     -- start text as a string or list of characters; None or
                   "<START>" starts from the "~" padding

    Returns a list with one string, matching the return shape of beam_text.
    """
    # beam_width argument is ignored, as is end, and verbose
    if history is None or history == "<START>":
        history = "~" * order
    else:
        # Python 2 only: interprets backslash escapes in the start text
        history = "".join(history).decode("string_escape")
    out = []
    random_state = np.random.RandomState(random_seed)
    for _ in range(n_letters):
        c = generate_letter(lm, history, order, stochastic, random_state)
        history = history[-order:] + c
        out.append(c)
    # return list to match beam_text
    return ["".join(out)]
def beam_text(lm, order, stochastic, random_seed, history=None,
              end=None, beam_width=10, n_letters=1000, verbose=False):
    """Generate text from ``lm`` with beam search.

    lm          -- model as returned by train_char_lm
    order       -- context length of the model
    stochastic  -- passed to beamsearch (stochastic beam reduction)
    random_seed -- seed for the np.random.RandomState given to beamsearch;
                   also used to pick a fallback key on dead ends
    history     -- list of ``order`` start characters; None or "<START>"
                   starts from the "~" padding
    end         -- end token or list of end tokens; None means "<EOS>"
    beam_width  -- number of beams
    n_letters   -- maximum generated length, passed as clip_len
    verbose     -- append score and probability to every returned string

    Returns the beams as a list of strings ordered by ascending score (the
    best beam is last).

    Raises ValueError if the start sequence length differs from ``order``.
    """
    def pf(prefix):
        # Probability function for beamsearch: (prob, char) pairs for prefix.
        history = prefix[-order:]
        # lm wants key as a single string
        # (Python 2 only: str.decode("string_escape"))
        k = "".join(history).decode("string_escape")
        # sometimes the distribution "dead-ends"...
        try:
            dist = lm[k]
        except KeyError:
            shorter = "".join(prefix[-order:-1])
            previous = "".join(prefix[-order-1:-1])
            alt_keys = [i for i in lm.keys()
                        if shorter in i and previous != i]
            # if no alternates, start from a random place
            if len(alt_keys) == 0:
                # choose a key at semi-random
                ak = list(lm.keys())
                dist = lm[ak[random_seed % len(ak)]]
            else:
                dist = lm[alt_keys[0]]
        return dist

    if history is None or history == "<START>":
        start_token = ["~"] * order
    else:
        start_token = history
    if len(start_token) != order:
        raise ValueError("Start length must match order setting of {}! {} is length {}".format(order, history, len(history)))

    if end is None:
        end_token = "<EOS>"
    else:
        end_token = end

    random_state = np.random.RandomState(random_seed)
    # NOTE(review): the arguments after start_token were reconstructed from
    # the values prepared above - confirm against the intended call
    b = beamsearch(pf, beam_width, start_token=start_token,
                   end_token=end_token,
                   clip_len=n_letters,
                   stochastic=stochastic,
                   random_state=random_state)
    # it is a generator but do this so that function prototypes are consistent
    all_r = [(r[0], r[1], r[2]) for r in b]

    # reorder so final scoring is matched (rather than completion order)
    all_r = sorted(all_r, key=lambda x: x[1])

    returns = []
    for r in all_r:
        s_r = "".join(r[0])
        if verbose:
            s_r = s_r + "\nScore: {}".format(r[1]) + "\nProbability: {}".format(r[2])
        returns.append(s_r)
    # return list of all beams, ordered by score
    return returns
if __name__ == "__main__":
    # Command line driver: parse the options, train (or load from cache) the
    # character model, resolve the start/end tokens against the model keys,
    # decode with the selected decoder and print the resulting beam(s).
    default_order = 6
    default_temperature = 1.0
    default_beamwidth = 10
    default_start = "<START>"
    default_end = "<EOS>"
    default_decoder = 0
    default_randomseed = 1999
    default_maxlength = 500
    default_cache = 1
    default_print = 1
    default_verbose = 1
    # TODO: Faster cache
    # %(prog)s is expanded by argparse to the script name; the raw formatter
    # keeps the explicit newlines of the epilog and help strings.
    parser = argparse.ArgumentParser(description="A Markov chain character level language model with beamsearch decoding",
                                     epilog="Simple usage:\n    python %(prog)s shakespeare_input.txt -o 10\nFull usage:\n    python %(prog)s shakespeare_input.txt -o 10 -d 0 -s 'HOLOFERNES' -e 'crew?\\n' -r 2177",
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("filepath", help="Path to file to use for language modeling. For an example file, try downloading\n", default=None)
    parser.add_argument("-o", "--order", help="Markov chain order, higher will make better text but takes longer to process.\nDefault {}".format(default_order), default=default_order)
    parser.add_argument("-t", "--temperature", help="Temperature for Markov chain softmax, higher is more random, lower more static.\nDefault {}".format(default_temperature), default=default_temperature)
    parser.add_argument("-d","--decoder", help="Decoder for Markov chain, 0 is stochastic beamsearch, 1 is argmax beamsearch, 2 is sampled next-step, 3 is argmax next-step.\nDefault {}".format(default_decoder), default=default_decoder)
    parser.add_argument("-b", "--beamwidth", help="Beamwidth to use for beamsearch.\nDefault {}".format(default_beamwidth), default=default_beamwidth)
    parser.add_argument("-r", "--randomseed", help="Random seed to initialize randomness.\nDefault {}".format(default_randomseed), default=default_randomseed)
    parser.add_argument("-s", "--starttoken", help="Start sequence token. Can be a string such as 'hello\\n', extra padding will be inferred from the data.\nDefault {}".format(default_start), default=default_start)
    parser.add_argument("-e", "--endtoken", help="End sequence token. Can be a string such as 'goodbye\\n'.\nDefault {}".format(default_end), default=default_end)
    parser.add_argument("-m", "--maxlength", help="Max generation length.\nDefault {}".format(default_maxlength), default=default_maxlength)
    parser.add_argument("-c", "--cache", help="Whether to cache models for faster use.\nDefault {}".format(default_cache), default=default_cache)
    parser.add_argument("-a", "--allbeams", help="Print all beams for beamsearch, 0 for top only, 1 for all.\nDefault {}".format(default_print), default=default_print)
    parser.add_argument("-v", "--verbose", help="Print the score and probability for beams.\nDefault {}".format(default_verbose), default=default_verbose)
    args = parser.parse_args()
    if args.filepath is None:
        raise ValueError("No text filepath provided!")
    fpath = args.filepath
    if not os.path.exists(fpath):
        raise ValueError("Unable to find file at %s" % fpath)

    decoder = int(args.decoder)
    # TODO: gumbel-max in stochastic beam decoder...?
    beam_width = int(args.beamwidth)
    temperature = float(args.temperature)
    random_seed = int(args.randomseed)
    maxlength = int(args.maxlength)
    allbeams = int(args.allbeams)
    verbose = int(args.verbose)
    order = int(args.order)
    if order < 1:
        raise ValueError("Order must be at least 1! Was set to {}".format(order))
    cache = int(args.cache)
    if cache not in [0, 1]:
        raise ValueError("Cache must be either 0 (no save) or 1 (save)! Was set to {}".format(cache))

    # Python 2 only: turn escapes typed on the command line (such as \n)
    # into the real characters
    start_token = args.starttoken.decode("string_escape")
    user_start_token = start_token != default_start
    end_token = args.endtoken.decode("string_escape")
    user_end_token = end_token != default_end

    # NOTE(review): type_tag is not used again in the visible script
    if decoder == 0:
        # stochastic beam
        stochastic = True
        decode_fun = beam_text
        type_tag = "Stochastic beam search, beam width {}, Markov order {}, temperature {}, seed {}".format(beam_width, order, temperature, random_seed)
    elif decoder == 1:
        # argmax beam
        stochastic = False
        decode_fun = beam_text
        type_tag = "Argmax beam search, beam width {}, Markov order {}".format(beam_width, order)
    elif decoder == 2:
        # stochastic next-step
        stochastic = True
        decode_fun = step_text
        type_tag = "Stochastic next step, Markov order {}, temperature {}, seed {}".format(order, temperature, random_seed)
    elif decoder == 3:
        # argmax next-step
        stochastic = False
        decode_fun = step_text
        type_tag = "Argmax next step, Markov order {}".format(order)
    else:
        raise ValueError("Decoder must be 0, 1, 2, or 3! Was set to {}".format(decoder))

    # only things that affect the language model are training data, temperature, order
    # Use the bare file name without extension so that input paths with
    # directories or extra dots still give a usable cache name in the
    # current directory.
    model_base = os.path.splitext(os.path.basename(fpath))[0]
    cached_name = "model_{}_t{}_o{}.pkl".format(model_base, str(temperature).replace(".", "pt"), order)
    if cache == 1 and os.path.exists(cached_name):
        print("Found cached model at {}, loading...".format(cached_name))
        start_time = time.time()
        # SECURITY: unpickling executes arbitrary code; only load cache files
        # written by this script.
        with open(cached_name, "rb") as f:
            lm = pickle.load(f)
        # A JSON cache was tried before and dropped (codec troubles).
        stop_time = time.time()
        print("Time to load: {} s".format(stop_time - start_time))
    else:
        start_time = time.time()
        lm = train_char_lm(fpath, order=order,
                           temperature=temperature)
        stop_time = time.time()
        print("Time to train: {} s".format(stop_time - start_time))
        if cache == 1:
            print("Caching model now...")
            with open(cached_name, "wb") as f:
                pickle.dump(lm, f)
            print("Caching complete!")

    # All this logic to handle/match different start keys
    rs = np.random.RandomState(random_seed)
    if user_start_token:
        if len(start_token) > order:
            start_token = start_token[-order:]
            print("WARNING: specified start token larger than order, truncating to\n{}".format(start_token))
        matching_keys = [k for k in lm.keys() if k.endswith(start_token)]
        all_keys = [k for k in lm.keys()]
        # keep asking until the start text ends at least one model key
        while len(matching_keys) == 0:
            rs.shuffle(all_keys)
            # at most three suggestions; models with fewer keys show fewer
            suggestions = "\n".join("`{}`".format(k) for k in all_keys[:3])
            print("No matching key for `{}` in language model!".format(start_token))
            print("Please enter another one (suggestions in backticks)\n{})".format(suggestions))
            line = raw_input('Prompt ("Ctrl-C" to quit): ')
            line = line.strip()
            if len(line) == 0:
                continue
            start_token = line
            matching_keys = [k for k in lm.keys() if k.endswith(start_token)]

        if len(start_token) < order:
            # choose key at random
            matching_keys = [k for k in lm.keys() if k.endswith(start_token)]
            rs.shuffle(matching_keys)
            start_token = matching_keys[0]
            print("WARNING: start key shorter than order, set to\n`{}`".format(start_token))
        start_token = list(start_token)

    if user_end_token:
        end_token = list(end_token)

    if allbeams not in (0, 1):
        raise ValueError("Unknown setting for allbeams {}".format(allbeams))

    if verbose == 0:
        verbose = False
    elif verbose == 1:
        verbose = True
    else:
        raise ValueError("Unknown setting for verbose {}".format(verbose))

    start_time = time.time()
    all_o = decode_fun(lm, order, stochastic, random_seed, history=start_token,
                       end=end_token, beam_width=beam_width,
                       n_letters=maxlength, verbose=verbose)
    stop_time = time.time()
    print("Time to decode: {} s".format(stop_time - start_time))

    if allbeams == 0:
        # beams are ordered by ascending score, so the top beam is the last
        # one; the slice also tolerates an empty result
        all_o = all_o[-1:]

    for n, oi in enumerate(all_o):
        if len(all_o) > 1:
            if n == 0:
                print("BEAM {} (worst score)".format(n + 1))
            elif n != (len(all_o) - 1):
                print("BEAM {}".format(n + 1))
            else:
                print("BEAM {} (best score)".format(n + 1))
        if user_start_token:
            print("".join(start_token) + oi)
        else:
            print(oi)
