Created
December 22, 2017 07:43
-
-
Save jkff/dbbf3bb139c14407aaf7736b7e81797d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pickle | |
import random | |
from scipy.optimize import curve_fit | |
def shuffled(x):
    """Return a new list holding the items of ``x`` in random order.

    ``x`` is copied into a fresh list before shuffling, so the caller's
    object is never reordered.
    """
    permuted = list(x)
    random.shuffle(permuted)
    return permuted
def fit_heaps_law(phrases):
    """Fit the power law ``k * n ** b`` to vocabulary growth over ``phrases``.

    The phrases are visited in random order.  Two growth curves are recorded:
    the number of distinct words seen after each word, and after each phrase.
    Both curves start with a leading 0 (nothing seen yet).

    Args:
        phrases: iterable of phrases, each an iterable of hashable words.

    Returns:
        A pair ``(word_fit, phrase_fit)``.  Each element is a tuple
        ``(k, b, predict)`` where ``predict(n)`` evaluates ``k * n ** b``.
    """
    num_unique_after_phrase = [0]
    num_unique_after_word = [0]
    unique_words = set()
    for phrase in shuffled(phrases):
        for word in phrase:
            unique_words.add(word)
            num_unique_after_word.append(len(unique_words))
        num_unique_after_phrase.append(len(unique_words))

    def f(n, k, b):
        return k * n ** b

    def fit(data):
        def condense(series):
            # 1000 points ought to be enough.
            # Floor division keeps the stride an integer on every Python
            # version; with true division the stride becomes a float and
            # ``i % stride == 0`` would match almost no index.
            stride = max(1, len(series) // 1000)
            return list(series[::stride])

        # x and y are thinned with the same stride so they stay aligned.
        # Bounds: 0 <= k <= len(data) and 0 <= b <= 1.
        ((k, b), _) = curve_fit(
            f,
            xdata=condense(range(0, len(data))),
            ydata=condense(data),
            bounds=((0, 0), (len(data), 1)))
        return k, b, (lambda n: f(n, k, b))

    return fit(num_unique_after_word), fit(num_unique_after_phrase)
# NOTE: unpickling can execute arbitrary code; only load a trusted
# 'movie_lines_cleaned.p'.
# Pickle data is read in binary mode, and the context manager makes sure the
# file handle is closed once loading is done.
with open('movie_lines_cleaned.p', 'rb') as pickle_file:
    phrases = pickle.load(pickle_file)
word_fit, phrase_fit = fit_heaps_law(phrases)
# Each fit is a (k, b, predictor) tuple.
print(word_fit)
print(phrase_fit)
def fit_phrase_length(phrases):
    """Fit ``a * w**b * c**w`` to the distribution of phrase lengths.

    A histogram of ``len(phrase)`` is built for lengths ``0..max`` and the
    curve is fitted to the raw counts; ``a`` is then divided by the number of
    phrases so the returned curve is on the frequency scale.

    Args:
        phrases: non-empty iterable of sized phrases.

    Returns:
        A tuple ``(a, b, c, pdf, max_len)`` where ``pdf(w)`` evaluates the
        fitted curve at length ``w`` and ``max_len`` is the longest phrase
        length observed.
    """
    lens = [len(phrase) for phrase in phrases]
    longest = max(lens)

    # histogram[n] == number of phrases containing exactly n words.
    histogram = [0] * (longest + 1)
    for length in lens:
        histogram[length] += 1

    def pdf(w, a, b, c):
        return a * (w**b) * (c**w)

    fitted, _ = curve_fit(
        pdf,
        xdata=range(len(histogram)),
        ydata=histogram)
    a, b, c = fitted

    # We performed the fit on counts rather than frequencies: for some reason,
    # it gives a much better fit, but we need to scale 'a' back.
    a = a / sum(histogram)
    return a, b, c, lambda w: pdf(w, a, b, c), longest
def p_phrase_has_new_word(omega, phrase_length_fit):
    """Estimate the probability that a phrase contains at least one new word.

    Treating ``omega`` as the chance that any single word is new, a phrase of
    ``i`` words contains a new word with probability ``1 - (1 - omega)**i``.
    These values are weighted by the fitted length curve and summed over
    lengths ``1..max_len``.

    Args:
        omega: per-word probability used in the formula above.
        phrase_length_fit: tuple ``(a, b, c, pdf, max_len)``; only ``pdf``
            and ``max_len`` are used.

    Returns:
        The weighted sum described above (``0`` when ``max_len`` is 0).
    """
    _a, _b, _c, length_pdf, longest = phrase_length_fit
    total = 0
    for n_words in range(1, longest + 1):
        p_all_known = (1 - omega) ** n_words
        total += length_pdf(n_words) * (1 - p_all_known)
    return total
phrase_length_fit = fit_phrase_length(phrases)
k, beta, _ = word_fit
# Slice from index 0 so the sample really holds (up to) 1000 phrases;
# starting at 1 would silently drop one phrase and keep only 999.
sample = shuffled(phrases)[:1000]
# Total number of words across the sampled phrases.
w = sum(len(phrase) for phrase in sample)
# Slope of the fitted curve k * n ** beta at n = w.
omega = k * beta * w ** (beta - 1)
# Space-separated output, formatted so the text is the same on Python 2 and 3.
print("%s %s %s" % (w, omega, p_phrase_has_new_word(omega, phrase_length_fit)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment