Skip to content

Instantly share code, notes, and snippets.

@Steboss89
Created April 12, 2022 20:33
Show Gist options
  • Save Steboss89/effc0b12f4b8241c9360d2f524a7e53b to your computer and use it in GitHub Desktop.
Save Steboss89/effc0b12f4b8241c9360d2f524a7e53b to your computer and use it in GitHub Desktop.
Compute Zipf's law
# Compute and plot Zipf's law (token frequency vs. frequency rank).
# `data` is a list of all the Bible's books.

# preprocessing on data
# call the CountVectorizer
cvec = CountVectorizer()
# fit_transform learns the vocabulary AND returns the sparse count matrix,
# so a separate transform() pass over the same corpus is not needed
all_df = cvec.fit_transform(data)
# sum on all the term occurrences across the documents
tf = np.sum(all_df, axis=0)
# flatten to 1-D; ravel (unlike squeeze) stays 1-D for a one-term vocabulary
tf2 = np.asarray(tf).ravel()
# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement and fall back only on releases that predate it
if hasattr(cvec, "get_feature_names_out"):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()
# thus we can transform it as a DataFrame: one row per token, column 0 = count
term_freq_df = pd.DataFrame([tf2], columns=feature_names).transpose()

# create the plot
# 0 is the counts
counts = term_freq_df[0]
# index the words
tokens = term_freq_df.index
# ranks is the position of the word (1 = most frequent)
ranks = np.arange(1, len(counts) + 1)
# work on plain NumPy arrays from here on: integer keys on a Series whose
# index holds token strings depend on a deprecated positional fallback
count_values = counts.to_numpy()
# positions of the tokens sorted by decreasing count; a stable sort keeps
# tied tokens in vocabulary order so the output is reproducible
indices = np.argsort(-count_values, kind="stable")
# grab the frequencies, highest first
frequencies = count_values[indices]

# plot figure
plt.figure(figsize=(15, 15))
# set limits (fixed window; anything outside it is not shown)
plt.ylim(1, 10**4.1)
plt.xlim(1, 10**4.1)
# log log plot
plt.loglog(ranks, frequencies, marker=".")
# draw a line to highlight Zipf's expected behaviour (slope -1 in log-log)
plt.plot([1, frequencies[0]], [frequencies[0], 1], color='r')
plt.xlabel("Frequency rank of token", fontsize=20)
plt.ylabel("Absolute frequency of token", fontsize=20)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)

# add the text: label a handful of tokens spaced evenly on the log axis
# log10(len(counts) - 2) is only defined for more than two tokens
if len(counts) > 2:
    # truncating to int yields repeated small positions (0, 0, 1, 1, ...);
    # deduplicate so each label is drawn only once
    label_positions = np.unique(
        np.logspace(-0.5, np.log10(len(counts) - 2), 25).astype(int)
    )
    for n in label_positions:
        plt.text(ranks[n], frequencies[n],
                 " " + tokens[indices[n]],
                 verticalalignment="bottom",
                 horizontalalignment="left",
                 fontsize=20)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment