Steboss89 · April 12, 2022 20:33
diff --git a/zipf_law.py b/zipf_law.py
 # Zipf's-law plot: count every token of the corpus, rank the tokens by
 # frequency and draw frequency against rank on log-log axes.
 # data is a list of all the Bible's books

 # call the CountVectorizer
 cvec = CountVectorizer()
 # learn the vocabulary and build the sparse document-term matrix in one
 # pass (a separate transform() call would vectorize the corpus twice)
 all_df = cvec.fit_transform(data)
 # sum over the documents: one total per term (still 2-D, flattened below)
 tf = np.sum(all_df, axis=0)
 # flatten to 1-D; ravel stays 1-D even when there is a single term
 tf2 = np.asarray(tf).ravel()
 # get_feature_names() was replaced by get_feature_names_out() in newer
 # scikit-learn releases; use whichever this installation provides
 feature_names = getattr(cvec, "get_feature_names_out", None) or cvec.get_feature_names
 # one row per term, the totals in column 0
 term_freq_df = pd.DataFrame(tf2, index=feature_names())
 # create the plot
 # 0 is the counts
 counts = term_freq_df[0]
 # index the words
 tokens = term_freq_df.index
 # ranks is the position of the word
 ranks = np.arange(1, len(counts) + 1)
 # positions that sort the counts in descending order; computed on the raw
 # array so the result is plain integer positions rather than a
 # token-labelled Series, and stable so equal counts keep their column order
 indices = np.argsort(-counts.to_numpy(), kind="stable")
 # grab the frequencies, most frequent first; .iloc makes the positional
 # lookup explicit because the Series is labelled by token strings
 frequencies = counts.iloc[indices]
 top_frequency = frequencies.iloc[0]
 # plot figure
 plt.figure(figsize=(15, 15))
 # set limits
 plt.ylim(1, 10**4.1)
 plt.xlim(1, 10**4.1)
 # log log plot
 plt.loglog(ranks, frequencies, marker=".")
 # draw a line to highlight Zipf's expected behaviour (slope -1 in log-log)
 plt.plot([1, top_frequency], [top_frequency, 1], color='r')
 plt.xlabel("Frequency rank of token", fontsize=20)
 plt.ylabel("Absolute frequency of token", fontsize=20)
 plt.xticks(fontsize=16)
 plt.yticks(fontsize=16)
 plt.grid(True)
 # add the text: label roughly log-spaced positions; np.unique drops the
 # repeats that truncating to int produces at the low end, so no label is
 # drawn several times on top of itself
 label_positions = np.unique(
     np.logspace(-0.5, np.log10(len(counts) - 2), 25).astype(int)
 )
 for n in label_positions:
     plt.text(ranks[n], frequencies.iloc[n],
              " " + tokens[indices[n]],
              verticalalignment="bottom",
              horizontalalignment="left",
              fontsize=20)
	# Zipf's-law plot: count every token of the corpus, rank the tokens by
	# frequency and draw frequency against rank on log-log axes.
	# data is a list of all the Bible's books

	# call the CountVectorizer
	cvec = CountVectorizer()
	# learn the vocabulary and build the sparse document-term matrix in one
	# pass (a separate transform() call would vectorize the corpus twice)
	all_df = cvec.fit_transform(data)
	# sum over the documents: one total per term (still 2-D, flattened below)
	tf = np.sum(all_df, axis=0)
	# flatten to 1-D; ravel stays 1-D even when there is a single term
	tf2 = np.asarray(tf).ravel()
	# get_feature_names() was replaced by get_feature_names_out() in newer
	# scikit-learn releases; use whichever this installation provides
	feature_names = getattr(cvec, "get_feature_names_out", None) or cvec.get_feature_names
	# one row per term, the totals in column 0
	term_freq_df = pd.DataFrame(tf2, index=feature_names())
	# create the plot
	# 0 is the counts
	counts = term_freq_df[0]
	# index the words
	tokens = term_freq_df.index
	# ranks is the position of the word
	ranks = np.arange(1, len(counts) + 1)
	# positions that sort the counts in descending order; computed on the raw
	# array so the result is plain integer positions rather than a
	# token-labelled Series, and stable so equal counts keep their column order
	indices = np.argsort(-counts.to_numpy(), kind="stable")
	# grab the frequencies, most frequent first; .iloc makes the positional
	# lookup explicit because the Series is labelled by token strings
	frequencies = counts.iloc[indices]
	top_frequency = frequencies.iloc[0]
	# plot figure
	plt.figure(figsize=(15, 15))
	# set limits
	plt.ylim(1, 10**4.1)
	plt.xlim(1, 10**4.1)
	# log log plot
	plt.loglog(ranks, frequencies, marker=".")
	# draw a line to highlight Zipf's expected behaviour (slope -1 in log-log)
	plt.plot([1, top_frequency], [top_frequency, 1], color='r')
	plt.xlabel("Frequency rank of token", fontsize=20)
	plt.ylabel("Absolute frequency of token", fontsize=20)
	plt.xticks(fontsize=16)
	plt.yticks(fontsize=16)
	plt.grid(True)
	# add the text: label roughly log-spaced positions; np.unique drops the
	# repeats that truncating to int produces at the low end, so no label is
	# drawn several times on top of itself
	label_positions = np.unique(
		np.logspace(-0.5, np.log10(len(counts) - 2), 25).astype(int)
	)
	# the loop body must be indented below the for header to be valid Python
	for n in label_positions:
		plt.text(ranks[n], frequencies.iloc[n],
			" " + tokens[indices[n]],
			verticalalignment="bottom",
			horizontalalignment="left",
			fontsize=20)