Skip to content

Instantly share code, notes, and snippets.

@decretist
Created February 19, 2020 18:56
Show Gist options
  • Save decretist/64b9188c45f7a935078492e1da4eebc7 to your computer and use it in GitHub Desktop.
Save decretist/64b9188c45f7a935078492e1da4eebc7 to your computer and use it in GitHub Desktop.
Zipf's law demonstration, comparing the theoretical 1/N tail-off with actual data from the 20 MFWs from Gratian's dicta
#!/usr/local/bin/python3
#
# Paul Evans ([email protected]
# 13-16 February 2020
#
import math
import matplotlib.pyplot as pp
import statistics
def regression_slope(data_points):
n = len(data_points)
x_values, y_values = zip(*data_points)
x_bar = statistics.mean(x_values)
y_bar = statistics.mean(y_values)
xy_sum = 0
x_squared_sum = 0
for i in range(n):
xy_sum += x_values[i] * y_values[i]
x_squared_sum += x_values[i] ** 2
return (xy_sum - n * x_bar * y_bar) / (x_squared_sum - n * x_bar ** 2)
def plot_regression(data_points):
x_values, y_values = zip(*data_points)
x_bar = statistics.mean(x_values)
y_bar = statistics.mean(y_values)
slope = regression_slope(data_points)
x_values = [0, max(x_values)]
y_values = []
for x in x_values:
y_values.append(y_bar + slope * (x - x_bar))
pp.plot(x_values, y_values)
return slope
def plot_data_bar(data_points):
x_values, y_values = zip(*data_points)
pp.bar(x_values, y_values)
pp.xticks([1, 5, 10, 15, 20])
pp.xlabel('rank')
pp.ylabel('word count')
def plot_data_scatter(data_points):
x_values, y_values = zip(*data_points)
pp.scatter(x_values, y_values)
pp.xlabel('$log_{e}$ rank')
pp.ylabel('$log_{e}$ word count')
def logify(data_points):
x_tmp, y_tmp = zip(*data_points)
x_values = list(x_tmp)
y_values = list(y_tmp)
x_log = [math.log(x) for x in x_values]
y_log = [math.log(y) for y in y_values]
return list(zip(x_log, y_log))
def main():
# theoretical Zipf distribution for 20 MFWs (bar plot)
x_tmp = [x for x in range(1, 21)]
y_tmp = [(1 / x) * 1861 for x in x_tmp]
plot_data_bar(list(zip(x_tmp, y_tmp)))
pp.title('theoretical Zipf distribution for 20 MFWs')
pp.savefig('PNGs/Figure_Z_theoretical_bar.png')
pp.show()
# theoretical Zipf distribution for 20 MFWs (log-log scatter plot)
plot_data_scatter(logify(list(zip(x_tmp, y_tmp))))
slope = plot_regression(logify(list(zip(x_tmp, y_tmp))))
pp.title(f'theoretical Zipf distribution for 20 MFWs\n(log-log, slope = {slope:.4f})')
pp.savefig('PNGs/Figure_Z_theoretical_log-log_scatter.png')
pp.show()
# actual distribution for 20 MFWs from R1 and R2 dicta (bar plot)
data_points = [(1, 1861), (2, 1666), (3, 1638), (4, 1132), (5, 784), (6, 768), (7, 691), (8, 681), (9, 677), (10, 661), (11, 631), (12, 534), (13, 530), (14, 518), (15, 510), (16, 473), (17, 418), (18, 379), (19, 372), (20, 357)]
plot_data_bar(data_points)
pp.title('actual distribution for 20 MFWs from R1 and R2 $\it{dicta}$')
pp.savefig('PNGs/Figure_Z_actual_bar.png')
pp.show()
# actual distribution for 20 MFWs from R1 and R2 dicta (log-log scatter plot)
plot_data_scatter(logify(data_points))
slope = plot_regression(logify(data_points))
pp.title('actual distribution for 20 MFWs from R1 and R2 $\it{dicta}$\n(log-log, slope = ' + f'{slope:.4f})')
pp.savefig('PNGs/Figure_Z_actual_log-log_scatter.png')
pp.show()
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment