Created
February 19, 2020 18:56
-
-
Save decretist/64b9188c45f7a935078492e1da4eebc7 to your computer and use it in GitHub Desktop.
Zipf's law demonstration, comparing the theoretical 1/N tail-off with actual data from the 20 MFWs from Gratian's dicta
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python3 | |
# | |
# Paul Evans ([email protected] | |
# 13-16 February 2020 | |
# | |
import math | |
import matplotlib.pyplot as pp | |
import statistics | |
def regression_slope(data_points): | |
n = len(data_points) | |
x_values, y_values = zip(*data_points) | |
x_bar = statistics.mean(x_values) | |
y_bar = statistics.mean(y_values) | |
xy_sum = 0 | |
x_squared_sum = 0 | |
for i in range(n): | |
xy_sum += x_values[i] * y_values[i] | |
x_squared_sum += x_values[i] ** 2 | |
return (xy_sum - n * x_bar * y_bar) / (x_squared_sum - n * x_bar ** 2) | |
def plot_regression(data_points): | |
x_values, y_values = zip(*data_points) | |
x_bar = statistics.mean(x_values) | |
y_bar = statistics.mean(y_values) | |
slope = regression_slope(data_points) | |
x_values = [0, max(x_values)] | |
y_values = [] | |
for x in x_values: | |
y_values.append(y_bar + slope * (x - x_bar)) | |
pp.plot(x_values, y_values) | |
return slope | |
def plot_data_bar(data_points): | |
x_values, y_values = zip(*data_points) | |
pp.bar(x_values, y_values) | |
pp.xticks([1, 5, 10, 15, 20]) | |
pp.xlabel('rank') | |
pp.ylabel('word count') | |
def plot_data_scatter(data_points): | |
x_values, y_values = zip(*data_points) | |
pp.scatter(x_values, y_values) | |
pp.xlabel('$log_{e}$ rank') | |
pp.ylabel('$log_{e}$ word count') | |
def logify(data_points): | |
x_tmp, y_tmp = zip(*data_points) | |
x_values = list(x_tmp) | |
y_values = list(y_tmp) | |
x_log = [math.log(x) for x in x_values] | |
y_log = [math.log(y) for y in y_values] | |
return list(zip(x_log, y_log)) | |
def main(): | |
# theoretical Zipf distribution for 20 MFWs (bar plot) | |
x_tmp = [x for x in range(1, 21)] | |
y_tmp = [(1 / x) * 1861 for x in x_tmp] | |
plot_data_bar(list(zip(x_tmp, y_tmp))) | |
pp.title('theoretical Zipf distribution for 20 MFWs') | |
pp.savefig('PNGs/Figure_Z_theoretical_bar.png') | |
pp.show() | |
# theoretical Zipf distribution for 20 MFWs (log-log scatter plot) | |
plot_data_scatter(logify(list(zip(x_tmp, y_tmp)))) | |
slope = plot_regression(logify(list(zip(x_tmp, y_tmp)))) | |
pp.title(f'theoretical Zipf distribution for 20 MFWs\n(log-log, slope = {slope:.4f})') | |
pp.savefig('PNGs/Figure_Z_theoretical_log-log_scatter.png') | |
pp.show() | |
# actual distribution for 20 MFWs from R1 and R2 dicta (bar plot) | |
data_points = [(1, 1861), (2, 1666), (3, 1638), (4, 1132), (5, 784), (6, 768), (7, 691), (8, 681), (9, 677), (10, 661), (11, 631), (12, 534), (13, 530), (14, 518), (15, 510), (16, 473), (17, 418), (18, 379), (19, 372), (20, 357)] | |
plot_data_bar(data_points) | |
pp.title('actual distribution for 20 MFWs from R1 and R2 $\it{dicta}$') | |
pp.savefig('PNGs/Figure_Z_actual_bar.png') | |
pp.show() | |
# actual distribution for 20 MFWs from R1 and R2 dicta (log-log scatter plot) | |
plot_data_scatter(logify(data_points)) | |
slope = plot_regression(logify(data_points)) | |
pp.title('actual distribution for 20 MFWs from R1 and R2 $\it{dicta}$\n(log-log, slope = ' + f'{slope:.4f})') | |
pp.savefig('PNGs/Figure_Z_actual_log-log_scatter.png') | |
pp.show() | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment