Created
February 19, 2020 19:22
-
-
Save decretist/429e8d60b3d87d22f1d0a0e7452fa21d to your computer and use it in GitHub Desktop.
Simplified 2-dimensional demo of word frequency variation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python3 | |
# | |
# Paul Evans ([email protected] | |
# 6-11 February 2020 | |
# 22 January 2020 | |
# | |
import argparse | |
import math | |
import matplotlib.pyplot as pp | |
import statistics | |
def pstdev(data, **kwargs): | |
'''temporary replacement for statistics.pstdev()''' | |
mu = None | |
if 'mu' in kwargs: mu = kwargs['mu'] # type check: int, float, or None | |
if mu == None: mu = statistics.mean(data) | |
sum = 0 | |
for i in range(len(data)): | |
sum += (data[i] - mu) ** 2 | |
return(math.sqrt(sum / len(data))) | |
parser = argparse.ArgumentParser() | |
parser.add_argument('-f', '--frequency_view', action='store_true') | |
parser.add_argument('-p', '--de_pen', action='store_true') | |
args = parser.parse_args() | |
if args.de_pen: de_pen = True | |
else: de_pen = False | |
if args.frequency_view: frequency_view = True | |
else: frequency_view = False | |
if de_pen: | |
# number of words in R1 and R2 dicta | |
words_r1 = 66238 | |
words_r2 = 14811 | |
# number of occurrences of 'in' and 'non' in R1 and R2 dicta | |
occurrences_in_r1 = 1682 | |
occurrences_in_r2 = 431 | |
occurrences_non_r1 = 1622 | |
occurrences_non_r2 = 314 | |
else: | |
# number of words in R1 and R2 dicta | |
words_r1 = 56713 | |
words_r2 = 14255 | |
# number of occurrences of 'in' and 'non' in R1 and R2 dicta | |
occurrences_in_r1 = 1450 | |
occurrences_in_r2 = 411 | |
occurrences_non_r1 = 1360 | |
occurrences_non_r2 = 306 | |
# frequency of occurrence of 'in' per 1000 words | |
frequency_in_r1 = (occurrences_in_r1 / words_r1) * 1000 | |
frequency_in_r2 = (occurrences_in_r2 / words_r2) * 1000 | |
frequency_in_values = [frequency_in_r1, frequency_in_r2] | |
frequency_in_mean = ((occurrences_in_r1 + occurrences_in_r2) / (words_r1 + words_r2)) * 1000 | |
# standard_deviation_in = statistics.pstdev(frequency_in_values, mu=frequency_in_mean) | |
standard_deviation_in = pstdev(frequency_in_values, mu=frequency_in_mean) | |
percentage_in_r1 = ((frequency_in_r1 - frequency_in_mean) / frequency_in_mean) * 100 | |
percentage_in_r2 = ((frequency_in_r2 - frequency_in_mean) / frequency_in_mean) * 100 | |
if de_pen: print('(including de Pen.)') | |
else: print('(excluding de Pen.)') | |
string_in = 'occurrences of \'in\' per 1,000 words' | |
print(f'{frequency_in_r1:7.4f} {string_in} (R1)') | |
print(f'{frequency_in_r2:7.4f} {string_in} (R2)') | |
print(f'{frequency_in_mean:7.4f} {string_in} (mean)') | |
print(f'{standard_deviation_in:7.4f} {string_in} (standard deviation)') | |
if percentage_in_r1 > 0: more_or_less = 'more' | |
else: more_or_less = 'less' | |
print(f'\'in\' occurs {abs(percentage_in_r1):.2f}% {more_or_less} frequently in R1 than in mean') | |
if percentage_in_r2 > 0: more_or_less = 'more' | |
else: more_or_less = 'less' | |
print(f'\'in\' occurs {abs(percentage_in_r2):.2f}% {more_or_less} frequently in R2 than in mean') | |
# frequency of occurrence of 'non' per 1000 words | |
frequency_non_r1 = (occurrences_non_r1 / words_r1) * 1000 | |
frequency_non_r2 = (occurrences_non_r2 / words_r2) * 1000 | |
frequency_non_values = [frequency_non_r1, frequency_non_r2] | |
frequency_non_mean = ((occurrences_non_r1 + occurrences_non_r2) / (words_r1 + words_r2)) * 1000 | |
# standard_deviation_non = statistics.pstdev(frequency_non_values, mu=frequency_non_mean) | |
standard_deviation_non = pstdev(frequency_non_values, mu=frequency_non_mean) | |
percentage_non_r1 = ((frequency_non_r1 - frequency_non_mean) / frequency_non_mean) * 100 | |
percentage_non_r2 = ((frequency_non_r2 - frequency_non_mean) / frequency_non_mean) * 100 | |
string_non = 'occurrences of \'non\' per 1,000 words' | |
print(f'{frequency_non_r1:7.4f} {string_non} (R1)') | |
print(f'{frequency_non_r2:7.4f} {string_non} (R2)') | |
print(f'{frequency_non_mean:7.4f} {string_non} (mean)') | |
print(f'{standard_deviation_non:7.4f} {string_non} (standard deviation)') | |
if percentage_non_r1 > 0: more_or_less = 'more' | |
else: more_or_less = 'less' | |
print(f'\'non\' occurs {abs(percentage_non_r1):.2f}% {more_or_less} frequently in R1 than in mean') | |
if percentage_non_r2 > 0: more_or_less = 'more' | |
else: more_or_less = 'less' | |
print(f'\'non\' occurs {abs(percentage_non_r2):.2f}% {more_or_less} frequently in R2 than in mean') | |
if frequency_view: | |
value_in_r1 = frequency_in_r1 | |
value_non_r1 = frequency_non_r1 | |
value_in_r2 = frequency_in_r2 | |
value_non_r2 = frequency_non_r2 | |
pp.axis([frequency_in_mean - 2 * standard_deviation_in, | |
frequency_in_mean + 2 * standard_deviation_in, | |
frequency_non_mean - 2 * standard_deviation_non, | |
frequency_non_mean + 2 * standard_deviation_non]) | |
pp.axhline(frequency_non_mean,linestyle='dashed') | |
pp.axvline(frequency_in_mean, linestyle='dashed') | |
pp.xlabel('frequency of occurrence of $\it{in}$ per 1,000 words') | |
pp.ylabel('frequency of occurrence of $\it{non}$ per 1,000 words') | |
else: #standard deviation view | |
value_in_r1 = (frequency_in_r1 - frequency_in_mean) / standard_deviation_in | |
value_non_r1 = (frequency_non_r1 - frequency_non_mean) / standard_deviation_non | |
value_in_r2 = (frequency_in_r2 - frequency_in_mean) / standard_deviation_in | |
value_non_r2 = (frequency_non_r2 - frequency_non_mean) / standard_deviation_non | |
pp.axis([-2, 2, -2, 2]) | |
pp.axhline(linestyle='dashed') | |
pp.axvline(linestyle='dashed') | |
pp.xticks([-1, 0, 1]) | |
pp.yticks([-1, 0, 1]) | |
pp.xlabel('$\it{in}$') | |
pp.ylabel('$\it{non}$', rotation='horizontal') | |
pp.annotate( | |
f'R1 ({value_in_r1:.3f}, {value_non_r1:.3f})', | |
(value_in_r1, value_non_r1), | |
textcoords="offset points", | |
xytext=(0,10), | |
ha='center' | |
) | |
pp.annotate( | |
f'R2 ({value_in_r2:.3f}, {value_non_r2:.3f})', | |
(value_in_r2, value_non_r2), | |
textcoords="offset points", | |
xytext=(0,10), | |
ha='center' | |
) | |
in_values = [value_in_r1, value_in_r2] | |
non_values = [value_non_r1, value_non_r2] | |
pp.scatter(in_values, non_values) | |
if frequency_view: | |
title_string = '(frequency view, ' | |
filename = './PNGs/Figure_0_frequency_' | |
else: | |
title_string = '(z-score view, ' | |
filename = './PNGs/Figure_0_z-score_' | |
if de_pen: | |
title_string += 'including $\it{de Pen.}$)' | |
filename += 'including_de_Pen.png' | |
else: | |
title_string += 'excluding $\it{de Pen.}$)' | |
filename += 'excluding_de_Pen.png' | |
pp.title(title_string) | |
pp.savefig(filename) | |
pp.gcf().canvas.set_window_title('Figure 0') | |
pp.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment