decretist · February 19, 2020 19:22
diff --git a/visualize.py b/visualize.py
 #!/usr/local/bin/python3
 #
 # Paul Evans ([email protected])
 # 6-11 February 2020
 # 22 January 2020
 #
 import argparse
 import math
 import matplotlib.pyplot as pp
 import statistics

 def pstdev(data, **kwargs):
    '''Return the population standard deviation of data.

    Temporary replacement for statistics.pstdev().

    data -- iterable of numbers.  It is copied into a list first, so an
            iterator or generator is accepted as well as a sequence.
    mu   -- optional keyword argument giving the point the deviations
            are measured from.  When it is omitted or None the
            arithmetic mean of data is used.

    Raises statistics.StatisticsError when data is empty.
    '''
    values = list(data)
    if not values:
        # without this guard an empty input with mu supplied would fall
        # through to a division by zero below
        raise statistics.StatisticsError('pstdev requires at least one data point')
    mu = kwargs.get('mu') # type check: int, float, or None
    if mu is None:
        mu = statistics.mean(values)
    squared_deviations = sum((value - mu) ** 2 for value in values)
    return math.sqrt(squared_deviations / len(values))

 # Command-line switches:
 #   -f / --frequency_view   plot the frequencies per 1,000 words; without
 #                           it the plot is drawn in z-score units
 #   -p / --de_pen           use the counts that include de Pen.; without
 #                           it the counts that exclude de Pen. are used
 parser = argparse.ArgumentParser()
 parser.add_argument('-f', '--frequency_view', action='store_true',
     help='plot frequencies per 1,000 words instead of z-scores')
 parser.add_argument('-p', '--de_pen', action='store_true',
     help='use the word and occurrence counts that include de Pen.')
 args = parser.parse_args()
 # store_true already yields a bool (False when the switch is absent), so
 # the flags are copied straight from the namespace
 de_pen = args.de_pen
 frequency_view = args.frequency_view

 # Hard-coded counts for the R1 and R2 dicta.  Every pair below reads
 # (count including de Pen., count excluding de Pen.); the de_pen flag
 # decides which member of each pair is used.
 pick = 0 if de_pen else 1
 # number of words in R1 and R2 dicta
 words_r1 = (66238, 56713)[pick]
 words_r2 = (14811, 14255)[pick]
 # number of occurrences of 'in' in R1 and R2 dicta
 occurrences_in_r1 = (1682, 1450)[pick]
 occurrences_in_r2 = (431, 411)[pick]
 # number of occurrences of 'non' in R1 and R2 dicta
 occurrences_non_r1 = (1622, 1360)[pick]
 occurrences_non_r2 = (314, 306)[pick]

 # 'in' per 1,000 words: one rate for R1, one for R2, and the pooled rate
 # over both (all occurrences divided by all words), reported as the "mean"
 frequency_in_r1 = occurrences_in_r1 / words_r1 * 1000
 frequency_in_r2 = occurrences_in_r2 / words_r2 * 1000
 frequency_in_values = [frequency_in_r1, frequency_in_r2]
 total_in = occurrences_in_r1 + occurrences_in_r2
 frequency_in_mean = total_in / (words_r1 + words_r2) * 1000
 # spread of the two rates around the pooled rate; the local pstdev() is
 # called here in place of statistics.pstdev(frequency_in_values, mu=frequency_in_mean)
 standard_deviation_in = pstdev(frequency_in_values, mu=frequency_in_mean)
 # signed distance of each rate from the pooled rate, as a percentage of it
 percentage_in_r1 = (frequency_in_r1 - frequency_in_mean) / frequency_in_mean * 100
 percentage_in_r2 = (frequency_in_r2 - frequency_in_mean) / frequency_in_mean * 100
 print('(including de Pen.)' if de_pen else '(excluding de Pen.)')
 string_in = 'occurrences of  \'in\' per 1,000 words'
 for value, tag in (
     (frequency_in_r1, 'R1'),
     (frequency_in_r2, 'R2'),
     (frequency_in_mean, 'mean'),
     (standard_deviation_in, 'standard deviation'),
 ):
     print(f'{value:7.4f} {string_in} ({tag})')
 for percentage, tag in ((percentage_in_r1, 'R1'), (percentage_in_r2, 'R2')):
     # the sign picks the wording, the magnitude is what gets printed
     more_or_less = 'more' if percentage > 0 else 'less'
     print(f'\'in\' occurs {abs(percentage):.2f}% {more_or_less} frequently in {tag} than in mean')

 # 'non' per 1,000 words: one rate for R1, one for R2, and the pooled rate
 # over both (all occurrences divided by all words), reported as the "mean"
 frequency_non_r1 = occurrences_non_r1 / words_r1 * 1000
 frequency_non_r2 = occurrences_non_r2 / words_r2 * 1000
 frequency_non_values = [frequency_non_r1, frequency_non_r2]
 total_non = occurrences_non_r1 + occurrences_non_r2
 frequency_non_mean = total_non / (words_r1 + words_r2) * 1000
 # spread of the two rates around the pooled rate; the local pstdev() is
 # called here in place of statistics.pstdev(frequency_non_values, mu=frequency_non_mean)
 standard_deviation_non = pstdev(frequency_non_values, mu=frequency_non_mean)
 # signed distance of each rate from the pooled rate, as a percentage of it
 percentage_non_r1 = (frequency_non_r1 - frequency_non_mean) / frequency_non_mean * 100
 percentage_non_r2 = (frequency_non_r2 - frequency_non_mean) / frequency_non_mean * 100
 string_non = 'occurrences of \'non\' per 1,000 words'
 for value, tag in (
     (frequency_non_r1, 'R1'),
     (frequency_non_r2, 'R2'),
     (frequency_non_mean, 'mean'),
     (standard_deviation_non, 'standard deviation'),
 ):
     print(f'{value:7.4f} {string_non} ({tag})')
 for percentage, tag in ((percentage_non_r1, 'R1'), (percentage_non_r2, 'R2')):
     # the sign picks the wording, the magnitude is what gets printed
     more_or_less = 'more' if percentage > 0 else 'less'
     print(f'\'non\' occurs {abs(percentage):.2f}% {more_or_less} frequently in {tag} than in mean')

 # Choose the coordinates to plot ('in' on x, 'non' on y) and set up the
 # axes for the selected view.
 if frequency_view:
     # frequency view: plot the rates themselves
     value_in_r1 = frequency_in_r1
     value_non_r1 = frequency_non_r1
     value_in_r2 = frequency_in_r2
     value_non_r2 = frequency_non_r2
     # window of two standard deviations either side of the mean on each axis
     pp.axis([frequency_in_mean - 2 * standard_deviation_in,
         frequency_in_mean + 2 * standard_deviation_in,
         frequency_non_mean - 2 * standard_deviation_non,
         frequency_non_mean + 2 * standard_deviation_non])
     # dashed guide lines at the two means
     pp.axhline(frequency_non_mean,linestyle='dashed')
     pp.axvline(frequency_in_mean, linestyle='dashed')
     # raw strings: '\i' is not a Python escape sequence, so a plain
     # literal draws an invalid-escape warning; the text handed to
     # matplotlib is unchanged
     pp.xlabel(r'frequency of occurrence of $\it{in}$ per 1,000 words')
     pp.ylabel(r'frequency of occurrence of $\it{non}$ per 1,000 words')
 else: #standard deviation view
     # each rate as its distance from the mean in standard deviations
     value_in_r1 = (frequency_in_r1 - frequency_in_mean) / standard_deviation_in
     value_non_r1 = (frequency_non_r1 - frequency_non_mean) / standard_deviation_non
     value_in_r2 = (frequency_in_r2 - frequency_in_mean) / standard_deviation_in
     value_non_r2 = (frequency_non_r2 - frequency_non_mean) / standard_deviation_non
     pp.axis([-2, 2, -2, 2])
     pp.axhline(linestyle='dashed')
     pp.axvline(linestyle='dashed')
     pp.xticks([-1, 0, 1])
     pp.yticks([-1, 0, 1])
     pp.xlabel(r'$\it{in}$')
     pp.ylabel(r'$\it{non}$', rotation='horizontal')

 # Label each point with its name and coordinates, the text centred ten
 # points above the point, then draw the two points.
 for label, x, y in (
     ('R1', value_in_r1, value_non_r1),
     ('R2', value_in_r2, value_non_r2),
 ):
     pp.annotate(
         f'{label} ({x:.3f}, {y:.3f})',
         (x, y),
         textcoords="offset points",
         xytext=(0,10),
         ha='center'
     )
 in_values = [value_in_r1, value_in_r2]
 non_values = [value_non_r1, value_non_r2]
 pp.scatter(in_values, non_values)
 # Build the plot title and the output file name from the two switches,
 # then save the figure and display it.
 if frequency_view:
     title_string = '(frequency view, '
     filename = './PNGs/Figure_0_frequency_'
 else:
     title_string = '(z-score view, '
     filename = './PNGs/Figure_0_z-score_'
 # raw strings below: '\i' is not a Python escape sequence, so a plain
 # literal draws an invalid-escape warning; the resulting text is unchanged
 if de_pen:
     title_string += r'including $\it{de Pen.}$)'
     filename += 'including_de_Pen.png'
 else:
     title_string += r'excluding $\it{de Pen.}$)'
     filename += 'excluding_de_Pen.png'
 pp.title(title_string)
 # NOTE(review): the path is relative to the current directory; presumably
 # ./PNGs has to exist before this runs -- confirm
 pp.savefig(filename)
 # the canvas-level set_window_title() was deprecated in Matplotlib 3.4 and
 # later removed; the figure manager provides the same call
 pp.gcf().canvas.manager.set_window_title('Figure 0')
 pp.show()
	#!/usr/local/bin/python3
	#
	# Paul Evans ([email protected])
	# 6-11 February 2020
	# 22 January 2020
	#
	import argparse
	import math
	import matplotlib.pyplot as pp
	import statistics

	def pstdev(data, **kwargs):
	    '''Return the population standard deviation of data.

	    Temporary replacement for statistics.pstdev().

	    data -- iterable of numbers.  It is copied into a list first, so an
	            iterator or generator is accepted as well as a sequence.
	    mu   -- optional keyword argument giving the point the deviations
	            are measured from.  When it is omitted or None the
	            arithmetic mean of data is used.

	    Raises statistics.StatisticsError when data is empty.
	    '''
	    values = list(data)
	    if not values:
	        # without this guard an empty input with mu supplied would fall
	        # through to a division by zero below
	        raise statistics.StatisticsError('pstdev requires at least one data point')
	    mu = kwargs.get('mu') # type check: int, float, or None
	    if mu is None:
	        mu = statistics.mean(values)
	    squared_deviations = sum((value - mu) ** 2 for value in values)
	    return math.sqrt(squared_deviations / len(values))

	# Command-line switches:
	#   -f / --frequency_view   plot the frequencies per 1,000 words; without
	#                           it the plot is drawn in z-score units
	#   -p / --de_pen           use the counts that include de Pen.; without
	#                           it the counts that exclude de Pen. are used
	parser = argparse.ArgumentParser()
	parser.add_argument('-f', '--frequency_view', action='store_true')
	parser.add_argument('-p', '--de_pen', action='store_true')
	args = parser.parse_args()
	# store_true already yields a bool (False when the switch is absent), so
	# the flags are copied straight from the namespace
	de_pen = args.de_pen
	frequency_view = args.frequency_view

	if de_pen:
	    # number of words in R1 and R2 dicta
	    words_r1 = 66238
	    words_r2 = 14811
	    # number of occurrences of 'in' and 'non' in R1 and R2 dicta
	    occurrences_in_r1 = 1682
	    occurrences_in_r2 = 431
	    occurrences_non_r1 = 1622
	    occurrences_non_r2 = 314
	else:
	    # number of words in R1 and R2 dicta
	    words_r1 = 56713
	    words_r2 = 14255
	    # number of occurrences of 'in' and 'non' in R1 and R2 dicta
	    occurrences_in_r1 = 1450
	    occurrences_in_r2 = 411
	    occurrences_non_r1 = 1360
	    occurrences_non_r2 = 306

	# frequency of occurrence of 'in' per 1000 words; the "mean" is the
	# pooled rate: all occurrences in R1 and R2 over all words in R1 and R2
	frequency_in_r1 = (occurrences_in_r1 / words_r1) * 1000
	frequency_in_r2 = (occurrences_in_r2 / words_r2) * 1000
	frequency_in_values = [frequency_in_r1, frequency_in_r2]
	frequency_in_mean = ((occurrences_in_r1 + occurrences_in_r2) / (words_r1 + words_r2)) * 1000
	# the local pstdev() is called here in place of
	# statistics.pstdev(frequency_in_values, mu=frequency_in_mean)
	standard_deviation_in = pstdev(frequency_in_values, mu=frequency_in_mean)
	# signed distance of each rate from the pooled rate, as a percentage of it
	percentage_in_r1 = ((frequency_in_r1 - frequency_in_mean) / frequency_in_mean) * 100
	percentage_in_r2 = ((frequency_in_r2 - frequency_in_mean) / frequency_in_mean) * 100
	if de_pen: print('(including de Pen.)')
	else: print('(excluding de Pen.)')
	string_in = 'occurrences of \'in\' per 1,000 words'
	print(f'{frequency_in_r1:7.4f} {string_in} (R1)')
	print(f'{frequency_in_r2:7.4f} {string_in} (R2)')
	print(f'{frequency_in_mean:7.4f} {string_in} (mean)')
	print(f'{standard_deviation_in:7.4f} {string_in} (standard deviation)')
	# the sign picks the wording, the magnitude is what gets printed
	more_or_less = 'more' if percentage_in_r1 > 0 else 'less'
	print(f'\'in\' occurs {abs(percentage_in_r1):.2f}% {more_or_less} frequently in R1 than in mean')
	more_or_less = 'more' if percentage_in_r2 > 0 else 'less'
	print(f'\'in\' occurs {abs(percentage_in_r2):.2f}% {more_or_less} frequently in R2 than in mean')

	# frequency of occurrence of 'non' per 1000 words, computed the same way
	frequency_non_r1 = (occurrences_non_r1 / words_r1) * 1000
	frequency_non_r2 = (occurrences_non_r2 / words_r2) * 1000
	frequency_non_values = [frequency_non_r1, frequency_non_r2]
	frequency_non_mean = ((occurrences_non_r1 + occurrences_non_r2) / (words_r1 + words_r2)) * 1000
	# the local pstdev() is called here in place of
	# statistics.pstdev(frequency_non_values, mu=frequency_non_mean)
	standard_deviation_non = pstdev(frequency_non_values, mu=frequency_non_mean)
	percentage_non_r1 = ((frequency_non_r1 - frequency_non_mean) / frequency_non_mean) * 100
	percentage_non_r2 = ((frequency_non_r2 - frequency_non_mean) / frequency_non_mean) * 100
	string_non = 'occurrences of \'non\' per 1,000 words'
	print(f'{frequency_non_r1:7.4f} {string_non} (R1)')
	print(f'{frequency_non_r2:7.4f} {string_non} (R2)')
	print(f'{frequency_non_mean:7.4f} {string_non} (mean)')
	print(f'{standard_deviation_non:7.4f} {string_non} (standard deviation)')
	more_or_less = 'more' if percentage_non_r1 > 0 else 'less'
	print(f'\'non\' occurs {abs(percentage_non_r1):.2f}% {more_or_less} frequently in R1 than in mean')
	more_or_less = 'more' if percentage_non_r2 > 0 else 'less'
	print(f'\'non\' occurs {abs(percentage_non_r2):.2f}% {more_or_less} frequently in R2 than in mean')

	# Choose the coordinates to plot ('in' on x, 'non' on y) and set up the
	# axes for the selected view.
	if frequency_view:
	    # frequency view: plot the rates themselves
	    value_in_r1 = frequency_in_r1
	    value_non_r1 = frequency_non_r1
	    value_in_r2 = frequency_in_r2
	    value_non_r2 = frequency_non_r2
	    # window of two standard deviations either side of the mean on each axis
	    pp.axis([frequency_in_mean - 2 * standard_deviation_in,
	        frequency_in_mean + 2 * standard_deviation_in,
	        frequency_non_mean - 2 * standard_deviation_non,
	        frequency_non_mean + 2 * standard_deviation_non])
	    # dashed guide lines at the two means
	    pp.axhline(frequency_non_mean,linestyle='dashed')
	    pp.axvline(frequency_in_mean, linestyle='dashed')
	    # raw strings: '\i' is not a Python escape sequence, so a plain
	    # literal draws an invalid-escape warning; the text handed to
	    # matplotlib is unchanged
	    pp.xlabel(r'frequency of occurrence of $\it{in}$ per 1,000 words')
	    pp.ylabel(r'frequency of occurrence of $\it{non}$ per 1,000 words')
	else: #standard deviation view
	    # each rate as its distance from the mean in standard deviations
	    value_in_r1 = (frequency_in_r1 - frequency_in_mean) / standard_deviation_in
	    value_non_r1 = (frequency_non_r1 - frequency_non_mean) / standard_deviation_non
	    value_in_r2 = (frequency_in_r2 - frequency_in_mean) / standard_deviation_in
	    value_non_r2 = (frequency_non_r2 - frequency_non_mean) / standard_deviation_non
	    pp.axis([-2, 2, -2, 2])
	    pp.axhline(linestyle='dashed')
	    pp.axvline(linestyle='dashed')
	    pp.xticks([-1, 0, 1])
	    pp.yticks([-1, 0, 1])
	    pp.xlabel(r'$\it{in}$')
	    pp.ylabel(r'$\it{non}$', rotation='horizontal')

	# label each point with its name and coordinates, the text centred ten
	# points above the point
	pp.annotate(
	    f'R1 ({value_in_r1:.3f}, {value_non_r1:.3f})',
	    (value_in_r1, value_non_r1),
	    textcoords="offset points",
	    xytext=(0,10),
	    ha='center'
	)
	pp.annotate(
	    f'R2 ({value_in_r2:.3f}, {value_non_r2:.3f})',
	    (value_in_r2, value_non_r2),
	    textcoords="offset points",
	    xytext=(0,10),
	    ha='center'
	)
	in_values = [value_in_r1, value_in_r2]
	non_values = [value_non_r1, value_non_r2]
	pp.scatter(in_values, non_values)
	# build the plot title and the output file name from the two switches
	if frequency_view:
	    title_string = '(frequency view, '
	    filename = './PNGs/Figure_0_frequency_'
	else:
	    title_string = '(z-score view, '
	    filename = './PNGs/Figure_0_z-score_'
	if de_pen:
	    title_string += r'including $\it{de Pen.}$)'
	    filename += 'including_de_Pen.png'
	else:
	    title_string += r'excluding $\it{de Pen.}$)'
	    filename += 'excluding_de_Pen.png'
	pp.title(title_string)
	# NOTE(review): the path is relative to the current directory; presumably
	# ./PNGs has to exist before this runs -- confirm
	pp.savefig(filename)
	# the canvas-level set_window_title() was deprecated in Matplotlib 3.4 and
	# later removed; the figure manager provides the same call
	pp.gcf().canvas.manager.set_window_title('Figure 0')
	pp.show()