Last active
May 29, 2017 13:08
-
-
Save numpde/832167bf22709305ef05f2bc5d80ba54 to your computer and use it in GitHub Desktop.
Histograms for the number of types V(N) within the first N tokens
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# Histograms for the number of types V(N) within the first N tokens. | |
# Comparison of natives vs learners. | |
# R. Andreev, 2017-05-11 (first version), CC BY 4.0 | |
# Designed for the ANGLISH corpus [Tortel 2008, via N. Ballier & P. Lisson] | |
# The texts are expected to be located in ./ANGLISH/*.txt | |
import os, sys, argparse | |
import matplotlib.pyplot as plt | |
import matplotlib.mlab as mlab | |
import numpy as np | |
from scipy import stats | |
from glob import glob | |
from random import shuffle, randrange, choice | |
from itertools import accumulate | |
# This function analyses a text file "filename", | |
# returning a pair (S, R), where | |
# S[n] is the n-th token, | |
# R[n] is 1 if the token appears for the first time. | |
# Thus | |
# n -> (n, accumulate(R)[n]) | |
# is the vocabulary growth curve. | |
def vgc(filename) :
    """Analyse the text file `filename` for vocabulary growth.

    Returns a pair (S, R) where
        S[n] is the n-th token of the text,
        R[n] is 1 if that token appears for the first time, else 0.
    Thus n -> (n, accumulate(R)[n]) is the vocabulary growth curve.
    """
    # Collate lines, separating them by a space
    with open(filename, 'r') as f :
        S = ' '.join(f.readlines())
    # Remove non-text: keep only alphanumeric characters and spaces
    S = "".join(c for c in S if (c.isalnum() or (c == ' ')))
    # Split into a list of words
    S = S.split()
    R = []
    seen = set()  # O(1) membership test instead of an O(n) list scan
    for w in S :
        # First occurrence? (The original `w in S[0:n-1]` had an
        # off-by-one: it ignored the immediately preceding token, so
        # a directly repeated word was miscounted as new.)
        R.append(int(w not in seen))
        seen.add(w)
    return (S, R)
# Compute the mean and the std dev of a list L | |
def mean_std(L) :
    """Return (mean, sample standard deviation) of the sequence L.

    The standard deviation uses ddof=1 (unbiased, divides by len(L)-1).
    """
    a = np.asarray(L)
    return (a.mean(), a.std(ddof=1))
def main() :
    """Compare vocabulary growth V(N) of learner (FR) vs native (GB) texts.

    Reads the ANGLISH corpus from ./ANGLISH/*.txt, computes the number of
    distinct types V(N) within the first N tokens of each text, runs a
    two-sample KS test between the groups, and saves/show histograms.
    """
    # FIRST SET OF TEXTS (French learners)
    FR = []
    for f in glob("./ANGLISH/F*FR*.txt") :
        (_, R) = vgc(f)
        FR.append(list(accumulate(R)))
    # SECOND SET OF TEXTS (GB natives, two filename patterns)
    GB = []
    for pattern in ("./ANGLISH/F*GB*.txt", "./ANGLISH/H*GB*.txt") :
        for f in glob(pattern) :
            (_, R) = vgc(f)
            GB.append(list(accumulate(R)))
    # TEXT CUT-OFF LENGTH (TOKENS)
    for N in [50, 100, 150, 200, 250, 300] :
        # V(N) for each text of length >= N tokens
        frN = [a[N-1] for a in FR if (len(a) >= N)]
        gbN = [a[N-1] for a in GB if (len(a) >= N)]
        print("N = {}".format(N))
        print("V(N) for FR:", frN)
        print("V(N) for GB:", gbN)
        # Two-sample Kolmogorov-Smirnov test of the two V(N) samples
        # https://stats.stackexchange.com/questions/13326/
        print(stats.ks_2samp(frN, gbN))
        bins = np.linspace(30, 170, 20)
        xx = np.linspace(min(bins), max(bins), 1000)
        # `density=True` replaces the long-deprecated (and removed)
        # `normed=1` histogram option
        plt.hist(frN, bins, alpha=0.3, color="red", label="Learner", density=True)
        (m, s) = mean_std(frN)
        # scipy.stats.norm.pdf replaces matplotlib.mlab.normpdf,
        # which was removed from matplotlib
        plt.plot(xx, stats.norm.pdf(xx, m, s), color="red")
        plt.hist(gbN, bins, alpha=0.3, color="blue", label="Native", density=True)
        (m, s) = mean_std(gbN)
        plt.plot(xx, stats.norm.pdf(xx, m, s), color="blue")
        plt.legend(loc='upper right')
        plt.title("V(N = {}); # FR texts = {}, # GB texts = {}".format(N, len(frN), len(gbN)))
        plt.xlim([min(bins), max(bins)])
        #plt.ylim([0, 1])
        f = "hist_N={}.png".format(N)
        plt.savefig(f, bbox_inches='tight')
        plt.show()
        print(" ")
    return
# Script entry point
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment