Last active
May 29, 2017 13:08
-
-
Save numpde/832167bf22709305ef05f2bc5d80ba54 to your computer and use it in GitHub Desktop.
Histograms for the number of types V(N) within the first N tokens
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# Histograms for the number of types V(N) within the first N tokens. | |
# Comparison of natives vs learners. | |
# R. Andreev, 2017-05-11 (first version), CC BY 4.0 | |
# Designed for the ANGLISH corpus [Tortel 2008, via N. Ballier & P. Lisson] | |
# The texts are expected to be located in ./ANGLISH/*.txt | |
import os, sys, argparse | |
import matplotlib.pyplot as plt | |
import matplotlib.mlab as mlab | |
import numpy as np | |
from scipy import stats | |
from glob import glob | |
from random import shuffle, randrange, choice | |
from itertools import accumulate | |
# This function analyses a text file "filename", | |
# returning a pair (S, R), where | |
# S[n] is the n-th token, | |
# R[n] is 1 if the token appears for the first time. | |
# Thus | |
# n -> (n, accumulate(R)[n]) | |
# is the vocabulary growth curve. | |
def vgc(filename) :
    """Analyse the text file `filename` for vocabulary growth.

    Returns a pair (S, R) where
        S[n] is the n-th token of the text,
        R[n] is 1 if that token appears for the first time, else 0.
    Thus n -> (n, accumulate(R)[n]) is the vocabulary growth curve.
    """
    # Collate lines, separating them by a space
    with open(filename, 'r') as f :
        S = ' '.join(f.readlines())
    # Remove non-text: keep only alphanumeric characters and spaces
    S = "".join(c for c in S if (c.isalnum() or (c == ' ')))
    # Split into a list of words
    S = S.split()
    R = []
    seen = set()  # O(1) membership test instead of an O(n) list scan
    for w in S :
        # First occurrence? (The original `w in S[0:n-1]` had an
        # off-by-one: it ignored the immediately preceding token, so
        # a directly repeated word was miscounted as new.)
        R.append(int(w not in seen))
        seen.add(w)
    return (S, R)
# Compute the mean and the std dev of a list L | |
def mean_std(L) :
    """Return (mean, sample standard deviation) of the sequence L.

    The standard deviation uses ddof=1 (unbiased, divides by len(L)-1).
    """
    a = np.asarray(L)
    return (a.mean(), a.std(ddof=1))
def main() :
    """Compare vocabulary growth V(N) of learner (FR) vs native (GB) texts.

    Reads the ANGLISH corpus from ./ANGLISH/*.txt, computes the number of
    distinct types V(N) within the first N tokens of each text, runs a
    two-sample KS test between the groups, and saves/show histograms.
    """
    # FIRST SET OF TEXTS (French learners)
    FR = []
    for f in glob("./ANGLISH/F*FR*.txt") :
        (_, R) = vgc(f)
        FR.append(list(accumulate(R)))
    # SECOND SET OF TEXTS (GB natives, two filename patterns)
    GB = []
    for pattern in ("./ANGLISH/F*GB*.txt", "./ANGLISH/H*GB*.txt") :
        for f in glob(pattern) :
            (_, R) = vgc(f)
            GB.append(list(accumulate(R)))
    # TEXT CUT-OFF LENGTH (TOKENS)
    for N in [50, 100, 150, 200, 250, 300] :
        # V(N) for each text of length >= N tokens
        frN = [a[N-1] for a in FR if (len(a) >= N)]
        gbN = [a[N-1] for a in GB if (len(a) >= N)]
        print("N = {}".format(N))
        print("V(N) for FR:", frN)
        print("V(N) for GB:", gbN)
        # Two-sample Kolmogorov-Smirnov test of the two V(N) samples
        # https://stats.stackexchange.com/questions/13326/
        print(stats.ks_2samp(frN, gbN))
        bins = np.linspace(30, 170, 20)
        xx = np.linspace(min(bins), max(bins), 1000)
        # `density=True` replaces the long-deprecated (and removed)
        # `normed=1` histogram option
        plt.hist(frN, bins, alpha=0.3, color="red", label="Learner", density=True)
        (m, s) = mean_std(frN)
        # scipy.stats.norm.pdf replaces matplotlib.mlab.normpdf,
        # which was removed from matplotlib
        plt.plot(xx, stats.norm.pdf(xx, m, s), color="red")
        plt.hist(gbN, bins, alpha=0.3, color="blue", label="Native", density=True)
        (m, s) = mean_std(gbN)
        plt.plot(xx, stats.norm.pdf(xx, m, s), color="blue")
        plt.legend(loc='upper right')
        plt.title("V(N = {}); # FR texts = {}, # GB texts = {}".format(N, len(frN), len(gbN)))
        plt.xlim([min(bins), max(bins)])
        #plt.ylim([0, 1])
        f = "hist_N={}.png".format(N)
        plt.savefig(f, bbox_inches='tight')
        plt.show()
        print(" ")
    return
# Script entry point
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment