Implementation of the Kolmogorov-Smirnov test (with selectable level $\alpha$); returns the decision, the p-value, with an optional verbose mode.
# Last update: October 19, 2014
## Kolmogorov-Smirnov test
# Author: JF Bercher
# This work is licensed under CreativeCommons Attribution-ShareAlike license
# http://creativecommons.org/licenses/by-sa/4.0/
from pylab import *
import numpy as np
def ecdf(v_data):
    """
    Return the empirical cumulative distribution function of v_data as [vx, vf].
    Created on Tue Apr 1 13:00:43 2014
    Adapted and improved from http://www.mathworks.com/matlabcentral/fileexchange/32831-homemade-ecdf/content/homemade_ecdf.m
    Copyright (c) 2011, Mathieu Boutin
    """
    nb_data = np.size(v_data)
    v_sorted_data = np.sort(v_data)        # sort as an array so vector comparisons work
    v_unique_data = np.unique(v_data)
    nb_unique_data = np.size(v_unique_data)
    # vx holds the jump abscissas (the first point is repeated so the ecdf starts at 0),
    # vf holds the corresponding cumulative frequencies
    vx = np.zeros(nb_unique_data + 1)
    vx[0] = v_unique_data[0]
    vx[1:] = v_unique_data
    vf = np.zeros(nb_unique_data + 1)
    for index in range(nb_unique_data):
        current_data = v_unique_data[index]
        vf[index + 1] = np.sum(v_sorted_data <= current_data) / nb_data
    return [vx, vf]
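# Quick sanity check (illustrative sketch, not part of the original gist): for any
# sample, the returned empirical cdf starts at 0 and ends at 1, and for a uniform
# sample on [0, 1] it should stay close to the identity function.
if __name__ == "__main__":
    x = rand(200)                      # uniform sample; rand comes from the pylab import
    vx, vf = ecdf(x)
    print("ecdf range: [{0:.2f}, {1:.2f}]".format(vf[0], vf[-1]))   # expected: [0.00, 1.00]
    print("max deviation from identity: {0:.3f}".format(np.max(np.abs(vf - vx))))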
#
def ks(data, distrib, alpha=0.05, doplot=False, verbose=False, lang='eng'):
    """
    Compute the Kolmogorov-Smirnov test at significance level alpha
    .. math:: Pr[Ktest>Kth|H0]=\\alpha
    (the probability of rejecting H0 when H0 is true is at most alpha).
    Returns the test statistic, the threshold at level alpha, and a p-value
    .. math:: p=Pr[x>Ktest|H0]
    (the probability that, under the null hypothesis, the test statistic
    takes a value equal to or larger than the observed one).
    Author: jfb, 05/2014; last update: 09/2014

    Parameters
    ----------
    data : array_like
        raw data for the statistical test
    distrib : object
        an object representing the hypothesized distribution,
        with a cdf() method (e.g. scipy.stats.norm)
    alpha : float, < 1, default 0.05
        significance level of the test
    doplot : bool, default False
        plot the empirical and hypothesized cdfs
    verbose : bool, default False
        print some information
    lang : str
        language of the printed messages, 'eng' (default) or 'fr' (French)

    Returns
    -------
    Ktest : float
        value of the test statistic
    Kth : float
        threshold of the test at level alpha
    p : float
        p-value

    Refs
    ----
    - http://mistis.inrialpes.fr/software/SMEL/cours/ts/node7.html
    - http://www.iecn.u-nancy.fr/~pincon/scilab/Doc/node94.html

    Example
    -------
    >>> from scipy import stats
    >>> x = randn(1000); ks(x, stats.norm, alpha=0.05, verbose=True, doplot=True)
    Test value: 0.4691
    Theoretical value of the test for a tolerance at  5 %: 1.2186
    Hypothesis accepted
    p-value: 0.9803539070042061
    (0.4691265699087695, 1.218602952573461, 0.9803539070042061)
    """
    n = np.size(data)
    vx, empF = ecdf(data)
    F = distrib.cdf(vx)                 # hypothesized cdf at the jump points
    if doplot:
        figure()
        plot(vx, F, vx, empF)
    # Distances between the hypothesized and empirical cdfs, just after (D+)
    # and just before (D-) each jump of the empirical cdf
    Km_plus = abs(F - empF)
    Km_minus = abs(F[1:] - empF[:-1])
    Ktest = sqrt(n) * np.maximum(np.max(Km_plus), np.max(Km_minus))   # scaled statistic sqrt(n)*Dn
    Kth = sqrt(0.5 * log(1 / alpha)) - 1 / (6 * sqrt(n))              # approximate threshold at level alpha
    t = Ktest
    if verbose:
        print('-' * 60)
        if lang == 'fr':
            print("Valeur du test : {0:2.4f}".format(Ktest))
            print("Valeur du seuil pour une tolérance à {1:2d} %: {0:2.4f}".format(Kth, int(alpha * 100)))
        else:
            print("Test value: {0:2.4f}".format(Ktest))
            print("Theoretical value of the test for a tolerance at {1:2d} %: {0:2.4f}".format(Kth, int(alpha * 100)))
        if Ktest > Kth:
            if lang == 'fr': print("Hypothèse rejetée")
            else: print("Hypothesis rejected")
        else:
            if lang == 'fr': print("Hypothèse acceptée")
            else: print("Hypothesis accepted")
    # Asymptotic p-value, truncated Kolmogorov series:
    # p = 2 * sum_{k>=1} (-1)**(k+1) * exp(-2 * k**2 * t**2)
    p = 0
    for k in range(1, 10):
        p = p + 2 * (-1) ** (k + 1) * exp(-2 * k ** 2 * t ** 2)
    if verbose: print("p-value: {}".format(p))
    return Ktest, Kth, p
#
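# Usage sketch (illustrative, not part of the original gist). It assumes scipy is
# available and cross-checks ks() against scipy.stats.kstest: scipy reports the
# unscaled statistic D_n, whereas ks() returns approximately sqrt(n)*D_n, so the
# two statistics differ by the factor sqrt(n) while the p-values should be close.
if __name__ == "__main__":
    from scipy import stats
    x = randn(1000)
    Ktest, Kth, p = ks(x, stats.norm, alpha=0.05, verbose=True)
    D, pval = stats.kstest(x, 'norm')   # scipy's version of the same test
    print("ks()              : sqrt(n)*D_n = {0:.4f}, p-value = {1:.4f}".format(Ktest, p))
    print("scipy.stats.kstest: D_n = {0:.4f}, sqrt(n)*D_n = {1:.4f}, p-value = {2:.4f}".format(
        D, sqrt(len(x)) * D, pval))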
t=""" | |
the p-value is the probability that, if the unknown distribution were were | |
really the hypothesized one (U(0,1) in your example), the K-S statistic would | |
take on a value as large or larger than the value it actually did take on. | |
The K-S statistic is a measure of the difference between the empirical CDF of | |
your data, and the CDF of the distribution that you are testing against. | |
The p-value is _not_ the probability of the null hypothesis being true or | |
false. In fact, under the usual hypothesis testing model, there is _no such | |
probability_ because the truth of hypothesis is not a random quantity. | |
However, the p-value is often interpreted as a measure of plausibility of the | |
null hypothesis, i.e., small values shed doubt on H0. On the other hand, | |
large values do not confirm H0, they merely fail to demonstrate evidence | |
against it. Peter Perkins, http://www.mathworks.com/matlabcentral/newsreader/view_thread/48698 | |
""" |