xeoncross · October 16, 2012 16:38
diff --git a/readme.md b/readme.md
diff --git a/anova_data.txt b/anova_data.txt
 model	mpg	cyl!	disp	hp	drat	wt	qsec	vs	am!	gear!	carb!
 Mazda RX4	21	6	160	110	3.9	2.62	16.46	0	1	4	4
 Mazda RX4 Wag	21	6	160	110	3.9	2.875	17.02	0	1	4	4
 Datsun 710	22.8	4	108	93	3.85	2.32	18.61	1	1	4	1
 Hornet 4 Drive	21.4	6	258	110	3.08	3.215	19.44	1	0	3	1
 Hornet Sportabout	18.7	8	360	175	3.15	3.44	17.02	0	0	3	2
 Valiant	18.1	6	225	105	2.76	3.46	20.22	1	0	3	1
 Duster 360	14.3	8	360	245	3.21	3.57	15.84	0	0	3	4
 Merc 240D	24.4	4	146.7	62	3.69	3.19	20	1	0	4	2
 Merc 230	22.8	4	140.8	95	3.92	3.15	22.9	1	0	4	2
 Merc 280	19.2	6	167.6	123	3.92	3.44	18.3	1	0	4	4
 Merc 280C	17.8	6	167.6	123	3.92	3.44	18.9	1	0	4	4
 Merc 450SE	16.4	8	275.8	180	3.07	4.07	17.4	0	0	3	3
 Merc 450SL	17.3	8	275.8	180	3.07	3.73	17.6	0	0	3	3
 Merc 450SLC	15.2	8	275.8	180	3.07	3.78	18	0	0	3	3
 Cadillac Fleetwood	10.4	8	472	205	2.93	5.25	17.98	0	0	3	4
 Lincoln Continental	10.4	8	460	215	3	5.424	17.82	0	0	3	4
 Chrysler Imperial	14.7	8	440	230	3.23	5.345	17.42	0	0	3	4
 Fiat 128	32.4	4	78.7	66	4.08	2.2	19.47	1	1	4	1
 Honda Civic	30.4	4	75.7	52	4.93	1.615	18.52	1	1	4	2
 Toyota Corolla	33.9	4	71.1	65	4.22	1.835	19.9	1	1	4	1
 Toyota Corona	21.5	4	120.1	97	3.7	2.465	20.01	1	0	3	1
 Dodge Challenger	15.5	8	318	150	2.76	3.52	16.87	0	0	3	2
 AMC Javelin	15.2	8	304	150	3.15	3.435	17.3	0	0	3	2
 Camaro Z28	13.3	8	350	245	3.73	3.84	15.41	0	0	3	4
 Pontiac Firebird	19.2	8	400	175	3.08	3.845	17.05	0	0	3	2
 Fiat X1-9	27.3	4	79	66	4.08	1.935	18.9	1	1	4	1
 Porsche 914-2	26	4	120.3	91	4.43	2.14	16.7	0	1	5	2
 Lotus Europa	30.4	4	95.1	113	3.77	1.513	16.9	1	1	5	2
 Ford Pantera L	15.8	8	351	264	4.22	3.17	14.5	0	1	5	4
 Ferrari Dino	19.7	6	145	175	3.62	2.77	15.5	0	1	5	6
 Maserati Bora	15	8	301	335	3.54	3.57	14.6	0	1	5	8
 Volvo 142E	21.4	4	121	109	4.11	2.78	18.6	1	1	4	2
diff --git a/data.txt b/data.txt
 model	mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb
 Mazda RX4	21	6	160	110	3.9	2.62	16.46	0	1	4	4
 Mazda RX4 Wag	21	6	160	110	3.9	2.875	17.02	0	1	4	4
 Datsun 710	22.8	4	108	93	3.85	2.32	18.61	1	1	4	1
 Hornet 4 Drive	21.4	6	258	110	3.08	3.215	19.44	1	0	3	1
 Hornet Sportabout	18.7	8	360	175	3.15	3.44	17.02	0	0	3	2
 Valiant	18.1	6	225	105	2.76	3.46	20.22	1	0	3	1
 Duster 360	14.3	8	360	245	3.21	3.57	15.84	0	0	3	4
 Merc 240D	24.4	4	146.7	62	3.69	3.19	20	1	0	4	2
 Merc 230	22.8	4	140.8	95	3.92	3.15	22.9	1	0	4	2
 Merc 280	19.2	6	167.6	123	3.92	3.44	18.3	1	0	4	4
 Merc 280C	17.8	6	167.6	123	3.92	3.44	18.9	1	0	4	4
 Merc 450SE	16.4	8	275.8	180	3.07	4.07	17.4	0	0	3	3
 Merc 450SL	17.3	8	275.8	180	3.07	3.73	17.6	0	0	3	3
 Merc 450SLC	15.2	8	275.8	180	3.07	3.78	18	0	0	3	3
 Cadillac Fleetwood	10.4	8	472	205	2.93	5.25	17.98	0	0	3	4
 Lincoln Continental	10.4	8	460	215	3	5.424	17.82	0	0	3	4
 Chrysler Imperial	14.7	8	440	230	3.23	5.345	17.42	0	0	3	4
 Fiat 128	32.4	4	78.7	66	4.08	2.2	19.47	1	1	4	1
 Honda Civic	30.4	4	75.7	52	4.93	1.615	18.52	1	1	4	2
 Toyota Corolla	33.9	4	71.1	65	4.22	1.835	19.9	1	1	4	1
 Toyota Corona	21.5	4	120.1	97	3.7	2.465	20.01	1	0	3	1
 Dodge Challenger	15.5	8	318	150	2.76	3.52	16.87	0	0	3	2
 AMC Javelin	15.2	8	304	150	3.15	3.435	17.3	0	0	3	2
 Camaro Z28	13.3	8	350	245	3.73	3.84	15.41	0	0	3	4
 Pontiac Firebird	19.2	8	400	175	3.08	3.845	17.05	0	0	3	2
 Fiat X1-9	27.3	4	79	66	4.08	1.935	18.9	1	1	4	1
 Porsche 914-2	26	4	120.3	91	4.43	2.14	16.7	0	1	5	2
 Lotus Europa	30.4	4	95.1	113	3.77	1.513	16.9	1	1	5	2
 Ford Pantera L	15.8	8	351	264	4.22	3.17	14.5	0	1	5	4
 Ferrari Dino	19.7	6	145	175	3.62	2.77	15.5	0	1	5	6
 Maserati Bora	15	8	301	335	3.54	3.57	14.6	0	1	5	8
 Volvo 142E	21.4	4	121	109	4.11	2.78	18.6	1	1	4	2
diff --git a/example.php b/example.php
 <?php  

 $sep = "\t"; $nl  = "\n";

 $content = file_get_contents('data.txt'); 

 $records = explode($nl, $content); 
 $header  = explode($sep, trim(array_shift($records))); 
 $data    = array_fill_keys($header, array()); 

 foreach ($records as $id=>$record) { 
    $record = trim($record); 
    $fields = explode($sep, $record); 
    $titles = $header; 
     
    foreach ($fields as $field) { 
        $title = array_shift($titles); 
        $data[$title][] = $field; 
    } 
 } 

 $x = $data['wt']; 
 $y = $data['mpg']; 

 require('kashi.php'); 

 // $kashi = new Kashi($dbname, $dbuser, $dbpass, $dbhost); 
 $kashi = new Kashi('test', 'root', '', 'localhost'); 

 /**
 * Summary Statistics:
 */

 // $x is an array of values 
 echo 'Mean: ' . $kashi->mean($x) . '<br />'; 

 // It will use previous data set if you don't provide one as an argument to the method 
 echo 'Mode: '     . print_r($kashi->mode()) . '<br />'; 
 echo 'Median: '   . $kashi->median()   . '<br />'; 
 echo 'Variance: ' . $kashi->variance() . '<br />'; 
 echo 'SD: '       . $kashi->sd()       . '<br />'; 
 echo '%CV: '      . $kashi->cv()       . '<br />'; 
 echo 'Skewness: ' . $kashi->skew()     . '<br />'; 
 echo 'Kurtosis: ' . $kashi->kurt()     . '<br />'; 

 /**
 * Correlation, Regression, and t-Test:
 */
 echo 'Covariance: '  . $kashi->cov($x, $y) . '<br />'; 
 echo 'Correlation: ' . $kashi->cor($x, $y) . '<br />'; 

 $r = $kashi->cor($x, $y); 
 $n = count($x); 
 echo 'Significant of Correlation: ' . $kashi->corTest($r, $n) . '<br />'; 

 echo 'Regression: ' . print_r($kashi->lm($x, $y), true) . '<br />'; 

 echo 't-Test unpaired: ' . $kashi->tTest($x, $y, false) . '<br />'; 
 echo 'Test: ' . $kashi->tDist($kashi->tTest($x, $y, false), (count($x)-1)*(count($y)-1)) . '<br />'; 

 echo 't-Test paired: ' . $kashi->tTest($x, $y, true) . '<br />'; 
 echo 'Test: ' . $kashi->tDist($kashi->tTest($x, $y, true), count($x)-1) . '<br />'; 

 /**
 * Distributions:
 */
 echo 'Normal distribution (x=0.5, mean=0, sd=1): '  . $kashi->norm(0.5, 0, 1) . '<br />'; 

 echo 'Probability for the Student t-distribution (t=3, n=10) one-tailed: ';  
 echo $kashi->tDist(3, 10, 1) . '<br />'; 

 echo 'Probability for the Student t-distribution (t=3, n=10) two-tailed: ';  
 echo $kashi->tDist(3, 10, 2) . '<br />'; 

 echo 'F probability distribution (f=2, df1=12, df2=15): '  . $kashi->fDist(2, 12, 15) . '<br />'; 

 echo 'Inverse of the standard normal cumulative distribution (p=0.95): '; 
 echo $kashi->inverseNormCDF(0.95) . '<br />';

 echo 't-value of the Student\'s t-distribution (p=0.05, n=29): '; 
 echo $kashi->inverseTCDF(0.05, 29) . '<br />';

 /**
 * Chi-square test or Contingency tables (A/B testing):
 */
 $table['Automatic'] = array('4 Cylinders' => 3, '6 Cylinders' => 4, '8 Cylinders' => 12);
 $table['Manual']    = array('4 Cylinders' => 8, '6 Cylinders' => 3, '8 Cylinders' => 2);

 $result = $kashi->chiTest($table);

 $probability = $kashi->chiDist($result['chi'], $result['df']); 
 echo 'Chi-square test probability: ' . $probability . '<br />'; 

 /**
 * Diversity index:
 */
 $gear = array('3' => 15, '4' => 12, '5' => 5); 
 $cyl  = array('4' => 11, '6' => 7, '8' => 14); 

 echo 'Shannon index for gear: ' . $kashi->diversity($gear) . '<br />'; 
 echo 'Shannon index for cyl: ' . $kashi->diversity($cyl) . '<br />'; 

 /**
 * ANOVA:
 */
 require('kashi_anova.php');

 // $obj = new KashiANOVA($dbname, $dbuser, $dbpass, $dbhost);
 $obj = new KashiANOVA('test', 'root', '', 'localhost');

 $str = file_get_contents('anova_data.txt');
 $obj->loadString($str); 

 // mpg ~ cyl
 $result = $obj->anova('cyl', 'mpg');
 echo '<hr />Analysis of variance (ANOVA): mpg ~ cyl<pre>';
 print_r($result);
 echo '</pre>';
diff --git a/kashi.php b/kashi.php
 <?php

 /**
 * ----------------------------------------------------------------------
 *
 * Copyright (c) 2012 Khaled Al-Shamaa.
 *
 * http://www.ar-php.org
 *
 * PHP Version 5
 *
 * ----------------------------------------------------------------------
 * @desc      We aim in Al-Kashi project to provide a rich package full of statistical
 *            functions useful for online business intelligent and data mining, possible
 *            applications may include an online log file analysis, Ad's and Campaign
 *            statistics, or survey/voting results on-fly analysis.
 *
 * @category  Math
 * @package   Kashi
 * @author    Khaled Al-Shamaa <[email protected]>
 * @copyright 2012 Khaled Al-Shamaa
 *
 * @license   GPL <http://www.gnu.org/licenses/gpl.txt>
 * @version   1.0.0 released in March 5, 2012
 * @link      http://www.ar-php.org/stats/al-kashi
 */

 class Kashi
 {
    private $values = array();
    private $dataset = array();

    public function __construct()
    {
    }

    public function __destruct()
    {
    }

    /**
     * Compute the arithmetic mean, and is calculated by adding a group of numbers
     * and then dividing by the count of those numbers.
     * For example, the mean of 2, 3, 3, 5, 7, and 10 is 30 divided by 6, which is 5.
     *
     * @param array List of float values for which you want to calculate the mean.
     *
     * @return float Arithmetic mean
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function mean()
    {
        if (func_num_args() > 0) {
            $this->values = func_get_arg(0);
        }

        $total = 0;

        foreach ($this->values as $value) {
            $total += $value;
        }

        return $total/count($this->values);
    }

    /**
     * Compute the sample median which is the middle number of a group of numbers; that is,
     * half the numbers have values that are greater than the median, and half the numbers
     * have values that are less than the median.
     * For example, the median of 2, 3, 3, 5, 7, and 10 is 4.
     *
     * @param array List of float values for which you want to calculate the median.
     *
     * @return float Sample median
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function median()
    {
        if (func_num_args() > 0) {
            $this->values = func_get_arg(0);
        }

        $list = $this->values;

        sort($list);

        $count = count($list);

        if ($count % 2 == 0) {
            $median = ($list[($count / 2) - 1] + $list[$count / 2]) / 2;
        } else {
            $median = $list[floor($count / 2)];
        }

        return $median;
    }

    /**
     * Compute the mode which is the most frequently occurring number in a group of numbers.
     * For example, the mode of 2, 3, 3, 5, 7, and 10 is 3.
     *
     * @param array List of float values for which you want to calculate the mode.
     *
     * @return float Returns the most frequently occurring or repetitive value
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function mode()
    {
        if (func_num_args() > 0) {
            $this->values = func_get_arg(0);
        }

        $counter = array();

        foreach ($this->values as $value) {
            if (isset($counter[$value])) {
                $counter[$value]++;
            } else {
                $counter[$value] = 1;
            }
        }

        return array_keys($counter, max($counter));
    }

    /**
     * Estimates variance based on a sample
     *
     * @param array List of float values corresponding to a sample of a population.
     *
     * @return float Returns the sample variance
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function variance()
    {
        if (func_num_args() > 0) {
            $this->values = func_get_arg(0);
        }

        $mean = $this->mean();

        $var = 0;

        foreach ($this->values as $value) {
            $var += ($value - $mean) * ($value - $mean);
        }

        $var = $var / (count($this->values) - 1);

        return $var;
    }

    /**
     * Compute the standard deviation based on a sample. The standard deviation is a measure of
     * how widely values are dispersed from the average value (the mean).
     *
     * @param array List of float values for which you want to calculate the standard deviation.
     *
     * @return float Returns the standard deviation
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function sd()
    {
        if (func_num_args() > 0) {
            $this->values = func_get_arg(0);
        }

        return sqrt($this->variance());
    }

    /**
     * Compute the skewness of a distribution. Skewness characterizes the degree of asymmetry of
     * a distribution around its mean. Positive skewness indicates a distribution with an asymmetric
     * tail extending toward more positive values. Negative skewness indicates a distribution with
     * an asymmetric tail extending toward more negative values.
     *
     * @param array List of float values for which you want to calculate the skewness.
     *
     * @return float Returns the skewness of a distribution
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function skew()
    {
        if (func_num_args() > 0) {
            $this->values = func_get_arg(0);
        }

        $mean = $this->mean();
        $sd   = $this->sd();
        $n    = count($this->values);

        $skew = 0;

        foreach ($this->values as $value) {
            $skew += pow(($value - $mean) / $sd, 3);
        }

        $skew = ($skew * $n) / (($n - 1) * ($n - 2));

        // skew standard error = sqrt(6 / $n)
        // skew is significant if its value exceed 2 * sqrt(6 / $n)
        return $skew;
    }

    /**
     * Compute the kurtosis of a distribution. Kurtosis characterizes the relative peakedness or
     * flatness of a distribution compared with the normal distribution. Positive kurtosis indicates
     * a relatively peaked distribution. Negative kurtosis indicates a relatively flat distribution.
     *
     * @param array List of float values for which you want to calculate the kurtosis.
     *
     * @return float Returns the kurtosis of a distribution
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function kurt()
    {
        if (func_num_args() > 0) {
            $this->values = func_get_arg(0);
        }

        $mean = $this->mean();
        $sd   = $this->sd();
        $n    = count($this->values);

        $kurt = 0;

        foreach ($this->values as $value) {
            $kurt += pow(($value - $mean) / $sd, 4);
        }

        $kurt = ($kurt * $n * ($n + 1)) / (($n - 1) * ($n - 2) * ($n - 3));
        $kurt = $kurt - ((3 * ($n - 1) * ($n - 1)) / (($n - 2) * ($n - 3)));

        // kurt standard error = sqrt(24 / $n)
        // kurt is significant if its value exceed 2 * sqrt(24 / $n)
        return $kurt;
    }

    /**
     * Compute the coefficients of variation
     *
     * @param array List of float values for which you want to calculate the coefficients of variation.
     *
     * @return float Returns the coefficients of variation
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function cv()
    {
        if (func_num_args() > 0) {
            $this->values = func_get_arg(0);
        }

        $mean = $this->mean();
        $sd   = $this->sd();

        $cv = ($sd / $mean) * 100;

        return $cv;
    }

    /**
     * Compute the covariance, the average of the products of deviations for each data point pair.
     * Use covariance to determine the relationship between two data sets. For example, you can
     * examine whether greater income accompanies greater levels of education.
     *
     * @param array $x First list of float values
     * @param array $y Second list of float values
     *
     * @return float Returns the covariance
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function cov($x, $y)
    {
        $meanX = $this->mean($x);
        $meanY = $this->mean($y);

        $count = count($x);
        $total = 0;

        for ($i=0; $i<$count; $i++) {
            $total += ($x[$i] - $meanX) * ($y[$i] - $meanY);
        }

        return (1 / ($count - 1)) * $total;
    }

    /**
     * Compute the correlation coefficient. Use the correlation coefficient to determine the
     * relationship between two properties. It uses different measures of association, all
     * in the range [-1, 1] with 0 indicating no association.
     *
     * @param array $x First list of float values
     * @param array $y Second list of float values
     *
     * @return float Returns the correlation coefficient
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function cor($x, $y)
    {
        $cov = $this->cov($x, $y);
        $sdX = $this->sd($x);
        $sdY = $this->sd($y);

        return $cov / ($sdX * $sdY);
    }

    /**
     * Test of the null hypothesis that true correlation is equal to 0
     *
     * @param array $r Correlation value
     * @param array $n Number of observations
     *
     * @return float Returns null hypothesis probability: true correlation is equal to 0
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function corTest($r, $n)
    {
        $t = $r / sqrt((1 - ($r * $r)) / ($n - 2));

        $result = $this->tDist($t, $n - 2);

        return $result;
    }

    /**
     * Compute the simple linear regression fits a linear model to represent
     * the relationship between a response (or y-) variate, and an explanatory
     * (or x-) variate.
     *
     * @param array $x Specifies the name of the explanatory (or x-) variate.
     * @param array $y Specifies the name of the response (or y-) variate.
     *
     * @return array Returns [intercept], [sploe], and [r-square] as float values
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function lm($x, $y)
    {
        $meanX = $this->mean($x);
        $meanY = $this->mean($y);

        $n     = count($x);

        $nominator   = 0;
        $denominator = 0;

        for ($i=0; $i<$n; $i++) {
            $nominator   += ($x[$i] - $meanX) * ($y[$i] - $meanY);
            $denominator += ($x[$i] - $meanX) * ($x[$i] - $meanX);
        }

        $b = $nominator / $denominator;
        $a = $meanY - $b * $meanX;

        // Total sum of squares (ss) and Residual sum of squares (rss)
        $ss  = 0;
        $rss = 0;

        for ($i=0; $i<$n; $i++) {
            $ss  += pow($y[$i] - $meanY, 2);

            $est  = $a + ($b * $x[$i]);
            $rss += pow($y[$i] - $est, 2);
        }

        // R-square value
        $r2 = 1 - ($rss / $ss);

        return array('intercept'=>$a, 'slope'=>$b, 'r-square'=>$r2);
    }

    /**
     * Compute the Student's t-Test value to determine whether two samples are likely
     * to have come from the same two underlying populations that have the same mean
     *
     * @param array   $a      First list of float values
     * @param array   $b      Second list of float values
     * @param boolean $paired Logical indicating whether you want a paired t-test
     *
     * @return float Returns the associated with a Student's t-Test
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function tTest ($a, $b, $paired=false)
    {
        if ($paired == true) {
            $count = count($a);

            $diff = array();

            for ($i=0; $i<$count; $i++) {
                $diff[$i] = $a[$i] - $b[$i];
            }

            $mean = $this->mean($diff);
            $var  = $this->variance($diff);

            $t = $mean / sqrt($var / $count);
        } else {
            $meanA = $this->mean($a);
            $meanB = $this->mean($b);

            $varA = $this->variance($a);
            $varB = $this->variance($b);

            $countA = count($a);
            $countB = count($b);

            $t = ($meanA - $meanB) / sqrt(($varA / $countA) + ($varB / $countB));
        }

        return $t;
    }

    /**
     * Returns the standard normal cumulative distribution function. The distribution
     * has a mean of 0 (zero) and a standard deviation of one. Use this function in
     * place of a table of standard normal curve areas.
     *
     * @param float $x    Is the value for which you want the distribution.
     * @param float $mean The distribution mean (default is zero).
     * @param float $sd   The distribution standard deviation (default is one).
     *
     * @return float the standard normal cumulative distribution function.
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function norm ($x, $mean=0, $sd=1)
    {
        $y = (1 / sqrt(2 * pi())) * exp(-0.5 * pow($x, 2));

        return $y;
    }

    private function _zip ($q, $i, $j, $b)
    {
        $zz = 1;
        $z  = $zz;
        $k  = $i;

        while ($k <= $j) {
            $zz *= $q * $k / ($k - $b);
            $z  += $zz;
            $k  += 2;
        }

        return $z;
    }

    /**
     * Returns the Percentage Points (probability) for the Student t-distribution
     * where a numeric value (t) is a calculated value of t for which the Percentage
     * Points are to be computed.
     *
     * @param float   $t    Is the numeric value at which to evaluate the distribution.
     * @param integer $n    Is an integer indicating the number of degrees of freedom.
     * @param integer $tail Specifies the number of distribution tails to return.
     *                      If tail = 1, TDIST returns the one-tailed distribution.
     *                      If tail = 2, TDIST returns the two-tailed distribution.
     *
     * @return float the Percentage Points (probability) for the Student t-distribution
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function tDist ($t, $n, $tail=1)
    {
        $pj2 = pi() / 2;

        $t = abs($t);

        $rt = $t / sqrt($n);
        $fk = atan($rt);

        if ($n == 1) {
            $result = 1 - $fk / $pj2;
        } else {
            $ek = sin($fk);
            $dk = cos($fk);

            if (($n % 2) == 1) {
                $result = 1 - ($fk + $ek * $dk * $this->_zip($dk * $dk, 2, $n-3, -1)) / $pj2;
            } else {
                $result = 1 - $ek * $this->_zip($dk * $dk, 1, $n-3, -1);
            }
        }

        return $result / $tail;
    }

    /**
     * Returns the F probability distribution. You can use this function to determine
     * whether two data sets have different degrees of diversity.
     *
     * @param float   $f   Is the value at which to evaluate the function.
     * @param integer $df1 Is the numerator degrees of freedom.
     * @param integer $df2 Is the denominator degrees of freedom.
     *
     * @return float the F probability distribution
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function fDist ($f, $df1, $df2)
    {
        $pj2 = pi() / 2;

        $x = $df2 / ($df1 * $f + $df2);

        if (($df1 % 2) == 0) {
            return $this->_zip(1 - $x, $df2, $df1 + $df2 - 4, $df2 - 2) * pow($x, $df2 / 2);
        }

        if (($df2 % 2) == 0) {
            return 1 - $this->_zip($x, $df1, $df1 + $df2 - 4, $df1 - 2) * pow(1 - $x, $df1 / 2);
        }

        $tan = atan(sqrt($df1 * $f / $df2));
        $a   = $tan / $pj2;
        $sat = sin($tan);
        $cot = cos($tan);

        if ($df2 > 1) {
            $a = $a + $sat * $cot * $this->_zip($cot * $cot, 2, $df2 - 3, -1) / $pj2;
        }

        if ($df1 == 1) {
            return 1 - $a;
        }

        $c = 4 * $this->_zip($sat * $sat, $df2 + 1, $df1 + $df2 - 4, $df2 - 2) * $sat * pow($cot, $df2) / pi();

        if ($df2 == 1) {
            return 1 - $a + $c / 2;
        }

        $k = 2;

        while ($k <= ($df2 - 1) / 2) {
            $c *= $k / ($k - 0.5);
            $k++;
        }

        return 1 - $a + $c;
    }

    /**
     * Return the probability of normal z value
     * Adapted from a polynomial approximation in:
     *     Ibbetson D, Algorithm 209
     *     Collected Algorithms of the CACM 1963 p. 616
     *
     * @param float $z Is the value at which you want to evaluate the probability.
     *
     * @return float the probability of normal z value
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function normCDF ($z)
    {
        $max = 6;

        if ($z == 0) {
            $x = 0;
        } else {
            $y = abs($z) / 2;
            if ($y >= ($max / 2)) {
                $x = 1;
            } elseif ($y < 1) {
                $w = $y * $y;
                $x = ((((((((0.000124818987 * $w
                          - 0.001075204047) * $w + 0.005198775019) * $w
                          - 0.019198292004) * $w + 0.059054035642) * $w
                          - 0.151968751364) * $w + 0.319152932694) * $w
                          - 0.531923007300) * $w + 0.797884560593) * $y * 2;
            } else {
                $y -= 2;
                $x = (((((((((((((-0.000045255659 * $y
                                + 0.000152529290) * $y - 0.000019538132) * $y
                                - 0.000676904986) * $y + 0.001390604284) * $y
                                - 0.000794620820) * $y - 0.002034254874) * $y
                                + 0.006549791214) * $y - 0.010557625006) * $y
                                + 0.011630447319) * $y - 0.009279453341) * $y
                                + 0.005353579108) * $y - 0.002141268741) * $y
                                + 0.000535310849) * $y + 0.999936657524;
            }
        }

        if ($z > 0) {
            $result = ($x + 1) / 2;
        } else {
            $result = (1 - $x) / 2;
        }

        return $result;
    }

    /**
     * Returns the one-tailed probability of the chi-squared distribution. The chi-squared
     * distribution is associated with a chi-squared test. Use the chi-squared test to
     * compare observed and expected values.
     * Adapted from:
     *     Hill, I. D. and Pike, M. C.  Algorithm 299
     *     Collected Algorithms for the CACM 1967 p. 243
     * Updated for rounding errors based on remark in
     *     ACM TOMS June 1985, page 185
     *
     * @param float   $x  Is the value at which you want to evaluate the distribution.
     * @param integer $df Is the number of degrees of freedom.
     *
     * @return float the probability of the chi-squared distribution
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function chiDist ($x, $df)
    {
        define(LOG_SQRT_PI, log(sqrt(pi())));
        define(I_SQRT_PI, 1 / sqrt(pi()));

        if ($x <= 0 || $df < 1) {
            $result = 1;
        }

        $a = $x / 2;

        if ($df % 2 == 0) {
            $even = true;
        } else {
            $even = false;
        }

        if ($df > 1) {
            $y = exp(-1 * $a);
        }

        if ($even) {
            $s = $y;
        } else {
            $s = 2 * $this->normCDF(-1 * sqrt($x));
        }

        if ($df > 2) {
            $x = ($df - 1) / 2;

            if ($even) {
                $z = 1;
            } else {
                $z = 0.5;
            }

            if ($a > 20) {
                if ($even) {
                    $e = 0;
                } else {
                    $e = LOG_SQRT_PI;
                }

                $c = log($a);

                while ($z <= $x) {
                    $e += log($z);
                    $s += exp($c * $z - $a - $e);
                    $z += 1;
                }

                $result = $s;
            } else {
                if ($even) {
                    $e = 1;
                } else {
                    $e = I_SQRT_PI / sqrt($a);
                }

                $c = 0;

                while ($z <= $x) {
                    $e *= ($a / $z);
                    $c += $e;
                    $z += 1;
                }

                $result = $c * $y + $s;
            }
        } else {
            $result = $s;
        }

        return $result;
    }

    /**
     * Performs chi-squared contingency table tests and goodness-of-fit tests.
     *
     * Example:
     * <code>
     * $table['Automatic'] = array('4 Cylinders' => 3, '6 Cylinders' => 4, '8 Cylinders' => 12);
     * $table['Manual']    = array('4 Cylinders' => 8, '6 Cylinders' => 3, '8 Cylinders' => 2);
     *
     * $results = $stats->chiTest($table);
     * </code>
     *
     * @param array $table Specifies the two-way, n x m table containing the counts
     *
     * @return array [chi] => the value the chi-squared test statistic
     *               [df] => the degrees of freedom of the approximate chi-squared distribution of the test statistic.
     *               [expected] => the expected counts under the null hypothesis.
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function chiTest ($table)
    {
        $total = 0;
        $chi   = 0;

        foreach ($table as $category => $row) {
            foreach ($row as $sample => $cell) {
                if (isset($rows[$category])) {
                    $rows[$category] += $cell;
                } else {
                    $rows[$category] = $cell;
                }

                if (isset($cols[$sample])) {
                    $cols[$sample] += $cell;
                } else {
                    $cols[$sample] = $cell;
                }

                $total += $cell;
            }
        }

        $r  = count($rows);
        $c  = count($cols);
        $df = ($r - 1) * ($c - 1);

        $expected = array();

        foreach ($table as $category => $row) {
            foreach ($row as $sample => $cell) {
                // fo frequency of the observed value
                // fe frequency of the expected value
                $fo  = $cell;
                $fe  = ($rows[$category] * $cols[$sample]) / $total;
                $chi += pow($fo - $fe, 2) / $fe;

                $expected[$category][$sample] = $fe;
            }
        }

        return array('chi'=>$chi, 'df'=>$df, 'expected'=>$expected);
    }

    /**
     * Returns the inverse of the standard normal cumulative distribution. 
     * The distribution has a mean of zero and a standard deviation of one.
     * This is an implementation of the algorithm published at:
     * http://home.online.no/~pjacklam/notes/invnorm/
     *
     * @param float $p Is a probability corresponding to the normal distribution.
     *
     * @return float Inverse of the standard normal cumulative distribution, with a probability of $p
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    function inverseNormCDF($p) {
        /* coefficients for the rational approximants for the normal probit: */
        $a1	= -3.969683028665376e+01;
        $a2	=  2.209460984245205e+02;
        $a3	= -2.759285104469687e+02;
        $a4	=  1.383577518672690e+02;
        $a5	= -3.066479806614716e+01;
        $a6	=  2.506628277459239e+00;
        $b1	= -5.447609879822406e+01;
        $b2	=  1.615858368580409e+02;
        $b3	= -1.556989798598866e+02;
        $b4	=  6.680131188771972e+01;
        $b5	= -1.328068155288572e+01;
        $c1	= -7.784894002430293e-03;
        $c2	= -3.223964580411365e-01;
        $c3	= -2.400758277161838e+00;
        $c4	= -2.549732539343734e+00;
        $c5	=  4.374664141464968e+00;
        $c6	=  2.938163982698783e+00;
        $d1	=  7.784695709041462e-03;
        $d2	=  3.224671290700398e-01;
        $d3	=  2.445134137142996e+00;
        $d4	=  3.754408661907416e+00;

        $p_low  = 0.02425;
        $p_high	= 1.0 - $p_low;

        if(0 < $p && $p < $p_low) {
            /* rational approximation for the lower region */
            $q = sqrt(-2 * log($p));
            $x = ((((($c1 * $q + $c2) * $q + $c3) * $q + $c4) * $q + $c5) * $q + $c6) / (((($d1 * $q + $d2) * $q + $d3) * $q + $d4) * $q + 1);
        } elseif ($p_low <= $p && $p <= $p_high) {
            /* rational approximation for the central region */
            $q = $p - 0.5;
            $r = $q * $q;
            $x = ((((($a1 * $r + $a2) * $r + $a3) * $r + $a4) * $r + $a5) * $r + $a6) * $q / ((((($b1 * $r + $b2) * $r + $b3) * $r + $b4) * $r + $b5) * $r + 1);
        } else {
            /* rational approximation for the upper region */
            $q = sqrt(-2 * log(1 - $p));
            $x = -((((($c1 * $q + $c2) * $q + $c3) * $q + $c4) * $q + $c5) * $q + $c6) / (((($d1 * $q + $d2) * $q + $d3) * $q + $d4) * $q + 1);
        }

        /*
        The relative error of the approximation has
        absolute value less than 1.15 × 10-9.  One iteration of
        Halley's rational method (third order) gives full machine precision.

        if(0 < $p && $p < 1) {
            $e = 0.5 * erfc(-$x / sqrt(2)) - $p;
            $u = $e * sqrt(2 * pi()) * exp($x * $x / 2);
            $x = $x - $u / (1 + $x * $u / 2);
        }
        */

        return $x;
    }

    /**
     * Returns the t-value of the Student's t-distribution as a function of 
     * the probability and the degrees of freedom. This is an implementation 
     * of the algorithm in:
     * G. W. Hill. "Algorithm 396: Student's t-Quantiles." Communications
     * of the ACM 13(10):619--620.  ACM Press, October, 1970.
     *
     * @param float   $p Is the probability associated with the two-tailed Student's t-distribution.
     * @param integer $n Is the number of degrees of freedom with which to characterize the distribution.
     *
     * @return float t-value of the Student's t-distribution for the terms above (i.e. $p and $n).
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    function inverseTCDF($p, $n) {
        if($n == 1) {
            $p     *= M_PI_2;
            $result = cos($p) / sin($p);
        } else {
            $a = 1 / ($n - 0.5);
            $b = 48 / ($a * $a);
            $c = ((20700 * $a / $b - 98) * $a - 16) * $a + 96.36;
            $d = ((94.5 / ($b + $c) - 3) / $b + 1) * sqrt($a * M_PI_2) * $n;
            $y = pow(2 * $d * $p, 2 / $n);
            
            if ($y > (0.05 + $a)) {
                /* asymptotic inverse expansion about the normal */
                $x = $this->inverseNormCDF($p * 0.5);
                $y = $x * $x;

                if ($n < 5) {
                    $c += 0.3 * ($n - 4.5) * ($x + 0.6);
                }

                $c  = (((0.05 * $d * $x - 5) * $x - 7) * $x - 2) * $x + $b + $c;
                $y  = (((((0.4 * $y + 6.3) * $y + 36) * $y + 94.5) / $c - $y - 3) / $b + 1) * $x;
                $y *= $a * $y;

                if ($y > 0.002) {
                    $y = exp($y) - 1;
                } else {
                    $y += 0.5 * $y * $y;
                }
            } else {
                $y = ((1 / ((($n + 6) / ($n * $y) - 0.089 * $d - 0.822) * ($n + 2) * 3) + 0.5 / ($n + 4)) * $y - 1) * ($n + 1) / ($n + 2) + 1 / $y;
            }
            
            $result = sqrt($n * $y);
        }

        return $result;
    }
    
    /**
     * Returns the Shannon or Simpson index value
     * http://en.wikipedia.org/wiki/Diversity_index
     *
     * @param array  $abundances Associated array where keys refer to the categories and values refer to the observation counts in each category.
     * @param string $index      Which index you would like to calculate ["shannon"|"simpson"] (default is "shannon")
     * @param float  $base       Base of the logarithmic transformation used in the Shannon index (default is M_E)
     *
     * @return float The index (either Shannon or Simpson as selected in the $index parameter)
     * @author Khaled Al-Sham'aa <[email protected]>
     */
    public function diversity ($abundances, $index='shannon', $base=M_E)
    {
        $index = strtolower($index);
        $N = array_sum($abundances);

        // Calculate the proportion of characters belonging to each category
        foreach ($abundances as $key=>$value) {
            $P["$key"] = $value / $N;
        }

        $result = 0;

        if ($index == 'shannon') {
            foreach ($P as $key=>$value) {
                $result += $value * log($value, $base);
            }
            $result = -1 * $result;
        } elseif ($index == 'simpson') {
            if ($N < 20) {
                foreach ($abundances as $key=>$value) {
                    $result += $value * ($value - 1);
                }
                $result = $result / ($N * ($N - 1));
            } else {
                foreach ($P as $key=>$value) {
                    $result += $value * $value;
                }
            }
        }

        return $result;
    }
 }
diff --git a/kashi_anova.php b/kashi_anova.php
 <?php
 require_once('kashi.php');

 class KashiANOVA {
    protected $dbh;
    protected $preparedStmt;
    protected $kashi;
    public $persistent = false;

    public function __construct($dbname='test', $dbuser='root', $dbpass='', $dbhost='localhost') {
        $this->kashi = new Kashi();
        
        // We save data in SQL to have a scalable data-analysis solution (ENGINE=MEMORY ?)
        $this->dbh = new PDO("mysql:host=$dbhost;dbname=$dbname", $dbuser, $dbpass);
        
        $tableA = 'CREATE TABLE IF NOT EXISTS kashi_factors (id INT NOT NULL, kashi_factor VARCHAR(50) NOT NULL, kashi_level VARCHAR(50) NOT NULL, PRIMARY KEY (id, kashi_factor))';
        $tableB = 'CREATE TABLE IF NOT EXISTS kashi_variates (id INT NOT NULL, kashi_trait VARCHAR(50) NOT NULL, kashi_value DECIMAL(9,3) NOT NULL, PRIMARY KEY (id, kashi_trait))';
        
        $this->dbh->exec($tableA);
        $this->dbh->exec($tableB);
        
        $this->preparedStmt['insertFactor']  = $this->dbh->prepare("INSERT INTO kashi_factors VALUES (:myRecord, :myName, :myValue)");
        $this->preparedStmt['insertVariate'] = $this->dbh->prepare("INSERT INTO kashi_variates VALUES (:myRecord, :myName, :myValue)");
        $this->preparedStmt['queryFactor']   = $this->dbh->prepare("SELECT id FROM kashi_factors WHERE kashi_factor=:myFactor AND kashi_level=:myLevel");
        $this->preparedStmt['ssBetween']     = $this->dbh->prepare("SELECT kashi_level, AVG(kashi_value) AS kashi_mean, SUM(kashi_value) AS kashi_total, SUM(kashi_value*kashi_value) AS kashi_ss, COUNT(kashi_value) AS kashi_count FROM kashi_factors INNER JOIN kashi_variates USING(id) WHERE kashi_factor=:myTreat AND kashi_trait=:myTrait GROUP BY kashi_level");
    }
 	
 	public function __destruct() {
        $tableA = 'DROP TABLE kashi_factors';
        $tableB = 'DROP TABLE kashi_variates';
        
        if (!$this->persistent) {
            $this->dbh->exec($tableA);
            $this->dbh->exec($tableB);
        }

        $this->dbh = null;
 	}
    
    public function loadArray($data) {
        $this->dbh->exec('TRUNCATE TABLE kashi_factors');
        $this->dbh->exec('TRUNCATE TABLE kashi_variates');

        foreach ($data as $name=>$values) {
            if (substr($name, -1) == '!') {
                $name  = substr($name, 0, -1);
                $table = 'kashi_factors';
                $stmt  = 'insertFactor';
            } else {
                $table = 'kashi_variates';
                $stmt  = 'insertVariate';
            }
            
            $this->preparedStmt[$stmt]->bindParam(':myRecord', $record, PDO::PARAM_INT);
            $this->preparedStmt[$stmt]->bindParam(':myValue', $value, PDO::PARAM_STR);
            $this->preparedStmt[$stmt]->bindParam(':myName', $name, PDO::PARAM_STR);
            
            $record = 1;
            foreach ($values as $value) {
                $this->preparedStmt[$stmt]->execute();    
                $record++;
            }
        }
    }

    protected function readString($str) {
        $data   = array();
        $str    = str_replace("\r", '', $str);
        $lines  = explode("\n", $str);
        $header = explode("\t", $lines[0]);
        $maxCol = count($header);
        $maxRow = count($lines);
        
        for ($i=1; $i<$maxRow; $i++) {
            if (trim($lines[$i]) == '') continue;
            $fields = explode("\t", $lines[$i]);
            for ($j=0; $j<$maxCol; $j++) {
                $data[$header[$j]][] = $fields[$j];
            }            
        }
        
        return $data;
    }
    
    public function loadString($str) {
        $this->loadArray($this->readString($str));
    }
    
    protected function query($factors, $trait, $return='summary') {
        $this->preparedStmt['queryFactor']->bindParam(':myFactor', $factor, PDO::PARAM_STR);
        $this->preparedStmt['queryFactor']->bindParam(':myLevel', $level, PDO::PARAM_STR);
        
        foreach ($factors as $factor=>$level) {
            $this->preparedStmt['queryFactor']->execute();           
            $result = $this->preparedStmt['queryFactor']->fetchAll();            
            $keys   = array();

            foreach ($result as $row) {
                $keys[] = $row['id'];
            }

            if (!isset($activeKeys)) {
                $activeKeys = $keys;
            }
            
            $activeKeys = array_intersect($activeKeys, $keys);
        }

        $data   = array();
        $idList = implode(",", $activeKeys);
        
        if ($return == 'values') {
            $sql    = "SELECT id, kashi_value FROM kashi_variates WHERE kashi_trait='$trait' AND id in ($idList)";
                   
            foreach ($this->dbh->query($sql) as $row) {
                $data[$row['id']] = $row['kashi_value'];
            }
        } else if ($return == 'summary') {
            $sql    = "SELECT SUM(kashi_value) AS a, AVG(kashi_value) AS b, COUNT(kashi_value) AS c FROM kashi_variates WHERE kashi_trait='$trait' AND id in ($idList)";
            $result = $this->dbh->query($sql)->fetch(PDO::FETCH_ASSOC);
            
            $data['sum']   = $result['a'];
            $data['avg']   = $result['b'];
            $data['count'] = $result['c'];
        }

        return $data;
    }
    
    public function anova ($treat, $trait) {
        $sql     = "SELECT COUNT(DISTINCT kashi_level) AS levels FROM kashi_factors WHERE kashi_factor='$treat'";
        $result  = $this->dbh->query($sql)->fetch(PDO::FETCH_ASSOC);
        $r       = $result['levels'];
        $treatDF = $r - 1;
        
        $sql     = "SELECT COUNT(DISTINCT id) AS levels FROM kashi_variates WHERE kashi_trait='$trait'";
        $result  = $this->dbh->query($sql)->fetch(PDO::FETCH_ASSOC);
        $t       = $result['levels'];
        $totalDF = $t - 1;
        
        $errorDF = $totalDF - $treatDF;
        
        $sql        = "SELECT SUM(kashi_value*kashi_value) AS ss, SUM(kashi_value) AS grandTotal, COUNT(kashi_value) AS observations FROM kashi_variates WHERE kashi_trait='$trait'";
        $result     = $this->dbh->query($sql)->fetch(PDO::FETCH_ASSOC);
 		$ss         = $result['ss'];
        $grandTotal = $result['grandTotal'];
        $t          = $result['observations'];
        
        $C          = pow($grandTotal, 2) / $t;
        $SSTot      = $ss - $C;
        $grandMean  = $grandTotal / $t;

        $sql        = "SELECT DISTINCT kashi_level FROM kashi_factors WHERE kashi_factor='$treat'";
        $levels     = array();
        $means      = array();
        $reps       = array();
        $variation  = 0;
        
        foreach ($this->dbh->query($sql) as $row) {
            $levels[] = $row['kashi_level'];
        }
        
        foreach ($levels as $level) {
            $result     = $this->query(array($treat=>$level), $trait, 'summary');
            $variation += pow($result['sum'], 2) / $result['count'];
            
            $means[$level] = $result['avg'];
            $reps[$level]  = $result['count'];
        }
        
        ksort($means);
        ksort($reps);
        
        $SST = $variation - $C;
        $SSE = $SSTot - $SST;
        $MST = $SST / $treatDF;
        $MSE = $SSE / $errorDF;
        $VRT = $MST / $MSE;
        $F   = $this->kashi->fDist($VRT, $treatDF, $errorDF);
        
        if (max($reps) == min($reps)) {
            $SE  = sqrt($MSE / max($reps));
            $SED = sqrt(2) * $SE;
            // qt(0.975, Residual df) 
            $LSD = $this->kashi->inverseTCDF(0.05, $errorDF) * $SED;
        } else {
            $SE  = array();
            $SED = array();
            $LSD = array();

            $SE['min']  = sqrt($MSE / min($reps));
            $SED['min'] = sqrt(2) * $SE['min'];
            $LSD['min'] = $this->kashi->inverseTCDF(0.05, $errorDF) * $SED['min'];

            $SE['max']  = sqrt($MSE / max($reps));
            $SED['max'] = sqrt(2) * $SE['max'];
            $LSD['max'] = $this->kashi->inverseTCDF(0.05, $errorDF) * $SED['max'];
        }
        
 		$CV  = 100 * sqrt($MSE) / $grandMean;
        
 		$anova['TDF']   = $treatDF;
 		$anova['EDF']   = $errorDF;
        $anova['TotDF'] = $totalDF;
 		$anova['SST']   = $SST;
 		$anova['SSE']   = $SSE;
 		$anova['SSTot'] = $SSTot;
 		$anova['MST']   = $MST;
 		$anova['MSE']   = $MSE;
 		$anova['VRT']   = $VRT;
        $anova['F']     = $F;
 		$anova['Mean']  = $grandMean;
 		$anova['Means'] = $means;       
        $anova['Reps']  = $reps;
 		$anova['SE']    = $SE;
 		$anova['SED']   = $SED;
 		$anova['LSD']   = $LSD;
 		$anova['CV']    = $CV;
        
        return $anova;
    }
 }
	model mpg cyl! disp hp drat wt qsec vs am! gear! carb!
	Mazda RX4 21 6 160 110 3.9 2.62 16.46 0 1 4 4
	Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4
	Datsun 710 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
	Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
	Hornet Sportabout 18.7 8 360 175 3.15 3.44 17.02 0 0 3 2
	Valiant 18.1 6 225 105 2.76 3.46 20.22 1 0 3 1
	Duster 360 14.3 8 360 245 3.21 3.57 15.84 0 0 3 4
	Merc 240D 24.4 4 146.7 62 3.69 3.19 20 1 0 4 2
	Merc 230 22.8 4 140.8 95 3.92 3.15 22.9 1 0 4 2
	Merc 280 19.2 6 167.6 123 3.92 3.44 18.3 1 0 4 4
	Merc 280C 17.8 6 167.6 123 3.92 3.44 18.9 1 0 4 4
	Merc 450SE 16.4 8 275.8 180 3.07 4.07 17.4 0 0 3 3
	Merc 450SL 17.3 8 275.8 180 3.07 3.73 17.6 0 0 3 3
	Merc 450SLC 15.2 8 275.8 180 3.07 3.78 18 0 0 3 3
	Cadillac Fleetwood 10.4 8 472 205 2.93 5.25 17.98 0 0 3 4
	Lincoln Continental 10.4 8 460 215 3 5.424 17.82 0 0 3 4
	Chrysler Imperial 14.7 8 440 230 3.23 5.345 17.42 0 0 3 4
	Fiat 128 32.4 4 78.7 66 4.08 2.2 19.47 1 1 4 1
	Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
	Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.9 1 1 4 1
	Toyota Corona 21.5 4 120.1 97 3.7 2.465 20.01 1 0 3 1
	Dodge Challenger 15.5 8 318 150 2.76 3.52 16.87 0 0 3 2
	AMC Javelin 15.2 8 304 150 3.15 3.435 17.3 0 0 3 2
	Camaro Z28 13.3 8 350 245 3.73 3.84 15.41 0 0 3 4
	Pontiac Firebird 19.2 8 400 175 3.08 3.845 17.05 0 0 3 2
	Fiat X1-9 27.3 4 79 66 4.08 1.935 18.9 1 1 4 1
	Porsche 914-2 26 4 120.3 91 4.43 2.14 16.7 0 1 5 2
	Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2
	Ford Pantera L 15.8 8 351 264 4.22 3.17 14.5 0 1 5 4
	Ferrari Dino 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6
	Maserati Bora 15 8 301 335 3.54 3.57 14.6 0 1 5 8
	Volvo 142E 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
	model mpg cyl disp hp drat wt qsec vs am gear carb
	Mazda RX4 21 6 160 110 3.9 2.62 16.46 0 1 4 4
	Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4
	Datsun 710 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
	Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
	Hornet Sportabout 18.7 8 360 175 3.15 3.44 17.02 0 0 3 2
	Valiant 18.1 6 225 105 2.76 3.46 20.22 1 0 3 1
	Duster 360 14.3 8 360 245 3.21 3.57 15.84 0 0 3 4
	Merc 240D 24.4 4 146.7 62 3.69 3.19 20 1 0 4 2
	Merc 230 22.8 4 140.8 95 3.92 3.15 22.9 1 0 4 2
	Merc 280 19.2 6 167.6 123 3.92 3.44 18.3 1 0 4 4
	Merc 280C 17.8 6 167.6 123 3.92 3.44 18.9 1 0 4 4
	Merc 450SE 16.4 8 275.8 180 3.07 4.07 17.4 0 0 3 3
	Merc 450SL 17.3 8 275.8 180 3.07 3.73 17.6 0 0 3 3
	Merc 450SLC 15.2 8 275.8 180 3.07 3.78 18 0 0 3 3
	Cadillac Fleetwood 10.4 8 472 205 2.93 5.25 17.98 0 0 3 4
	Lincoln Continental 10.4 8 460 215 3 5.424 17.82 0 0 3 4
	Chrysler Imperial 14.7 8 440 230 3.23 5.345 17.42 0 0 3 4
	Fiat 128 32.4 4 78.7 66 4.08 2.2 19.47 1 1 4 1
	Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
	Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.9 1 1 4 1
	Toyota Corona 21.5 4 120.1 97 3.7 2.465 20.01 1 0 3 1
	Dodge Challenger 15.5 8 318 150 2.76 3.52 16.87 0 0 3 2
	AMC Javelin 15.2 8 304 150 3.15 3.435 17.3 0 0 3 2
	Camaro Z28 13.3 8 350 245 3.73 3.84 15.41 0 0 3 4
	Pontiac Firebird 19.2 8 400 175 3.08 3.845 17.05 0 0 3 2
	Fiat X1-9 27.3 4 79 66 4.08 1.935 18.9 1 1 4 1
	Porsche 914-2 26 4 120.3 91 4.43 2.14 16.7 0 1 5 2
	Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2
	Ford Pantera L 15.8 8 351 264 4.22 3.17 14.5 0 1 5 4
	Ferrari Dino 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6
	Maserati Bora 15 8 301 335 3.54 3.57 14.6 0 1 5 8
	Volvo 142E 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
	<?php

	$sep = "\t"; $nl = "\n";

	$content = file_get_contents('data.txt');

	$records = explode($nl, $content);
	$header = explode($sep, trim(array_shift($records)));
	$data = array_fill_keys($header, array());

	foreach ($records as $id=>$record) {
	$record = trim($record);
	$fields = explode($sep, $record);
	$titles = $header;

	foreach ($fields as $field) {
	$title = array_shift($titles);
	$data[$title][] = $field;
	}
	}

	$x = $data['wt'];
	$y = $data['mpg'];

	require('kashi.php');

	// $kashi = new Kashi($dbname, $dbuser, $dbpass, $dbhost);
	$kashi = new Kashi('test', 'root', '', 'localhost');

	/**
	* Summary Statistics:
	*/

	// $x is an array of values
	echo 'Mean: ' . $kashi->mean($x) . '<br />';

	// It will use previous data set if you don't provide one as an argument to the method
	echo 'Mode: ' . print_r($kashi->mode()) . '<br />';
	echo 'Median: ' . $kashi->median() . '<br />';
	echo 'Variance: ' . $kashi->variance() . '<br />';
	echo 'SD: ' . $kashi->sd() . '<br />';
	echo '%CV: ' . $kashi->cv() . '<br />';
	echo 'Skewness: ' . $kashi->skew() . '<br />';
	echo 'Kurtosis: ' . $kashi->kurt() . '<br />';

	/**
	* Correlation, Regression, and t-Test:
	*/
	echo 'Covariance: ' . $kashi->cov($x, $y) . '<br />';
	echo 'Correlation: ' . $kashi->cor($x, $y) . '<br />';

	$r = $kashi->cor($x, $y);
	$n = count($x);
	echo 'Significant of Correlation: ' . $kashi->corTest($r, $n) . '<br />';

	echo 'Regression: ' . print_r($kashi->lm($x, $y), true) . '<br />';

	echo 't-Test unpaired: ' . $kashi->tTest($x, $y, false) . '<br />';
	echo 'Test: ' . $kashi->tDist($kashi->tTest($x, $y, false), (count($x)-1)*(count($y)-1)) . '<br />';

	echo 't-Test paired: ' . $kashi->tTest($x, $y, true) . '<br />';
	echo 'Test: ' . $kashi->tDist($kashi->tTest($x, $y, true), count($x)-1) . '<br />';

	/**
	* Distributions:
	*/
	echo 'Normal distribution (x=0.5, mean=0, sd=1): ' . $kashi->norm(0.5, 0, 1) . '<br />';

	echo 'Probability for the Student t-distribution (t=3, n=10) one-tailed: ';
	echo $kashi->tDist(3, 10, 1) . '<br />';

	echo 'Probability for the Student t-distribution (t=3, n=10) two-tailed: ';
	echo $kashi->tDist(3, 10, 2) . '<br />';

	echo 'F probability distribution (f=2, df1=12, df2=15): ' . $kashi->fDist(2, 12, 15) . '<br />';

	echo 'Inverse of the standard normal cumulative distribution (p=0.95): ';
	echo $kashi->inverseNormCDF(0.95) . '<br />';

	echo 't-value of the Student\'s t-distribution (p=0.05, n=29): ';
	echo $kashi->inverseTCDF(0.05, 29) . '<br />';

	/**
	* Chi-square test or Contingency tables (A/B testing):
	*/
	$table['Automatic'] = array('4 Cylinders' => 3, '6 Cylinders' => 4, '8 Cylinders' => 12);
	$table['Manual'] = array('4 Cylinders' => 8, '6 Cylinders' => 3, '8 Cylinders' => 2);

	$result = $kashi->chiTest($table);

	$probability = $kashi->chiDist($result['chi'], $result['df']);
	echo 'Chi-square test probability: ' . $probability . '<br />';

	/**
	* Diversity index:
	*/
	$gear = array('3' => 15, '4' => 12, '5' => 5);
	$cyl = array('4' => 11, '6' => 7, '8' => 14);

	echo 'Shannon index for gear: ' . $kashi->diversity($gear) . '<br />';
	echo 'Shannon index for cyl: ' . $kashi->diversity($cyl) . '<br />';

	/**
	* ANOVA:
	*/
	require('kashi_anova.php');

	// $obj = new KashiANOVA($dbname, $dbuser, $dbpass, $dbhost);
	$obj = new KashiANOVA('test', 'root', '', 'localhost');

	$str = file_get_contents('anova_data.txt');
	$obj->loadString($str);

	// mpg ~ cyl
	$result = $obj->anova('cyl', 'mpg');
	echo '<hr />Analysis of variance (ANOVA): mpg ~ cyl<pre>';
	print_r($result);
	echo '</pre>';
	<?php
	require_once('kashi.php');

	class KashiANOVA {
	protected $dbh;
	protected $preparedStmt;
	protected $kashi;
	public $persistent = false;

	public function __construct($dbname='test', $dbuser='root', $dbpass='', $dbhost='localhost') {
	$this->kashi = new Kashi();

	// We save data in SQL to have a scalable data-analysis solution (ENGINE=MEMORY ?)
	$this->dbh = new PDO("mysql:host=$dbhost;dbname=$dbname", $dbuser, $dbpass);

	$tableA = 'CREATE TABLE IF NOT EXISTS kashi_factors (id INT NOT NULL, kashi_factor VARCHAR(50) NOT NULL, kashi_level VARCHAR(50) NOT NULL, PRIMARY KEY (id, kashi_factor))';
	$tableB = 'CREATE TABLE IF NOT EXISTS kashi_variates (id INT NOT NULL, kashi_trait VARCHAR(50) NOT NULL, kashi_value DECIMAL(9,3) NOT NULL, PRIMARY KEY (id, kashi_trait))';

	$this->dbh->exec($tableA);
	$this->dbh->exec($tableB);

	$this->preparedStmt['insertFactor'] = $this->dbh->prepare("INSERT INTO kashi_factors VALUES (:myRecord, :myName, :myValue)");
	$this->preparedStmt['insertVariate'] = $this->dbh->prepare("INSERT INTO kashi_variates VALUES (:myRecord, :myName, :myValue)");
	$this->preparedStmt['queryFactor'] = $this->dbh->prepare("SELECT id FROM kashi_factors WHERE kashi_factor=:myFactor AND kashi_level=:myLevel");
	$this->preparedStmt['ssBetween'] = $this->dbh->prepare("SELECT kashi_level, AVG(kashi_value) AS kashi_mean, SUM(kashi_value) AS kashi_total, SUM(kashi_value*kashi_value) AS kashi_ss, COUNT(kashi_value) AS kashi_count FROM kashi_factors INNER JOIN kashi_variates USING(id) WHERE kashi_factor=:myTreat AND kashi_trait=:myTrait GROUP BY kashi_level");
	}

	public function __destruct() {
	$tableA = 'DROP TABLE kashi_factors';
	$tableB = 'DROP TABLE kashi_variates';

	if (!$this->persistent) {
	$this->dbh->exec($tableA);
	$this->dbh->exec($tableB);
	}

	$this->dbh = null;
	}

	public function loadArray($data) {
	$this->dbh->exec('TRUNCATE TABLE kashi_factors');
	$this->dbh->exec('TRUNCATE TABLE kashi_variates');

	foreach ($data as $name=>$values) {
	if (substr($name, -1) == '!') {
	$name = substr($name, 0, -1);
	$table = 'kashi_factors';
	$stmt = 'insertFactor';
	} else {
	$table = 'kashi_variates';
	$stmt = 'insertVariate';
	}

	$this->preparedStmt[$stmt]->bindParam(':myRecord', $record, PDO::PARAM_INT);
	$this->preparedStmt[$stmt]->bindParam(':myValue', $value, PDO::PARAM_STR);
	$this->preparedStmt[$stmt]->bindParam(':myName', $name, PDO::PARAM_STR);

	$record = 1;
	foreach ($values as $value) {
	$this->preparedStmt[$stmt]->execute();
	$record++;
	}
	}
	}

	protected function readString($str) {
	$data = array();
	$str = str_replace("\r", '', $str);
	$lines = explode("\n", $str);
	$header = explode("\t", $lines[0]);
	$maxCol = count($header);
	$maxRow = count($lines);

	for ($i=1; $i<$maxRow; $i++) {
	if (trim($lines[$i]) == '') continue;
	$fields = explode("\t", $lines[$i]);
	for ($j=0; $j<$maxCol; $j++) {
	$data[$header[$j]][] = $fields[$j];
	}
	}

	return $data;
	}

	public function loadString($str) {
	$this->loadArray($this->readString($str));
	}

	protected function query($factors, $trait, $return='summary') {
	$this->preparedStmt['queryFactor']->bindParam(':myFactor', $factor, PDO::PARAM_STR);
	$this->preparedStmt['queryFactor']->bindParam(':myLevel', $level, PDO::PARAM_STR);

	foreach ($factors as $factor=>$level) {
	$this->preparedStmt['queryFactor']->execute();
	$result = $this->preparedStmt['queryFactor']->fetchAll();
	$keys = array();

	foreach ($result as $row) {
	$keys[] = $row['id'];
	}

	if (!isset($activeKeys)) {
	$activeKeys = $keys;
	}

	$activeKeys = array_intersect($activeKeys, $keys);
	}

	$data = array();
	$idList = implode(",", $activeKeys);

	if ($return == 'values') {
	$sql = "SELECT id, kashi_value FROM kashi_variates WHERE kashi_trait='$trait' AND id in ($idList)";

	foreach ($this->dbh->query($sql) as $row) {
	$data[$row['id']] = $row['kashi_value'];
	}
	} else if ($return == 'summary') {
	$sql = "SELECT SUM(kashi_value) AS a, AVG(kashi_value) AS b, COUNT(kashi_value) AS c FROM kashi_variates WHERE kashi_trait='$trait' AND id in ($idList)";
	$result = $this->dbh->query($sql)->fetch(PDO::FETCH_ASSOC);

	$data['sum'] = $result['a'];
	$data['avg'] = $result['b'];
	$data['count'] = $result['c'];
	}

	return $data;
	}

	public function anova ($treat, $trait) {
	$sql = "SELECT COUNT(DISTINCT kashi_level) AS levels FROM kashi_factors WHERE kashi_factor='$treat'";
	$result = $this->dbh->query($sql)->fetch(PDO::FETCH_ASSOC);
	$r = $result['levels'];
	$treatDF = $r - 1;

	$sql = "SELECT COUNT(DISTINCT id) AS levels FROM kashi_variates WHERE kashi_trait='$trait'";
	$result = $this->dbh->query($sql)->fetch(PDO::FETCH_ASSOC);
	$t = $result['levels'];
	$totalDF = $t - 1;

	$errorDF = $totalDF - $treatDF;

	$sql = "SELECT SUM(kashi_value*kashi_value) AS ss, SUM(kashi_value) AS grandTotal, COUNT(kashi_value) AS observations FROM kashi_variates WHERE kashi_trait='$trait'";
	$result = $this->dbh->query($sql)->fetch(PDO::FETCH_ASSOC);
	$ss = $result['ss'];
	$grandTotal = $result['grandTotal'];
	$t = $result['observations'];

	$C = pow($grandTotal, 2) / $t;
	$SSTot = $ss - $C;
	$grandMean = $grandTotal / $t;

	$sql = "SELECT DISTINCT kashi_level FROM kashi_factors WHERE kashi_factor='$treat'";
	$levels = array();
	$means = array();
	$reps = array();
	$variation = 0;

	foreach ($this->dbh->query($sql) as $row) {
	$levels[] = $row['kashi_level'];
	}

	foreach ($levels as $level) {
	$result = $this->query(array($treat=>$level), $trait, 'summary');
	$variation += pow($result['sum'], 2) / $result['count'];

	$means[$level] = $result['avg'];
	$reps[$level] = $result['count'];
	}

	ksort($means);
	ksort($reps);

	$SST = $variation - $C;
	$SSE = $SSTot - $SST;
	$MST = $SST / $treatDF;
	$MSE = $SSE / $errorDF;
	$VRT = $MST / $MSE;
	$F = $this->kashi->fDist($VRT, $treatDF, $errorDF);

	if (max($reps) == min($reps)) {
	$SE = sqrt($MSE / max($reps));
	$SED = sqrt(2) * $SE;
	// qt(0.975, Residual df)
	$LSD = $this->kashi->inverseTCDF(0.05, $errorDF) * $SED;
	} else {
	$SE = array();
	$SED = array();
	$LSD = array();

	$SE['min'] = sqrt($MSE / min($reps));
	$SED['min'] = sqrt(2) * $SE['min'];
	$LSD['min'] = $this->kashi->inverseTCDF(0.05, $errorDF) * $SED['min'];

	$SE['max'] = sqrt($MSE / max($reps));
	$SED['max'] = sqrt(2) * $SE['max'];
	$LSD['max'] = $this->kashi->inverseTCDF(0.05, $errorDF) * $SED['max'];
	}

	$CV = 100 * sqrt($MSE) / $grandMean;

	$anova['TDF'] = $treatDF;
	$anova['EDF'] = $errorDF;
	$anova['TotDF'] = $totalDF;
	$anova['SST'] = $SST;
	$anova['SSE'] = $SSE;
	$anova['SSTot'] = $SSTot;
	$anova['MST'] = $MST;
	$anova['MSE'] = $MSE;
	$anova['VRT'] = $VRT;
	$anova['F'] = $F;
	$anova['Mean'] = $grandMean;
	$anova['Means'] = $means;
	$anova['Reps'] = $reps;
	$anova['SE'] = $SE;
	$anova['SED'] = $SED;
	$anova['LSD'] = $LSD;
	$anova['CV'] = $CV;

	return $anova;
	}
	}