/**
 * Sum of squared deviations from the mean
 *
 * ∑⟮xᵢ - μ⟯²
 *
 * where μ is the mean of $numbers as returned by Average::mean.
 *
 * @param array $numbers values whose deviations from their own mean are squared and summed
 *
 * @return number|null the summed squared deviations; null when $numbers is empty
 */
public static function sumOfSquaresDeviations(array $numbers)
{
    if (empty($numbers)) {
        return null;
    }

    $mean  = Average::mean($numbers);
    $total = 0;

    // Accumulate (xᵢ - μ)² one element at a time, in input order
    foreach ($numbers as $number) {
        $total += ($number - $mean) ** 2;
    }

    return $total;
}
/**
 * Sample covariance
 * A measure of how much two random variables change together:
 * the sum of the products of their deviations from their respective
 * sample means, divided by n - 1.
 * https://en.wikipedia.org/wiki/Covariance
 *
 *                       ∑⟮xᵢ - x⟯⟮yᵢ - y⟯
 *  cov(X, Y) = Sxy = ---------------
 *                          n - 1
 *
 *  where x and y are the means of X and Y, and n is the number of elements in X.
 *
 * Observations are paired by position; array keys are ignored.
 *
 * NOTE: the divisor n - 1 is zero when X and Y each hold exactly one element,
 * so the final division is then a division by zero.
 *
 * @param array $X values for random variable X
 * @param array $Y values for random variable Y
 *
 * @return number
 *
 * @throws BadDataException if X and Y do not have the same number of elements
 */
public static function sampleCovariance(array $X, array $Y)
{
    $n = count($X);
    if ($n !== count($Y)) {
        throw new Exception\BadDataException('X and Y must have the same number of elements.');
    }

    $mean_x = Average::mean($X);
    $mean_y = Average::mean($Y);

    // Reindex so the i-th x is matched with the i-th y regardless of keys
    $xs = array_values($X);
    $ys = array_values($Y);

    $sum_of_products = 0;
    for ($i = 0; $i < $n; $i++) {
        $sum_of_products += ($xs[$i] - $mean_x) * ($ys[$i] - $mean_y);
    }

    return $sum_of_products / ($n - 1);
}
/**
 * Two-way ANOVA
 * Examines the influence of two different categorical independent variables on
 * one continuous dependent variable. The two-way ANOVA not only aims at assessing
 * the main effect of each independent variable but also if there is any interaction
 * between them (using the F distribution).
 * https://en.wikipedia.org/wiki/Two-way_analysis_of_variance
 *
 * Produces the following analysis of the data:
 *
 * ANOVA hypothesis test summary data
 *
 *              | SS | df | MS | F | P |
 * Factor A     |    |    |    |   |   |
 * Factor B     |    |    |    |   |   |
 * Interaction  |    |    |    |   |   |
 * Error        |    |    |    |
 * Total        |    |    |
 *
 *  where:
 *   Interaction = Factor A X Factor B working together
 *   Error is within groups
 *   SS = Sum of squares
 *   df = Degrees of freedom
 *   MS = Mean squares
 *   F  = F statistic
 *   P  = P value
 *
 * Data summary tables for:
 *   Factor A
 *   Factor B
 *   Factor AB (Interaction)
 *   Total
 *
 *       | N | Sum | Mean | SS | Variance | SD | SEM |
 * 0     |   |     |      |    |          |    |     |
 * 1     |   |     |      |    |          |    |     |
 * ...   |   |     |      |    |          |    |     |
 * Total |   |     |      |    |          |    |     |
 *
 *  where:
 *   Each row is the summary for a sample, numbered from 0 to m - 1
 *   m   = Number of samples
 *   N   = Sample size
 *   SS  = Sum of squares
 *   SD  = Standard deviation
 *   SEM = Standard error of the mean
 *
 * Calculations
 *
 * Sum of Squares
 *  SST (sum of squares total)
 *   ∑⟮xᵢ − μ⟯²
 *    where:
 *     xᵢ = each element of all samples
 *     μ  = mean total of all elements of all samples
 *
 *  SSA, SSB (sum of squares for each factor A and B)
 *   ∑n(x - μ)²
 *    where:
 *     n = sample size
 *     x = sample mean
 *     μ = mean total of all elements of all samples
 *
 *  SSW (sum of squares within - error)
 *   ∑∑⟮xᵢ − x⟯²  Sum over every AB cell of the squared deviations of its values
 *    where:
 *     xᵢ = each value of an AB cell
 *     x  = mean of that AB cell
 *
 *  SSAB (sum of squares AB - interaction)
 *   SSAB = SST - SSA - SSB - SSW;
 *
 * Degrees of Freedom
 *  dfT (degrees of freedom for the total)
 *   n - 1
 *
 *  dfA (degrees of freedom factor A)
 *   r - 1
 *
 *  dfB (degrees of freedom factor B)
 *   c - 1
 *
 *  dfAB (degrees of freedom factor AB - interaction)
 *   (r - 1)(c - 1)
 *
 *  dfW (degrees of freedom within - error)
 *   n - rc
 *
 *  where:
 *   n = total number of values across all AB cells
 *   r = number of rows (number of factor As)
 *   c = number of columns (number of factor Bs)
 *
 * Mean Squares
 *  MSA (Mean squares factor A)
 *   SSA / dfA
 *
 *  MSB (Mean squares factor B)
 *   SSB / dfB
 *
 *  MSAB (Mean squares factor AB - interaction)
 *   SSAB / dfAB
 *
 *  MSW (Mean squares within - error)
 *   SSW / dfW
 *
 * F Test Statistics
 *  FA  = MSA / MSW
 *  FB  = MSB / MSW
 *  FAB = MSAB / MSW
 *
 * P values
 *  PA  = F distribution CDF above FA with degrees of freedom dfA and dfW
 *  PB  = F distribution CDF above FB with degrees of freedom dfB and dfW
 *  PAB = F distribution CDF above FAB with degrees of freedom dfAB and dfW
 *
 * Example input data for ...$data parameter:
 *             | Factor B₁ | Factor B₂ | ⋯
 *   Factor A₁ |  4, 6, 8  |  6, 6, 9  | ⋯
 *   Factor A₂ |  4, 8, 9  | 7, 10, 13 | ⋯
 *      ⋮           ⋮           ⋮         ⋮
 *
 * @param array ...$data Samples to analyze [
 *     // Factor A₁
 *     [
 *         [4, 6, 8]   // Factor B₁
 *         [6, 6, 9]   // Factor B₂
 *             ⋮
 *     ],
 *     // Factor A₂
 *     [
 *         [4, 8, 9]   // Factor B₁
 *         [7, 10, 13] // Factor B₂
 *             ⋮
 *     ],
 *     ...
 * ]
 *
 * @return array [
 *     ANOVA => [
 *         factorA     => [SS, df, MS, F, P],
 *         factorB     => [SS, df, MS, F, P],
 *         interaction => [SS, df, MS, F, P],
 *         error       => [SS, df, MS],
 *         total       => [SS, df],
 *     ],
 *     total_summary   => [n, sum, mean, SS, variance, sd, sem],
 *     summary_factorA => [
 *         0 => [n, sum, mean, SS, variance, sd, sem],
 *         1 => [n, sum, mean, SS, variance, sd, sem],
 *         ...
 *     ],
 *     summary_factorB => [
 *         0 => [n, sum, mean, SS, variance, sd, sem],
 *         1 => [n, sum, mean, SS, variance, sd, sem],
 *         ...
 *     ],
 *     summary_interaction => [
 *         // indexed by factor A, then by factor B; n is the number of values in that AB cell
 *         0 => [
 *             0 => [n, sum, mean, SS, variance, sd, sem],
 *             1 => [n, sum, mean, SS, variance, sd, sem],
 *             ...
 *         ],
 *         ...
 *     ]
 * ]
 *
 * @throws BadDataException if there are fewer than two A factors or fewer than two B factors,
 *                          if the A factors have different numbers of B factors,
 *                          or if the AB cells have different numbers of values or fewer than two values each
 */
public static function twoWay(array ...$data)
{
    // Must have at least two rows (two types of factor A)
    $r = count($data);
    if ($r < 2) {
        throw new Exception\BadDataException('Must have at least two rows (two types of factor A)');
    }

    // All samples must have the same number of the second factor B
    $c = count($data[0]);
    for ($i = 1; $i < $r; $i++) {
        if (count($data[$i]) !== $c) {
            throw new Exception\BadDataException('All samples must have the same number of the second factor B');
        }
    }

    // With fewer than two columns dfB = c - 1 is zero and MSB = SSB / dfB cannot be computed
    if ($c < 2) {
        throw new Exception\BadDataException('Must have at least two columns (two types of factor B)');
    }

    // Each AB factor interaction must have the same number of values
    $v = count($data[0][0]);
    for ($i = 0; $i < $r; $i++) {
        for ($j = 0; $j < $c; $j++) {
            if (count($data[$i][$j]) !== $v) {
                throw new Exception\BadDataException('Each AB factor interaction must have the same number of values');
            }
        }
    }

    // With a single value per cell dfW = n - rc is zero and MSW = SSW / dfW cannot be computed
    if ($v < 2) {
        throw new Exception\BadDataException('Each AB factor interaction must have at least two values');
    }

    // Builds one row of a data summary table for a sample
    $summarize = static function (array $values) {
        return [
            'n'        => count($values),
            'sum'      => array_sum($values),
            'mean'     => Average::mean($values),
            'SS'       => RandomVariable::sumOfSquares($values),
            'variance' => Descriptive::sampleVariance($values),
            'sd'       => Descriptive::sd($values),
            'sem'      => RandomVariable::standardErrorOfTheMean($values),
        ];
    };

    // Aggregate the elements of each row (factor A) and summarize every AB cell
    $A_elements = [];
    $summary_AB = [];
    foreach ($data as $A => $Bs) {
        // The leading [] keeps the merge starting from an empty list
        $A_elements[$A] = array_merge([], ...$Bs);
        foreach ($Bs as $B => $values) {
            $summary_AB[$A][$B] = $summarize($values);
        }
    }

    // Aggregate the elements of each column (factor B)
    $B_elements = [];
    for ($B = 0; $B < $c; $B++) {
        $B_elements[$B] = array_merge([], ...array_column($data, $B));
    }

    // All elements, in row-major order (A₁B₁, A₁B₂, ..., A₂B₁, ...)
    $all_elements = array_merge([], ...$A_elements);

    // Summaries for factor A, factor B, and the total
    $summary_A     = array_map($summarize, $A_elements);
    $summary_B     = array_map($summarize, $B_elements);
    $summary_total = $summarize($all_elements);
    $mean_total    = $summary_total['mean'];

    // Sum of squares for a factor: ∑n(x - μ)² over that factor's sample summaries
    $sum_of_squares_factor = static function (array $summaries) use ($mean_total) {
        return array_sum(array_map(
            static function (array $summary) use ($mean_total) {
                return $summary['n'] * ($summary['mean'] - $mean_total) ** 2;
            },
            $summaries
        ));
    };
    $SSA = $sum_of_squares_factor($summary_A);
    $SSB = $sum_of_squares_factor($summary_B);

    // Sum of squares within (error) and sum of squares total, accumulated in one pass
    $SSW = 0;
    $SST = 0;
    foreach ($data as $A => $Bs) {
        foreach ($Bs as $B => $values) {
            $cell_mean = $summary_AB[$A][$B]['mean'];
            foreach ($values as $value) {
                $SSW += ($value - $cell_mean) ** 2;
                $SST += ($value - $mean_total) ** 2;
            }
        }
    }

    // Sum of squares AB interaction
    $SSAB = $SST - $SSA - $SSB - $SSW;

    // Degrees of freedom
    $dfA  = $r - 1;
    $dfB  = $c - 1;
    $dfAB = ($r - 1) * ($c - 1);
    $dfW  = $summary_total['n'] - $r * $c;
    $dfT  = $summary_total['n'] - 1;

    // Mean squares
    $MSA  = $SSA / $dfA;
    $MSB  = $SSB / $dfB;
    $MSAB = $SSAB / $dfAB;
    $MSW  = $SSW / $dfW;

    // F test statistics
    $FA  = $MSA / $MSW;
    $FB  = $MSB / $MSW;
    $FAB = $MSAB / $MSW;

    // P values
    $PA  = F::above($FA, $dfA, $dfW);
    $PB  = F::above($FB, $dfB, $dfW);
    $PAB = F::above($FAB, $dfAB, $dfW);

    // Return ANOVA report
    return [
        'ANOVA' => [
            'factorA'     => ['SS' => $SSA, 'df' => $dfA, 'MS' => $MSA, 'F' => $FA, 'P' => $PA],
            'factorB'     => ['SS' => $SSB, 'df' => $dfB, 'MS' => $MSB, 'F' => $FB, 'P' => $PB],
            'interaction' => ['SS' => $SSAB, 'df' => $dfAB, 'MS' => $MSAB, 'F' => $FAB, 'P' => $PAB],
            'error'       => ['SS' => $SSW, 'df' => $dfW, 'MS' => $MSW],
            'total'       => ['SS' => $SST, 'df' => $dfT],
        ],
        'total_summary'       => $summary_total,
        'summary_factorA'     => $summary_A,
        'summary_factorB'     => $summary_B,
        'summary_interaction' => $summary_AB,
    ];
}
/**
 * Get a report of all the descriptive statistics over a list of numbers
 * Includes mean, median, mode, range, midrange, variance, standard deviation, quartiles, etc.
 *
 * The $population flag selects the population or sample form of the variance,
 * standard deviation and skewness; the other statistics do not depend on it.
 *
 * The coefficient of variation (cv = sd / mean) is undefined when the mean is zero;
 * NAN is reported in that case instead of dividing by zero.
 *
 * @param array $numbers
 * @param bool  $population true means all possible observations of the system are present;
 *                          false means a sample is used.
 *
 * @return array [ n, min, max, mean, median, mode, range, midrange, variance, sd, cv, mean_mad,
 *                 median_mad, quartiles, midhinge, skewness, ses, kurtosis, sek, sem, ci_95, ci_99 ]
 */
public static function describe(array $numbers, bool $population = false) : array
{
    $n    = count($numbers);
    $mean = Average::mean($numbers);
    $sd   = self::standardDeviation($numbers, $population);

    return [
        'n'          => $n,
        'min'        => min($numbers),
        'max'        => max($numbers),
        'mean'       => $mean,
        'median'     => Average::median($numbers),
        'mode'       => Average::mode($numbers),
        'range'      => self::range($numbers),
        'midrange'   => self::midrange($numbers),
        'variance'   => $population ? self::populationVariance($numbers) : self::sampleVariance($numbers),
        'sd'         => $sd,
        // Guard the division: data centred on zero is valid input
        'cv'         => $mean != 0 ? $sd / $mean : \NAN,
        'mean_mad'   => self::meanAbsoluteDeviation($numbers),
        'median_mad' => self::medianAbsoluteDeviation($numbers),
        'quartiles'  => self::quartiles($numbers),
        'midhinge'   => self::midhinge($numbers),
        'skewness'   => $population ? RandomVariable::populationSkewness($numbers) : RandomVariable::skewness($numbers),
        'ses'        => RandomVariable::SES($n),
        'kurtosis'   => RandomVariable::kurtosis($numbers),
        'sek'        => RandomVariable::SEK($n),
        'sem'        => RandomVariable::standardErrorOfTheMean($numbers),
        'ci_95'      => RandomVariable::confidenceInterval($mean, $n, $sd, 95),
        'ci_99'      => RandomVariable::confidenceInterval($mean, $n, $sd, 99),
    ];
}