/**
 * Two-way ANOVA
 * Examines the influence of two different categorical independent variables on
 * one continuous dependent variable. The two-way ANOVA not only aims at assessing
 * the main effect of each independent variable but also if there is any interaction
 * between them (using the F distribution).
 * https://en.wikipedia.org/wiki/Two-way_analysis_of_variance
 *
 * Produces the following analysis of the data:
 *
 * ANOVA hypothesis test summary data
 *
 *             | SS | df | MS | F | P |
 * Factor A    |    |    |    |   |   |
 * Factor B    |    |    |    |   |   |
 * Interaction |    |    |    |   |   |
 * Error       |    |    |    |
 * Total       |    |    |
 *
 *  where:
 *   Interaction = Factor A X Factor B working together
 *   Error is within groups
 *   SS = Sum of squares
 *   df = Degrees of freedom
 *   MS = Mean squares
 *   F  = F statistic
 *   P  = P value
 *
 * Data summary tables for:
 *   Factor A
 *   Factor B
 *   Interaction (each Factor A X Factor B cell)
 *   Total
 *
 *       | N | Sum | Mean | SS | Variance | SD | SEM |
 * 0     |   |     |      |    |          |    |     |
 * 1     |   |     |      |    |          |    |     |
 * ...   |   |     |      |    |          |    |     |
 * Total |   |     |      |    |          |    |     |
 *
 *  where:
 *   Each row is the summary for a sample, numbered from 0 to m - 1
 *   m   = Number of samples
 *   N   = Sample size
 *   SS  = Sum of squares
 *   SD  = Standard deviation
 *   SEM = Standard error of the mean
 *
 * Calculations
 *
 * Sum of Squares
 * SST (sum of squares total)
 * ∑⟮xᵢ − μ⟯²
 *  where:
 *   xᵢ = each element of all samples
 *   μ  = mean total of all elements of all samples
 *
 * SSA, SSB (sum of squares for each factor A and B)
 * ∑n(x - μ)²
 *  where:
 *   n = number of elements in the factor level (row for A, column for B)
 *   x = mean of the factor level
 *   μ = mean total of all elements of all samples
 *
 * SSW (sum of squares within - error)
 * ∑∑⟮x − μ⟯²  Sum of squared deviations within each AB cell, summed over all cells
 *  where:
 *   x = each element of an AB cell
 *   μ = mean of that AB cell
 *
 * SSAB (sum of squares AB - interaction)
 * SSAB = SST - SSA - SSB - SSW;
 *
 * Degrees of Freedom
 * dfT (degrees of freedom for the total)
 * n - 1
 *
 * dfA (degrees of freedom factor A)
 * r - 1
 *
 * dfB (degrees of freedom factor B)
 * c - 1
 *
 * dfAB (degrees of freedom factor AB - interaction)
 * (r - 1)(c - 1)
 *
 * dfW (degrees of freedom within - error)
 * n - rc
 *
 *  where:
 *   n = total number of elements of all samples
 *   r = number of rows (number of factor As)
 *   c = number of columns (number of factor Bs)
 *
 * Mean Squares
 * MSA (Mean squares factor A)
 * SSA / dfA
 *
 * MSB (Mean squares factor B)
 * SSB / dfB
 *
 * MSAB (Mean squares factor AB - interaction)
 * SSAB / dfAB
 *
 * MSW (Mean squares within - error)
 * SSW / dfW
 *
 * F Test Statistics
 * FA  = MSA / MSW
 * FB  = MSB / MSW
 * FAB = MSAB / MSW
 *
 * P values
 * PA  = F distribution CDF above FA with degrees of freedom dfA and dfW
 * PB  = F distribution CDF above FB with degrees of freedom dfB and dfW
 * PAB = F distribution CDF above FAB with degrees of freedom dfAB and dfW
 *
 * Example input data for ...$data parameter:
 *             | Factor B₁ | Factor B₂ | ⋯
 *   Factor A₁ |  4, 6, 8  |  6, 6, 9  | ⋯
 *   Factor A₂ |  4, 8, 9  | 7, 10, 13 | ⋯
 *      ⋮           ⋮           ⋮         ⋮
 *
 * @param array ...$data Samples to analyze [
 *     // Factor A₁
 *     [
 *         [4, 6, 8]   // Factor B₁
 *         [6, 6, 9]   // Factor B₂
 *             ⋮
 *     ],
 *     // Factor A₂
 *     [
 *         [4, 8, 9]   // Factor B₁
 *         [7, 10, 13] // Factor B₂
 *             ⋮
 *     ],
 *     ...
 * ]
 *
 * @return array [
 *     ANOVA => [
 *         factorA     => [SS, df, MS, F, P],
 *         factorB     => [SS, df, MS, F, P],
 *         interaction => [SS, df, MS, F, P],
 *         error       => [SS, df, MS],
 *         total       => [SS, df],
 *     ],
 *     total_summary   => [n, sum, mean, SS, variance, sd, sem],
 *     summary_factorA => [
 *         0 => [n, sum, mean, SS, variance, sd, sem],
 *         1 => [n, sum, mean, SS, variance, sd, sem],
 *         ...
 *     ],
 *     summary_factorB => [
 *         0 => [n, sum, mean, SS, variance, sd, sem],
 *         1 => [n, sum, mean, SS, variance, sd, sem],
 *         ...
 *     ],
 *     summary_interaction => [
 *         // Indexed by factor A, then by factor B
 *         0 => [
 *             0 => [n, sum, mean, SS, variance, sd, sem],
 *             1 => [n, sum, mean, SS, variance, sd, sem],
 *             ...
 *         ],
 *         ...
 *     ]
 * ]
 *
 * @throws BadDataException if less than two A factors, if less than two B factors,
 *                          if B factors or values have different number elements,
 *                          or if an AB interaction has less than two values
 */
public static function twoWay(array ...$data)
{
    // Must have at least two rows (two types of factor A)
    $r = count($data);
    if ($r < 2) {
        throw new Exception\BadDataException('Must have at least two rows (two types of factor A)');
    }

    // All samples must have the same number the second factor B
    $c = count($data[0]);
    for ($i = 1; $i < $r; $i++) {
        if (count($data[$i]) !== $c) {
            throw new Exception\BadDataException('All samples must have the same number of the second factor B');
        }
    }

    // With fewer than two columns dfB = c - 1 is zero and SSB / dfB cannot be computed
    if ($c < 2) {
        throw new Exception\BadDataException('Must have at least two columns (two types of factor B)');
    }

    // Each AB factor interaction must have the same number of values
    $v = count($data[0][0]);
    for ($i = 0; $i < $r; $i++) {
        for ($j = 0; $j < $c; $j++) {
            if (count($data[$i][$j]) !== $v) {
                throw new Exception\BadDataException('Each AB factor interaction must have the same number of values');
            }
        }
    }

    // With a single value per cell dfW = n - rc is zero and SSW / dfW cannot be computed
    if ($v < 2) {
        throw new Exception\BadDataException('Each AB factor interaction must have at least two values');
    }

    // Builds the summary row shared by the factor A, factor B, interaction, and total tables.
    // 'n' is always the number of values summarized.
    $summarize = static function (array $values) {
        return [
            'n'        => count($values),
            'sum'      => array_sum($values),
            'mean'     => Average::mean($values),
            'SS'       => RandomVariable::sumOfSquares($values),
            'variance' => Descriptive::sampleVariance($values),
            'sd'       => Descriptive::sd($values),
            'sem'      => RandomVariable::standardErrorOfTheMean($values),
        ];
    };

    // Aggregates for all elements, rows (factor A), and columns (factor B)
    $all_elements = [];
    $A_elements   = [];
    $B_elements   = [];

    // Summary data for each AB
    // And aggregate all elements and elements for factor A
    $summary_AB = [];
    foreach ($data as $A => $Bs) {
        $A_elements[$A] = [];
        foreach ($Bs as $B => $values) {
            $all_elements       = array_merge($all_elements, $values);
            $A_elements[$A]     = array_merge($A_elements[$A], $values);
            $summary_AB[$A][$B] = $summarize($values);
        }
    }

    // Aggregate elements for factor B
    for ($B = 0; $B < $c; $B++) {
        $B_elements[$B] = [];
        foreach ($data as $Bs) {
            $B_elements[$B] = array_merge($B_elements[$B], $Bs[$B]);
        }
    }

    // Factor A, factor B, and totals summaries (array_map keeps the factor indexes as keys)
    $summary_A     = array_map($summarize, $A_elements);
    $summary_B     = array_map($summarize, $B_elements);
    $summary_total = $summarize($all_elements);
    $μ             = $summary_total['mean'];

    // Weighted squared deviation of a factor level mean from the grand mean
    $level_deviation = static function (array $level) use ($μ) {
        return $level['n'] * ($level['mean'] - $μ) ** 2;
    };

    // Sum of squares factor A and factor B
    $SSA = array_sum(array_map($level_deviation, $summary_A));
    $SSB = array_sum(array_map($level_deviation, $summary_B));

    // Sum of squares within (error) and sum of squares total
    $SSW = 0;
    $SST = 0;
    foreach ($data as $A => $Bs) {
        foreach ($Bs as $B => $values) {
            $cell_mean = $summary_AB[$A][$B]['mean'];
            foreach ($values as $value) {
                $SSW += ($value - $cell_mean) ** 2;
                $SST += ($value - $μ) ** 2;
            }
        }
    }

    // Sum of squares AB interaction
    $SSAB = $SST - $SSA - $SSB - $SSW;

    // Degrees of freedom
    $dfA  = $r - 1;
    $dfB  = $c - 1;
    $dfAB = ($r - 1) * ($c - 1);
    $dfW  = $summary_total['n'] - $r * $c;
    $dfT  = $summary_total['n'] - 1;

    // Mean squares
    $MSA  = $SSA / $dfA;
    $MSB  = $SSB / $dfB;
    $MSAB = $SSAB / $dfAB;
    $MSW  = $SSW / $dfW;

    // F test statistics
    $FA  = $MSA / $MSW;
    $FB  = $MSB / $MSW;
    $FAB = $MSAB / $MSW;

    // P values
    $PA  = F::above($FA, $dfA, $dfW);
    $PB  = F::above($FB, $dfB, $dfW);
    $PAB = F::above($FAB, $dfAB, $dfW);

    // Return ANOVA report
    return [
        'ANOVA' => [
            'factorA'     => ['SS' => $SSA, 'df' => $dfA, 'MS' => $MSA, 'F' => $FA, 'P' => $PA],
            'factorB'     => ['SS' => $SSB, 'df' => $dfB, 'MS' => $MSB, 'F' => $FB, 'P' => $PB],
            'interaction' => ['SS' => $SSAB, 'df' => $dfAB, 'MS' => $MSAB, 'F' => $FAB, 'P' => $PAB],
            'error'       => ['SS' => $SSW, 'df' => $dfW, 'MS' => $MSW],
            'total'       => ['SS' => $SST, 'df' => $dfT],
        ],
        'total_summary'       => $summary_total,
        'summary_factorA'     => $summary_A,
        'summary_factorB'     => $summary_B,
        'summary_interaction' => $summary_AB,
    ];
}
/**
 * The probability associated with the regression F statistic
 *
 * F probability = F distribution CDF(F,d₁,d₂)
 *
 *  where:
 *    F  = F statistic
 *    d₁ = degrees of freedom 1 (n - ν - 1)
 *    d₂ = degrees of freedom 2 (ν)
 *
 *    n = value of $this->n
 *    ν = degrees of freedom ($this->ν)
 *
 * @return number
 */
public function FProbability()
{
    // Evaluate the F statistic first, before reading the object's properties
    $f_statistic = $this->FStatistic();

    // Degrees of freedom
    // Need to make sure the 1 in $df1 should not be $this->fit_parameters;
    $df2 = $this->ν;
    $df1 = $this->n - $df2 - 1;

    return F::CDF($f_statistic, $df1, $df2);
}