This classifier is based on *Tackling the Poor Assumptions of Naive Bayes Text Classifiers* by Jason Rennie
Author: Cam Spiers (camspiers@gmail.com)
Inheritance: extends Classifier
<?php

// Benchmark script: trains ComplementNaiveBayes on the 20news-bydate training
// directory and prints, per category, the fraction of test documents for which
// $classifier->is($category, $document) holds.

// Raise the memory limit before the corpus is loaded; presumably the default
// limit is too small for this data set (confirm before lowering).
ini_set('memory_limit', '512M');
require_once __DIR__ . '/../vendor/autoload.php';
use Camspiers\StatisticalClassifier\DataSource\Directory;
use Camspiers\StatisticalClassifier\Classifier\ComplementNaiveBayes;
// Newsgroup categories passed as the 'include' option of both data sources.
$cats = array('alt.atheism', 'comp.graphics', 'rec.motorcycles', 'sci.crypt', 'soc.religion.christian', 'talk.religion.misc');
// Reference figures in the same "category: ratio" format this script prints.
// They were labelled "bayes" by the original author (presumably an earlier
// baseline run -- confirm before relying on them):
//alt.atheism: 0.81504702194357
//comp.graphics: 0.88688946015424
//rec.motorcycles: 0.98743718592965
//sci.crypt: 0.93939393939394
//soc.religion.christian: 0.90954773869347
//talk.religion.misc: 0.70517928286853
$classifier = new ComplementNaiveBayes(new Directory(array('directory' => __DIR__ . '/../resources/20news-bydate/20news-bydate-train', 'include' => $cats)));
$testSource = new Directory(array('directory' => __DIR__ . '/../resources/20news-bydate/20news-bydate-test', 'include' => $cats));
$data = $testSource->getData();
// $stats[$category] = array(number of matching documents, total documents).
$stats = array();
foreach ($data as $category => $documents) {
    $stats[$category] = array(0, count($documents));
    foreach ($documents as $document) {
        if ($classifier->is($category, $document)) {
            $stats[$category][0]++;
        }
    }
}
// A dedicated loop variable is used here so the test data in $data is not
// overwritten while reporting.
foreach ($stats as $category => $counts) {
    // A category without any test documents has no ratio; dividing by its
    // zero total would raise a division-by-zero error/warning.
    if ($counts[1] === 0) {
        echo $category, ': n/a (no test documents)', PHP_EOL;
        continue;
    }
    echo $category, ': ', $counts[0] / $counts[1], PHP_EOL;
}
<?php

require_once __DIR__ . '/../vendor/autoload.php';
// Minimal example: an in-memory DataArray source keeps the setup short.
use Camspiers\StatisticalClassifier\DataSource\DataArray;
use Camspiers\StatisticalClassifier\Classifier\ComplementNaiveBayes;
// Initial training set handed to the data source constructor, one
// category/document pair per entry.
$initialDocuments = array(
    array('category' => 'spam', 'document' => 'Some spam document'),
    array('category' => 'spam', 'document' => 'Another spam document'),
    array('category' => 'ham', 'document' => 'Some ham document'),
    array('category' => 'ham', 'document' => 'Another ham document'),
);
$trainingSource = new DataArray($initialDocuments);
// Further documents can be appended after construction.
$trainingSource->addDocument('spam', 'Another spam document');
$trainingSource->addDocument('ham', 'Another ham document');
$spamFilter = new ComplementNaiveBayes($trainingSource);
// Print the result of classifying a sample document, followed by a newline.
$predictedCategory = $spamFilter->classify("Some ham document");
echo $predictedCategory, PHP_EOL;
<?php

// Language detection example: every file in resources/language-samples is
// added as one training document whose category is the file's base name, then
// a handful of sample texts are classified and the results printed.
require_once __DIR__ . '/../vendor/autoload.php';
// Make the mb_* string functions default to UTF-8 (presumably the encoding of
// the sample files -- confirm).
mb_internal_encoding('UTF-8');
use Camspiers\StatisticalClassifier\DataSource\DataArray;
use Camspiers\StatisticalClassifier\Classifier\ComplementNaiveBayes;
$source = new DataArray();
$samplesDirectory = __DIR__ . '/../resources/language-samples';
// is_dir() rather than file_exists(): a regular file with that name would pass
// file_exists() but cannot contain any samples.
if (!is_dir($samplesDirectory)) {
    throw new Exception('Please extract language-samples.zip in resources/');
}
$sampleFiles = glob($samplesDirectory . '/*');
// glob() returns false on error (and, on some systems, when nothing matches);
// iterating over that would only emit a warning and train on nothing.
if ($sampleFiles === false || count($sampleFiles) === 0) {
    throw new Exception('No language samples found in resources/language-samples');
}
foreach ($sampleFiles as $file) {
    // Only regular files hold sample text; skip sub-directories and the like.
    if (!is_file($file)) {
        continue;
    }
    $contents = file_get_contents($file);
    // file_get_contents() returns false on failure; do not train on that.
    if ($contents === false) {
        throw new Exception('Unable to read language sample: ' . $file);
    }
    $source->addDocument(basename($file), $contents);
}
$nb = new ComplementNaiveBayes($source);
// Sample texts to classify, one result line is printed per entry.
$examples = array(
    "Agricultura (-ae, f.), sensu latissimo, est summa omnium artium et scientiarum et technologiarum quae de terris colendis et animalibus creandis curant, ut poma, frumenta, charas, carnes, textilia, et aliae res e terra bene producantur. Specialius, agronomia est ars et scientia quae terris colendis student, agricultio autem animalibus creandis.",
    "El llatí és una llengua indoeuropea de la branca itàlica, parlada antigament pels romans. A partir de l'evolució de la seva versió vulgar en sorgiren les llengües romàniques que sobreviuen avui dia.",
    "hola",
    "Hi there, this is a tiny text",
    "* This file implements in memory hash tables with insert/del/replace/find/\n             * get-random-element operations. Hash tables will auto resize if needed\n              * tables of power of two in size are used, collisions are handled by\n               * chaining. See the source code for more information... :)",
    "House of Cards is an American political drama series developed and produced by Beau Willimon. It is an adaptation of a previous BBC miniseries of the same name which is based on the novel by Michael Dobbs. The entire first season premiered on February 1, 2013, on the streaming service Netflix.[1] A second season of 13 episodes is currently in production.[1][2]",
);
foreach ($examples as $example) {
    echo $nb->classify($example), PHP_EOL;
}