/**
 * Handles the particulars of editing a classifier, which includes changing
 * its label and adding training examples.
 *
 * This activity directly handles changing the class label, but not adding
 * training examples. The latter is done interactively, without reloading
 * the page, via XmlHttpRequests coordinated by the classifier controller
 * dedicated to that task.
 *
 * A rename is carried out only when the request asks for an update, the
 * sanitized new label is non-empty, and no classifier with that label is
 * already present in $classifiers. If the label is taken, a message is
 * queued for display instead; if nothing is left of the label after
 * sanitizing, the rename is ignored.
 *
 * Besides the view fields, this method appends to $data['SCRIPT'] the
 * translated strings used by the classification JavaScript and the call
 * that initializes it with a fresh authentication token.
 *
 * @param array $data data to be passed on to the view (modified in place)
 * @param array $classifiers map from class labels to their associated
 *     classifiers
 * @param array $machine_urls string urls of machines managed by this
 *     Yioop name server
 */
function editClassifier(&$data, $classifiers, $machine_urls)
{
    $parent = $this->parent;
    $data['ELEMENT'] = 'editclassifier';
    $data['INCLUDE_SCRIPTS'] = array('classifiers');
    // Everything below appends to SCRIPT, so make sure it exists first.
    if (!isset($data['SCRIPT'])) {
        $data['SCRIPT'] = '';
    }
    // We want recrawls, but not archive crawls.
    $data['CRAWLS'] = $parent->model("crawl")->getCrawlList(false, true,
        $machine_urls);
    $classifier = $classifiers[$data['class_label']];

    $update_requested = isset($_REQUEST['update']) &&
        $_REQUEST['update'] == 'update';
    if ($update_requested && isset($_REQUEST['rename_label'])) {
        $new_label = substr(
            $parent->clean($_REQUEST['rename_label'], 'string'),
            0, NAME_LEN);
        $new_label = preg_replace('/[^a-zA-Z0-9_]/', '', $new_label);
        if ($new_label === null || $new_label === '') {
            /* Sanitizing removed every character (or the regex failed).
               Renaming to an empty label would store the classifier
               under '' and delete the original, so leave it untouched.
             */
        } else if (!isset($classifiers[$new_label])) {
            $old_label = $classifier->class_label;
            $classifier->class_label = $new_label;
            // Save under the new label before removing the old copy.
            Classifier::setClassifier($classifier);
            Classifier::deleteClassifier($old_label);
            $data['class_label'] = $new_label;
        } else {
            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" .
                tl('crawl_component_classifier_exists') .
                '</h1>\');';
        }
    }
    $data['classifier'] = $classifier;

    // Translations for the classification javascript.
    $translations = array(
        'crawl_component_load_failed' =>
            tl('crawl_component_load_failed'),
        'crawl_component_loading' =>
            tl('crawl_component_loading'),
        'crawl_component_added_examples' =>
            tl('crawl_component_added_examples'),
        'crawl_component_label_update_failed' =>
            tl('crawl_component_label_update_failed'),
        'crawl_component_updating' =>
            tl('crawl_component_updating'),
        'crawl_component_acc_update_failed' =>
            tl('crawl_component_acc_update_failed'),
        'crawl_component_na' =>
            tl('crawl_component_na'),
        'crawl_component_no_docs' =>
            tl('crawl_component_no_docs'),
        'crawl_component_num_docs' =>
            tl('crawl_component_num_docs'),
        'crawl_component_in_class' =>
            tl('crawl_component_in_class'),
        'crawl_component_not_in_class' =>
            tl('crawl_component_not_in_class'),
        'crawl_component_skip' =>
            tl('crawl_component_skip'),
        'crawl_component_prediction' =>
            tl('crawl_component_prediction'),
        'crawl_component_scores' =>
            tl('crawl_component_scores'),
    );
    $entries = array();
    foreach ($translations as $identifier => $translated) {
        $entries[] = $identifier . ':"' . $translated . '"';
    }
    $data['SCRIPT'] .= "window.tl = {" . implode(',', $entries) . '};';

    /* We pass along authentication information to the client, so that it
       can authenticate any XmlHttpRequests that it makes in order to label
       documents. */
    $time = strval(time());
    $session = md5($time . AUTH_KEY);
    $data['SCRIPT'] .=
        "Classifier.initialize(" .
            "'{$data['class_label']}'," .
            "'{$session}'," .
            "'{$time}');";
}
/**
 * This is the function that should be called to get the
 * classifier_trainer to start training a logistic regression instance for
 * a particular classifier. The class label corresponding to the
 * classifier to be finalized should be passed as the second command-line
 * argument.
 *
 * The classifier is loaded by label, finalized, and saved again, after
 * which the daemon is stopped. If no classifier can be loaded for the
 * given label, that fact is logged and the daemon is stopped without
 * training.
 */
function start()
{
    global $argv;
    CrawlDaemon::init($argv, "classifier_trainer");
    // Fall back to null (what a missing element evaluates to) without
    // raising a notice when the label argument was not supplied.
    $label = isset($argv[2]) ? $argv[2] : null;
    crawlLog("Initializing classifier trainer log..",
        $label . '-classifier_trainer', true);
    $classifier = Classifier::getClassifier($label);
    if (!$classifier) {
        // Nothing to finalize; still stop the daemon as the normal path
        // does, rather than dying on a method call on a non-object.
        crawlLog("No classifier found for label '$label'; " .
            "nothing to train.\n");
        CrawlDaemon::stop('classifier_trainer', $label);
        return;
    }
    $classifier->prepareToFinalize();
    $classifier->finalize();
    Classifier::setClassifier($classifier);
    crawlLog("Training complete.\n");
    CrawlDaemon::stop('classifier_trainer', $label);
}
/**
 * Finalizes the current classifier, uses it to classify all test
 * documents, and logs the classification error. The current classifier is
 * saved to disk after finalizing (though not before), and left in
 * `classify' mode. The iterator over the test dataset is reset for the
 * next round of testing (if any).
 *
 * A page counts as misclassified when the label derived from its score
 * (1 for a score of at least 0.5, otherwise -1) differs from the page's
 * TRUE_LABEL field. When the test dataset yields no pages at all, the
 * error is logged as n/a instead of being computed.
 *
 * @param object $classifier classifier instance to test
 * @param array $data the array of training and test datasets, constructed
 *     by loadDataset, of which only the `test' dataset is used.
 */
function testClassifier($classifier, $data)
{
    $classifier->prepareToFinalize();
    $classifier->finalize();
    Classifier::setClassifier($classifier);
    $classifier->prepareToClassify();
    $wrong = 0;
    $total = 0;
    $pages = $data['test'];
    while (!$pages->end_of_iterator) {
        $page = $pages->nextPage();
        $score = $classifier->classify($page);
        $page_label = $score >= 0.5 ? 1 : -1;
        if ($page_label != $page['TRUE_LABEL']) {
            $wrong++;
        }
        $total++;
    }
    if ($total > 0) {
        $error = (double) $wrong / $total;
        $this->log(0, 'error = %.4f', $error);
    } else {
        // An empty test set has no defined error rate; avoid dividing by
        // zero so that the iterator below is still reset.
        $this->log(0, 'error = %s', 'n/a');
    }
    $pages->reset();
}
/**
 * Finds the next document for which to request a label, sometimes first
 * recording the label that the user selected for the last document. This
 * method should only be called via an XmlHttpRequest initiated by the edit
 * classifier JavaScript, and consequently it always writes out
 * JSON-encoded data, which is easily decoded by the page JavaScript.
 *
 * Request fields read: arg (one of getdocs, addlabel, updateaccuracy),
 * label (the class label of the classifier to work on), and optionally
 * index, type, keywords, and doc_to_label (an array with key and label
 * entries). Fields that are absent are treated as null.
 *
 * The response always carries the classifier's positive, negative, total,
 * and accuracy statistics together with a new authentication token
 * (authTime, authSession); depending on the activity it may also carry
 * num_docs, new_doc, or add_count.
 */
function classify()
{
    /* Give every optional request field a null default, which is the
       value a missing field evaluated to anyway. Reading missing fields
       directly raises PHP notices, and when error display is enabled
       those are printed ahead of the headers and JSON written below. */
    $request = $_REQUEST + array(
        'arg' => null,
        'label' => null,
        'type' => null,
        'keywords' => null,
    );
    $arg = $this->clean($request['arg'], 'string');
    $label = $this->clean($request['label'], 'string');
    // These are used by the getdocs and addlabel activities even when the
    // request carried no index, so they must always be defined.
    $index = null;
    $source_type = null;
    $keywords = null;
    if (isset($request['index'])) {
        $index = $this->clean($request['index'], 'int');
        if (intval($index) == 1) {
            $index = $this->model("crawl")->getCurrentIndexDatabaseName();
        }
        $source_type = $this->clean($request['type'], 'string');
        $keywords = $this->clean($request['keywords'], 'string');
    }
    /*
       The call to prepareToLabel is important; it loads all of the data
       required to manage the training set from disk, and also determines
       what will be saved *back* to disk later.
     */
    $classifier = Classifier::getClassifier($label);
    $classifier->prepareToLabel();
    $data = array();
    switch ($arg) {
        case 'getdocs':
            /*
               Load documents in from a user-specified index, and find the
               next best one to label (for 'manual' source type), or label
               them all with a single label (for either the 'positive' or
               'negative' source types).
             */
            $mix_iterator = $this->buildClassifierCrawlMix(
                $label, $index, $keywords);
            if ($source_type == 'manual') {
                $num_docs = $classifier->initBuffer($mix_iterator);
                $classifier->computeBufferDensities();
                $data['num_docs'] = $num_docs;
                list($new_doc, $disagreement) =
                    $classifier->findNextDocumentToLabel();
                if ($new_doc) {
                    $score = $classifier->classify($new_doc);
                    $data['new_doc'] = $this->prepareUnlabelledDocument(
                        $new_doc, $score, $disagreement,
                        $index, $keywords);
                }
                Classifier::setClassifier($classifier);
            } else if ($source_type == 'positive' ||
                $source_type == 'negative') {
                $doc_label = $source_type == 'positive' ? 1 : -1;
                $add_count = $classifier->addAllDocuments(
                    $mix_iterator, $doc_label);
                if ($add_count > 0) {
                    /*
                       Pass true to always update accuracy after adding a
                       batch of documents all at once.
                     */
                    $classifier->train(true);
                    Classifier::setClassifier($classifier);
                }
                $data['add_count'] = $add_count;
            }
            break;
        case 'addlabel':
            /*
               First label the last candidate document presented to the
               user (potentially skipping it instead of actually applying a
               label), then pick the next best candidate for labeling.
               When skipping a document instead of adding a label, avoid
               re-training since the training set hasn't actually changed.
             */
            $doc = array();
            if (isset($_REQUEST['doc_to_label']) &&
                is_array($_REQUEST['doc_to_label'])) {
                $doc = $_REQUEST['doc_to_label'];
            }
            $doc += array('key' => null, 'label' => null);
            $key = webdecode($this->clean($doc['key'], 'string'));
            $doc_label = $this->clean($doc['label'], 'int');
            $mix_iterator = $this->retrieveClassifierCrawlMix($label);
            $labels_changed = $classifier->labelDocument($key, $doc_label);
            $num_docs = $classifier->refreshBuffer($mix_iterator);
            $classifier->computeBufferDensities();
            $data['num_docs'] = $num_docs;
            if ($labels_changed) {
                // Accuracy is only refreshed on every tenth labelled
                // document; other rounds just re-train.
                $update_accuracy = $classifier->total > 0 &&
                    $classifier->total % 10 == 0;
                $classifier->train($update_accuracy);
            }
            list($new_doc, $disagreement) =
                $classifier->findNextDocumentToLabel();
            if ($new_doc) {
                $score = $classifier->classify($new_doc);
                $data['new_doc'] = $this->prepareUnlabelledDocument(
                    $new_doc, $score, $disagreement,
                    $index, $keywords);
            }
            Classifier::setClassifier($classifier);
            break;
        case 'updateaccuracy':
            /*
               Don't do anything other than re-compute the accuracy for the
               current training set.
             */
            $classifier->updateAccuracy();
            Classifier::setClassifier($classifier);
            break;
    }
    /*
       No matter which activity we ended up carrying out, always include
       the statistics that *might* have changed so that the client can just
       naively keep them up to date.
     */
    $data['positive'] = $classifier->positive;
    $data['negative'] = $classifier->negative;
    $data['total'] = $classifier->total;
    $data['accuracy'] = $classifier->accuracy;
    /*
       Pass along a new authentication token so that the client can make a
       new authenticated request after this one.
     */
    $data['authTime'] = strval(time());
    $data['authSession'] = md5($data['authTime'] . AUTH_KEY);
    $response = json_encode($data);
    header("Content-Type: application/json");
    header("Content-Length: " . strlen($response));
    echo $response;
}