Esempio n. 1
0
 /**
  * Handles the particulars of editing a classifier, which includes changing
  * its label and adding training examples.
  *
  * This activity directly handles changing the class label, but not adding
  * training examples. The latter activity is done interactively without
  * reloading the page via XmlHttpRequests, coordinated by the classifier
  * controller dedicated to that task.
  *
  * @param array $data data to be passed on to the view
  * @param array $classifiers map from class labels to their associated
  *    classifiers
  * @param array $machine_urls string urls of machines managed by this
  *    Yioop name server
  */
 function editClassifier(&$data, $classifiers, $machine_urls)
 {
     /*
       Prepares $data for the edit-classifier view: selects the view
       element and script, lists the crawls offered as example sources,
       optionally renames the classifier, and appends the JavaScript that
       boots the client-side labeling code.
     */
     $parent = $this->parent;
     $data['ELEMENT'] = 'editclassifier';
     $data['INCLUDE_SCRIPTS'] = array('classifiers');
     /*
       Script fragments are appended with `.=` below, so make sure the
       entry exists even when the caller has not initialized it.
     */
     if (!isset($data['SCRIPT'])) {
         $data['SCRIPT'] = "";
     }
     // We want recrawls, but not archive crawls.
     $crawls = $parent->model("crawl")->getCrawlList(false, true, $machine_urls);
     $data['CRAWLS'] = $crawls;
     // NOTE(review): assumes $data['class_label'] is always a key of
     // $classifiers -- confirm against the caller.
     $classifier = $classifiers[$data['class_label']];
     $rename_requested = isset($_REQUEST['update']) && $_REQUEST['update'] == 'update' && isset($_REQUEST['rename_label']);
     if ($rename_requested) {
         $new_label = substr($parent->clean($_REQUEST['rename_label'], 'string'), 0, NAME_LEN);
         // Only letters, digits and underscores survive; the cast turns a
         // failed preg_replace (null) into an empty string.
         $new_label = (string) preg_replace('/[^a-zA-Z0-9_]/', '', $new_label);
         if ($new_label !== '' && !isset($classifiers[$new_label])) {
             /*
               Save under the new label first and only then remove the
               old copy, so the classifier is never missing from disk.
             */
             $old_label = $classifier->class_label;
             $classifier->class_label = $new_label;
             Classifier::setClassifier($classifier);
             Classifier::deleteClassifier($old_label);
             $data['class_label'] = $new_label;
         } else if ($new_label !== '') {
             $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">" . tl('crawl_component_classifier_exists') . '</h1>\');';
         }
         /*
           A label that sanitizes to the empty string is ignored: renaming
           to it would store a classifier with no label and delete the
           current one.
         */
     }
     $data['classifier'] = $classifier;
     /*
       Translations for the classification javascript. Every tl() call
       keeps its literal identifier (in case tooling scans the source for
       them), and the values are JSON-encoded so that quotes, backslashes
       or newlines inside a translation cannot break the emitted script.
     */
     $translations = array(
         'crawl_component_load_failed' => tl('crawl_component_load_failed'),
         'crawl_component_loading' => tl('crawl_component_loading'),
         'crawl_component_added_examples' => tl('crawl_component_added_examples'),
         'crawl_component_label_update_failed' => tl('crawl_component_label_update_failed'),
         'crawl_component_updating' => tl('crawl_component_updating'),
         'crawl_component_acc_update_failed' => tl('crawl_component_acc_update_failed'),
         'crawl_component_na' => tl('crawl_component_na'),
         'crawl_component_no_docs' => tl('crawl_component_no_docs'),
         'crawl_component_num_docs' => tl('crawl_component_num_docs'),
         'crawl_component_in_class' => tl('crawl_component_in_class'),
         'crawl_component_not_in_class' => tl('crawl_component_not_in_class'),
         'crawl_component_skip' => tl('crawl_component_skip'),
         'crawl_component_prediction' => tl('crawl_component_prediction'),
         'crawl_component_scores' => tl('crawl_component_scores')
     );
     $encoded_translations = json_encode($translations);
     if ($encoded_translations === false) {
         // Encoding failed (e.g. malformed UTF-8 in a translation); emit
         // an empty table rather than syntactically invalid JavaScript.
         $encoded_translations = '{}';
     }
     $data['SCRIPT'] .= "window.tl = " . $encoded_translations . ";";
     /*
       We pass along authentication information to the client, so that it
       can authenticate any XmlHttpRequests that it makes in order to label
       documents.
     */
     $time = strval(time());
     $session = md5($time . AUTH_KEY);
     $data['SCRIPT'] .= "Classifier.initialize(" . "'{$data['class_label']}'," . "'{$session}'," . "'{$time}');";
 }
Esempio n. 2
0
 /**
  * This is the function that should be called to get the
  * classifier_trainer to start training a logistic regression instance for
  * a particular classifier. The class label corresponding to the
  * classifier to be finalized should be passed as the second command-line
  * argument.
  */
 function start()
 {
     /*
       Entry point of the classifier trainer process: loads the classifier
       named by the second command-line argument, finalizes it, saves it
       back, and then stops the daemon.
     */
     global $argv;
     CrawlDaemon::init($argv, "classifier_trainer");
     // Read the label without raising a notice when it was not supplied.
     $label = isset($argv[2]) ? $argv[2] : null;
     crawlLog("Initializing classifier trainer log..", $label . '-classifier_trainer', true);
     $classifier = Classifier::getClassifier($label);
     if (!$classifier) {
         /*
           Nothing could be loaded for this label. Bail out through the
           normal stop call instead of failing on a method call on a
           non-object, which would skip the daemon shutdown below.
         */
         crawlLog("No classifier found for label '{$label}'; nothing to train.\n");
         CrawlDaemon::stop('classifier_trainer', $label);
         return;
     }
     $classifier->prepareToFinalize();
     $classifier->finalize();
     // Persist the finalized classifier before announcing completion.
     Classifier::setClassifier($classifier);
     crawlLog("Training complete.\n");
     CrawlDaemon::stop('classifier_trainer', $label);
 }
Esempio n. 3
0
 /**
  * Finalizes the current classifier, uses it to classify all test
  * documents, and logs the classification error.  The current classifier is
  * saved to disk after finalizing (though not before), and left in
  * `classify' mode. The iterator over the test dataset is reset for the
  * next round of testing (if any).
  *
  * @param object $classifier classifier instance to test
  * @param array $data the array of training and test datasets, constructed
  * by loadDataset, of which only the `test' dataset it used.
  */
 function testClassifier($classifier, $data)
 {
     /*
       Finalize and save the classifier, then switch it to classify mode
       so that it can score the test documents below.
     */
     $classifier->prepareToFinalize();
     $classifier->finalize();
     Classifier::setClassifier($classifier);
     $classifier->prepareToClassify();
     $wrong = 0;
     $total = 0;
     $pages = $data['test'];
     while (!$pages->end_of_iterator) {
         $page = $pages->nextPage();
         $score = $classifier->classify($page);
         // Scores of at least 0.5 count as the positive class (1),
         // anything lower as the negative class (-1).
         $page_label = $score >= 0.5 ? 1 : -1;
         if ($page_label != $page['TRUE_LABEL']) {
             $wrong++;
         }
         $total++;
     }
     if ($total > 0) {
         $error = (double) $wrong / $total;
         $this->log(0, 'error = %.4f', $error);
     } else {
         /*
           An empty test set has no defined error rate; dividing by zero
           here would raise a warning (or a DivisionByZeroError on PHP 8).
         */
         $this->log(0, 'error = %s', 'n/a (no test documents)');
     }
     // Rewind the test iterator so another round of testing can reuse it.
     $pages->reset();
 }
Esempio n. 4
0
 /**
  * Finds the next document for which to request a label, sometimes first
  * recording the label that the user selected for the last document. This
  * method should only be called via an XmlHttpRequest initiated by the edit
  * classifier JavaScript, and consequently it always writes out
  * JSON-encoded data, which is easily decoded by the page JavaScript.
  */
 function classify()
 {
     /*
       Request fields are read defensively. An undefined index or variable
       would raise a PHP notice, and when notices are displayed they end up
       in the response body ahead of the JSON and break client decoding.
     */
     $arg = isset($_REQUEST['arg']) ? $this->clean($_REQUEST['arg'], 'string') : '';
     $label = isset($_REQUEST['label']) ? $this->clean($_REQUEST['label'], 'string') : '';
     // These stay null when the request carries no index; they are still
     // passed on below, exactly as before, but without notices.
     $index = null;
     $source_type = null;
     $keywords = null;
     if (isset($_REQUEST['index'])) {
         $index = $this->clean($_REQUEST['index'], 'int');
         // An index value of 1 is replaced by the current index name.
         if (intval($index) == 1) {
             $index = $this->model("crawl")->getCurrentIndexDatabaseName();
         }
         $source_type = isset($_REQUEST['type']) ? $this->clean($_REQUEST['type'], 'string') : '';
         $keywords = isset($_REQUEST['keywords']) ? $this->clean($_REQUEST['keywords'], 'string') : '';
     }
     $data = array();
     $classifier = Classifier::getClassifier($label);
     /*
       If no classifier could be loaded for the label, skip all classifier
       work and answer with just the fresh authentication token, rather
       than failing on a method call on a non-object.
     */
     if ($classifier) {
         /*
           The call to prepareToLabel is important; it loads all of the data
           required to manage the training set from disk, and also determines
           what will be saved *back* to disk later.
         */
         $classifier->prepareToLabel();
         switch ($arg) {
             case 'getdocs':
                 /*
                   Load documents in from a user-specified index, and find
                   the next best one to label (for 'manual' source type), or
                   label them all with a single label (for either the
                   'positive' or 'negative' source types).
                 */
                 $mix_iterator = $this->buildClassifierCrawlMix($label, $index, $keywords);
                 if ($source_type == 'manual') {
                     $num_docs = $classifier->initBuffer($mix_iterator);
                     $classifier->computeBufferDensities();
                     $data['num_docs'] = $num_docs;
                     list($new_doc, $disagreement) = $classifier->findNextDocumentToLabel();
                     if ($new_doc) {
                         $score = $classifier->classify($new_doc);
                         $data['new_doc'] = $this->prepareUnlabelledDocument($new_doc, $score, $disagreement, $index, $keywords);
                     }
                     Classifier::setClassifier($classifier);
                 } else if ($source_type == 'positive' || $source_type == 'negative') {
                     $doc_label = $source_type == 'positive' ? 1 : -1;
                     $add_count = $classifier->addAllDocuments($mix_iterator, $doc_label);
                     if ($add_count > 0) {
                         /*
                           Pass true to always update accuracy after adding
                           a batch of documents all at once.
                         */
                         $classifier->train(true);
                         Classifier::setClassifier($classifier);
                     }
                     $data['add_count'] = $add_count;
                 }
                 break;
             case 'addlabel':
                 /*
                   First label the last candidate document presented to the
                   user (potentially skipping it instead of actually applying
                   a label), then pick the next best candidate for labeling.
                   When skipping a document instead of adding a label, avoid
                   re-training since the training set hasn't actually
                   changed.
                 */
                 $doc = isset($_REQUEST['doc_to_label']) ? $_REQUEST['doc_to_label'] : null;
                 if (!is_array($doc) || !isset($doc['key']) || !isset($doc['label'])) {
                     // Malformed request: leave the training set untouched
                     // and just report the current statistics.
                     break;
                 }
                 $key = webdecode($this->clean($doc['key'], 'string'));
                 $doc_label = $this->clean($doc['label'], 'int');
                 $mix_iterator = $this->retrieveClassifierCrawlMix($label);
                 $labels_changed = $classifier->labelDocument($key, $doc_label);
                 $num_docs = $classifier->refreshBuffer($mix_iterator);
                 $classifier->computeBufferDensities();
                 $data['num_docs'] = $num_docs;
                 if ($labels_changed) {
                     // Accuracy is only recomputed when the total is a
                     // positive multiple of ten.
                     $update_accuracy = $classifier->total > 0 && $classifier->total % 10 == 0;
                     $classifier->train($update_accuracy);
                 }
                 list($new_doc, $disagreement) = $classifier->findNextDocumentToLabel();
                 if ($new_doc) {
                     $score = $classifier->classify($new_doc);
                     $data['new_doc'] = $this->prepareUnlabelledDocument($new_doc, $score, $disagreement, $index, $keywords);
                 }
                 Classifier::setClassifier($classifier);
                 break;
             case 'updateaccuracy':
                 /*
                   Don't do anything other than re-compute the accuracy for
                   the current training set.
                 */
                 $classifier->updateAccuracy();
                 Classifier::setClassifier($classifier);
                 break;
         }
         /*
           No matter which activity we ended up carrying out, always include
           the statistics that *might* have changed so that the client can
           just naively keep them up to date.
         */
         $data['positive'] = $classifier->positive;
         $data['negative'] = $classifier->negative;
         $data['total'] = $classifier->total;
         $data['accuracy'] = $classifier->accuracy;
     }
     /*
       Pass along a new authentication token so that the client can make a
       new authenticated request after this one.
     */
     $data['authTime'] = strval(time());
     $data['authSession'] = md5($data['authTime'] . AUTH_KEY);
     $response = json_encode($data);
     header("Content-Type: application/json");
     // strlen counts bytes, which is what Content-Length requires.
     header("Content-Length: " . strlen($response));
     echo $response;
 }