Esempio n. 1
0
 /**
  * Entry point that gets the classifier_trainer to train (finalize) the
  * logistic regression instance of one particular classifier. The class
  * label of the classifier to finalize is taken from the second
  * command-line argument ($argv[2]).
  *
  * If that argument is missing, or no classifier can be loaded for the
  * label, an error is logged and the daemon is stopped without training
  * instead of failing on a method call against a non-object.
  */
 function start()
 {
     global $argv;
     CrawlDaemon::init($argv, "classifier_trainer");
     // Treat a missing argument as an empty label rather than reading an
     // undefined offset; the log name is the same string either way.
     $has_label = isset($argv[2]);
     $label = $has_label ? $argv[2] : '';
     crawlLog("Initializing classifier trainer log..", $label . '-classifier_trainer', true);
     // getClassifier() can yield a falsy value when nothing is stored for
     // the label, so check before calling methods on the result.
     $classifier = $has_label ? Classifier::getClassifier($label) : null;
     if (!$classifier) {
         crawlLog("No classifier found for label '" . $label . "'; nothing to train.\n");
         CrawlDaemon::stop('classifier_trainer', $label);
         // Return explicitly in case stop() hands control back to us.
         return;
     }
     $classifier->prepareToFinalize();
     $classifier->finalize();
     // Store the finalized classifier so the training result is kept.
     Classifier::setClassifier($classifier);
     crawlLog("Training complete.\n");
     CrawlDaemon::stop('classifier_trainer', $label);
 }
Esempio n. 2
0
 /**
  * Builds a brand-new classifier for the given label. Any classifier that
  * is already stored under that label is deleted first, and the new
  * instance is stored before being handed back.
  *
  * @param string $label class label the new classifier is created for
  * @return object the newly created classifier instance
  */
 function makeFreshClassifier($label)
 {
     // Only delete when something is actually stored under this label.
     $existing = Classifier::getClassifier($label);
     if ($existing) {
         $this->deleteClassifier($label);
     }
     $class_options = $this->options['cls'];
     $fresh = new Classifier($label, $class_options);
     Classifier::setClassifier($fresh);
     return $fresh;
 }
Esempio n. 3
0
 /**
  * Finds the next document for which to request a label, sometimes first
  * recording the label that the user selected for the last document. This
  * method should only be called via an XmlHttpRequest initiated by the edit
  * classifier JavaScript, and consequently it always writes out
  * JSON-encoded data, which is easily decoded by the page JavaScript.
  *
  * The request field 'arg' selects the activity:
  *  - 'getdocs': load documents from an index and either pick the next
  *    document to label ('manual' type) or label them all at once
  *    ('positive' / 'negative' types);
  *  - 'addlabel': record (or skip) a label for the last candidate, then
  *    pick the next candidate;
  *  - 'updateaccuracy': only re-compute the accuracy.
  *
  * The JSON response always carries 'positive', 'negative', 'total',
  * 'accuracy', 'authTime' and 'authSession'; depending on the activity it
  * may also carry 'num_docs', 'new_doc' or 'add_count'.
  */
 function classify()
 {
     $arg = $this->clean($_REQUEST['arg'], 'string');
     $label = $this->clean($_REQUEST['label'], 'string');
     /*
       These three values are only read from the request when 'index' is
       present. Default them to null so that the branches below never read
       an undefined variable when 'index' was not sent.
     */
     $index = null;
     $source_type = null;
     $keywords = null;
     if (isset($_REQUEST['index'])) {
         $index = $this->clean($_REQUEST['index'], 'int');
         // An index value of 1 stands for the current index database.
         if (intval($index) == 1) {
             $index = $this->model("crawl")->getCurrentIndexDatabaseName();
         }
         $source_type = $this->clean($_REQUEST['type'], 'string');
         $keywords = $this->clean($_REQUEST['keywords'], 'string');
     }
     /*
       The call to prepareToLabel is important; it loads all of the data
       required to manage the training set from disk, and also determines
       what will be saved *back* to disk later.

       NOTE(review): this assumes a classifier exists for $label; if
       getClassifier returned a falsy value the next call would fail.
       Confirm that callers only send labels of existing classifiers.
     */
     $classifier = Classifier::getClassifier($label);
     $classifier->prepareToLabel();
     $data = array();
     switch ($arg) {
         case 'getdocs':
             /*
               Load documents in from a user-specified index, and find the
               next best one to label (for 'manual' source type), or label
               them all with a single label (for either the 'positive' or
               'negative' source types).
             */
             $mix_iterator = $this->buildClassifierCrawlMix($label, $index, $keywords);
             if ($source_type == 'manual') {
                 $num_docs = $classifier->initBuffer($mix_iterator);
                 $classifier->computeBufferDensities();
                 $data['num_docs'] = $num_docs;
                 list($new_doc, $disagreement) = $classifier->findNextDocumentToLabel();
                 if ($new_doc) {
                     $score = $classifier->classify($new_doc);
                     $data['new_doc'] = $this->prepareUnlabelledDocument($new_doc, $score, $disagreement, $index, $keywords);
                 }
                 Classifier::setClassifier($classifier);
             } elseif ($source_type == 'positive' || $source_type == 'negative') {
                 $doc_label = $source_type == 'positive' ? 1 : -1;
                 $add_count = $classifier->addAllDocuments($mix_iterator, $doc_label);
                 if ($add_count > 0) {
                     /*
                       Pass true to always update accuracy after adding a
                       batch of documents all at once.
                     */
                     $classifier->train(true);
                     Classifier::setClassifier($classifier);
                 }
                 $data['add_count'] = $add_count;
             }
             break;
         case 'addlabel':
             /*
               First label the last candidate document presented to the
               user (potentially skipping it instead of actually applying a
               label), then pick the next best candidate for labeling.
               When skipping a document instead of adding a label, avoid
               re-training since the training set hasn't actually changed.
             */
             $doc = $_REQUEST['doc_to_label'];
             $key = webdecode($this->clean($doc['key'], 'string'));
             $doc_label = $this->clean($doc['label'], 'int');
             $mix_iterator = $this->retrieveClassifierCrawlMix($label);
             $labels_changed = $classifier->labelDocument($key, $doc_label);
             $num_docs = $classifier->refreshBuffer($mix_iterator);
             $classifier->computeBufferDensities();
             $data['num_docs'] = $num_docs;
             if ($labels_changed) {
                 // Only refresh the accuracy on every tenth labelled document.
                 $update_accuracy = $classifier->total > 0 && $classifier->total % 10 == 0;
                 $classifier->train($update_accuracy);
             }
             list($new_doc, $disagreement) = $classifier->findNextDocumentToLabel();
             if ($new_doc) {
                 $score = $classifier->classify($new_doc);
                 $data['new_doc'] = $this->prepareUnlabelledDocument($new_doc, $score, $disagreement, $index, $keywords);
             }
             Classifier::setClassifier($classifier);
             break;
         case 'updateaccuracy':
             /*
               Don't do anything other than re-compute the accuracy for the
               current training set.
             */
             $classifier->updateAccuracy();
             Classifier::setClassifier($classifier);
             break;
     }
     /*
       No matter which activity we ended up carrying out, always include
       the statistics that *might* have changed so that the client can just
       naively keep them up to date.
     */
     $data['positive'] = $classifier->positive;
     $data['negative'] = $classifier->negative;
     $data['total'] = $classifier->total;
     $data['accuracy'] = $classifier->accuracy;
     /*
       Pass along a new authentication token so that the client can make a
       new authenticated request after this one.
     */
     $data['authTime'] = strval(time());
     $data['authSession'] = md5($data['authTime'] . AUTH_KEY);
     $response = json_encode($data);
     header("Content-Type: application/json");
     // strlen counts bytes, which is what Content-Length requires.
     header("Content-Length: " . strlen($response));
     echo $response;
 }