Esempio n. 1
0
 /**
  * Active-training counterpart of the TrainAndTest activity. Rather than
  * being handed a fixed training set, the classifier is repeatedly asked
  * which document it wants labeled next; that document is labeled with
  * its known true label (standing in for a human labeler), the classifier
  * is retrained, and performance is tested at the configured intervals.
  *
  * The number of labeling rounds is the smaller of the 'max_train' option
  * (defaulting to 200) and the number of available training pages.
  *
  * @param string $label class label of the new classifier
  * @param string $dataset_name name of the dataset to train and test on
  */
 function runActiveTrainAndTest($label, $dataset_name)
 {
     $this->setDefault('max_train', 200);
     $this->logOptions();
     $classifier = $this->makeFreshClassifier($label);
     $data = $this->loadDataset($dataset_name, $label);
     $train_pages = $data['train'];
     $classifier->prepareToLabel();
     $classifier->initBuffer($train_pages);
     // Never run more rounds than there are training pages to draw from.
     $num_rounds = min($this->options['max_train'], $train_pages->length);
     $round = 1;
     while ($round <= $num_rounds) {
         // Only the selected document is used here; the disagreement
         // score that comes back with it is ignored.
         list($candidate, $candidate_disagreement) =
             $classifier->findNextDocumentToLabel();
         if ($candidate) {
             // Simulate the user by answering with the document's
             // recorded true label, then refresh the buffer and retrain.
             $classifier->labelDocument(
                 Classifier::makeKey($candidate),
                 $candidate['TRUE_LABEL']
             );
             $classifier->refreshBuffer($train_pages);
             $classifier->computeBufferDensities();
             $classifier->train();
         }
         // A test point is evaluated even when no document was labeled
         // in this round.
         if ($this->isTestPoint($round, $num_rounds)) {
             Classifier::setClassifier($classifier);
             $this->testClassifier($classifier, $data);
             // NOTE(review): presumably testing leaves the classifier in
             // a state that is not ready for labeling -- confirm.
             $classifier->prepareToLabel();
         }
         $round++;
     }
 }
Esempio n. 2
0
 /**
  * Builds a new, reduced summary array for a page that has not been
  * labeled yet, adding the extra fields the client-side labeling
  * interface displays (highlighted title and snippet, cache link,
  * classification score and values derived from it).
  *
  * @param array $page original page summary array
  * @param float $score classification score (estimated by the Naive Bayes
  * text classification algorithm) for $page
  * @param float $disagreement disagreement score computed for $page
  * @param int $crawl_time index the page came from
  * @param string $keywords query supplied to the crawl mix used to find
  * $page; alternatives are separated by "|"
  * @return array associative array with the keys 'title', 'url', 'key',
  * 'cache_link', 'description', 'score', 'positive', 'confidence' and
  * 'disagreement'
  */
 function prepareUnlabelledDocument($page, $score, $disagreement, $crawl_time, $keywords)
 {
     $phrase_model = $this->model("phrase");

     // Gather the words to highlight from every "|"-separated alternative
     // of the query.
     $highlight_words = array();
     foreach (explode("|", $keywords) as $alternative) {
         list($parsed_struct, $alternative_words) =
             $phrase_model->parseWordStructConjunctiveQuery($alternative);
         $highlight_words = array_merge($highlight_words,
             $alternative_words);
     }

     $title = $phrase_model->boldKeywords($page[self::TITLE],
         $highlight_words);

     // The description is stripped of tags and cut down to snippets
     // before the keywords in it are bolded.
     $plain_description = strip_tags($page[self::DESCRIPTION]);
     $snippets = $phrase_model->getSnippets($plain_description,
         $highlight_words, 400);
     $description = $phrase_model->boldKeywords($snippets,
         $highlight_words);

     // Link to the cached copy of the page; the separators are already
     // written as HTML entities.
     $cache_link = "?c=search&amp;a=cache";
     $cache_link .= "&amp;q=" . urlencode($keywords);
     $cache_link .= "&amp;arg=" . urlencode($page[self::URL]);
     $cache_link .= "&amp;its=" . $crawl_time;

     $url = $page[self::URL];
     $key = webencode(Classifier::makeKey($page));

     // A score of at least 0.5 counts as a positive classification.
     if ($score >= 0.5) {
         $positive = 1;
     } else {
         $positive = 0;
     }

     /*
       Confidence rescales the score's distance from 0.5 into the range
       0 to 1: it is 0 when the score is exactly 0.5 and approaches 1 as
       the score moves toward either 0 or 1.
     */
     $confidence = abs($score - 0.5) / 0.5;

     return array(
         'title' => $title,
         'url' => $url,
         'key' => $key,
         'cache_link' => $cache_link,
         'description' => $description,
         'score' => $score,
         'positive' => $positive,
         'confidence' => $confidence,
         'disagreement' => $disagreement
     );
 }