/**
 * Active-training counterpart of the TrainAndTest activity. Rather than
 * consuming the training set in a fixed order, each round asks the
 * classifier which document it would like labeled next, supplies that
 * document's true label (mimicking what a human labeler would do), and
 * retrains. Performance is tested at the intervals chosen by
 * isTestPoint().
 *
 * @param string $label class label of the new classifier
 * @param string $dataset_name name of the dataset to train and test on
 */
function runActiveTrainAndTest($label, $dataset_name)
{
    $this->setDefault('max_train', 200);
    $this->logOptions();

    $classifier = $this->makeFreshClassifier($label);
    $dataset = $this->loadDataset($dataset_name, $label);
    $train_pages = $dataset['train'];

    $classifier->prepareToLabel();
    $classifier->initBuffer($train_pages);

    // Never run more rounds than there are training pages available.
    $num_rounds = min($this->options['max_train'], $train_pages->length);

    $round = 1;
    while ($round <= $num_rounds) {
        // The disagreement score is returned alongside the document but is
        // not needed by this activity.
        list($candidate, $candidate_disagreement) =
            $classifier->findNextDocumentToLabel();

        if ($candidate) {
            // Label the chosen document with its known true label, then
            // refresh the candidate buffer and retrain.
            $classifier->labelDocument(
                Classifier::makeKey($candidate),
                $candidate['TRUE_LABEL']
            );
            $classifier->refreshBuffer($train_pages);
            $classifier->computeBufferDensities();
            $classifier->train();
        }

        // A test point is evaluated even when no document was labeled
        // during this round.
        if ($this->isTestPoint($round, $num_rounds)) {
            Classifier::setClassifier($classifier);
            $this->testClassifier($classifier, $dataset);
            // Put the classifier back into labeling mode after testing.
            $classifier->prepareToLabel();
        }

        $round++;
    }
}
/**
 * Builds a new, reduced summary array from an existing page summary and
 * adds the extra fields that the client-side labeling interface uses.
 *
 * @param array $page original page summary array
 * @param float $score classification score (estimated by the Naive Bayes
 *     text classification algorithm) for $page
 * @param float $disagreement disagreement score computed for $page
 * @param int $crawl_time index the page came from
 * @param string $keywords query supplied to the crawl mix used to find
 *     $page
 * @return array reduced page summary structure containing only the
 *     information that the client needs to display a summary of the page
 */
function prepareUnlabelledDocument($page, $score, $disagreement,
    $crawl_time, $keywords)
{
    $phrase_model = $this->model("phrase");

    // Gather the words to highlight from every "|"-separated alternative
    // of the query.
    $highlight_words = array();
    foreach (explode("|", $keywords) as $alternative) {
        list($word_struct, $format_words) =
            $phrase_model->parseWordStructConjunctiveQuery($alternative);
        $highlight_words = array_merge($highlight_words, $format_words);
    }

    $title = $phrase_model->boldKeywords($page[self::TITLE],
        $highlight_words);

    // The description has its tags stripped and is cut down to snippets
    // before the keywords are bolded.
    $plain_description = strip_tags($page[self::DESCRIPTION]);
    $snippets = $phrase_model->getSnippets($plain_description,
        $highlight_words, 400);
    $description = $phrase_model->boldKeywords($snippets, $highlight_words);

    // Link to the cached copy of the page for this query and crawl.
    $cache_link = "?c=search&a=cache";
    $cache_link .= "&q=" . urlencode($keywords);
    $cache_link .= "&arg=" . urlencode($page[self::URL]);
    $cache_link .= "&its=" . $crawl_time;

    if ($score >= 0.5) {
        $positive = 1;
    } else {
        $positive = 0;
    }

    /* Confidence rescales the score's distance from 0.5: it is 0 when the
       score is exactly 0.5 and rises toward 1 as the score approaches
       either 0 or 1 (for scores in the range 0 to 1).
     */
    $confidence = abs($score - 0.5) / 0.5;

    $summary = array();
    $summary['title'] = $title;
    $summary['url'] = $page[self::URL];
    $summary['key'] = webencode(Classifier::makeKey($page));
    $summary['cache_link'] = $cache_link;
    $summary['description'] = $description;
    $summary['score'] = $score;
    $summary['positive'] = $positive;
    $summary['confidence'] = $confidence;
    $summary['disagreement'] = $disagreement;
    return $summary;
}