/**
 * This is the function that should be called to get the
 * classifier_trainer to start training a logistic regression instance for
 * a particular classifier. The class label corresponding to the
 * classifier to be finalized should be passed as the second command-line
 * argument ($argv[2]).
 *
 * The classifier for the label is fetched with Classifier::getClassifier,
 * prepared and finalized, handed back to Classifier::setClassifier, and
 * then CrawlDaemon::stop is called for this daemon and label.
 *
 * If no label was supplied, or no classifier can be found for it, a
 * message is logged and CrawlDaemon::stop is called without attempting
 * any training.
 */
function start()
{
    global $argv;

    CrawlDaemon::init($argv, "classifier_trainer");
    // A missing argument is treated as an empty label rather than being
    // read from an undefined offset.
    $label = isset($argv[2]) ? $argv[2] : '';
    crawlLog("Initializing classifier trainer log..",
        $label . '-classifier_trainer', true);
    // getClassifier may yield a falsy value when nothing is stored for
    // the label, so the result has to be checked before it is used.
    $classifier = ($label !== '') ? Classifier::getClassifier($label)
        : false;
    if (!$classifier) {
        crawlLog("No classifier found for label '$label'; " .
            "nothing to train.\n");
        CrawlDaemon::stop('classifier_trainer', $label);
        // Explicit return in case stop() hands control back to us.
        return;
    }
    $classifier->prepareToFinalize();
    $classifier->finalize();
    Classifier::setClassifier($classifier);
    crawlLog("Training complete.\n");
    CrawlDaemon::stop('classifier_trainer', $label);
}
/**
 * Builds a brand-new classifier for a label. If a classifier is already
 * stored under that label it is deleted first, so the returned instance
 * never carries over anything from its predecessor.
 *
 * The new instance is constructed from the label and the 'cls' entry of
 * this object's options, and is passed to Classifier::setClassifier
 * before being returned.
 *
 * @param string $label class label of the new classifier
 * @return object the newly created classifier instance
 */
function makeFreshClassifier($label)
{
    $existing = Classifier::getClassifier($label);
    if ($existing) {
        // Clear out the old classifier before its replacement is made.
        $this->deleteClassifier($label);
    }
    $fresh = new Classifier($label, $this->options['cls']);
    Classifier::setClassifier($fresh);
    return $fresh;
}
/**
 * Finds the next document for which to request a label, sometimes first
 * recording the label that the user selected for the last document. This
 * method should only be called via an XmlHttpRequest initiated by the edit
 * classifier JavaScript, and consequently it always writes out
 * JSON-encoded data (with Content-Type and Content-Length headers), which
 * is easily decoded by the page JavaScript. Nothing is returned.
 *
 * Fields read from $_REQUEST:
 *  - arg: the activity to carry out: 'getdocs', 'addlabel' or
 *      'updateaccuracy'; any other value just reports the statistics
 *  - label: class label of the classifier to operate on
 *  - index: (optional) index to draw documents from; a value of 1 is
 *      replaced by the current index database name of the crawl model
 *  - type, keywords: only read when index is present; type selects
 *      between the 'manual', 'positive' and 'negative' sources
 *  - doc_to_label: ('addlabel' only) array with 'key' and 'label' entries
 *
 * The response always carries authTime and authSession. When a classifier
 * exists for the label it also carries positive, negative, total and
 * accuracy, plus whatever the chosen activity adds (num_docs, new_doc,
 * add_count). When no classifier exists it carries an error field instead.
 */
function classify()
{
    /* Request fields that are absent are passed on as null. That is the
       value they evaluated to before, but reading them this way raises
       no PHP notice, which could otherwise be printed ahead of the JSON
       body and the headers sent below. */
    $arg = $this->clean(
        isset($_REQUEST['arg']) ? $_REQUEST['arg'] : null, 'string');
    $label = $this->clean(
        isset($_REQUEST['label']) ? $_REQUEST['label'] : null, 'string');
    /* These three are used by the 'getdocs' and 'addlabel' activities
       even when no index was sent, so they must always be defined. */
    $index = null;
    $source_type = null;
    $keywords = null;
    if (isset($_REQUEST['index'])) {
        $index = $this->clean($_REQUEST['index'], 'int');
        if (intval($index) === 1) {
            $index = $this->model("crawl")->getCurrentIndexDatabaseName();
        }
        $source_type = $this->clean(
            isset($_REQUEST['type']) ? $_REQUEST['type'] : null,
            'string');
        $keywords = $this->clean(
            isset($_REQUEST['keywords']) ? $_REQUEST['keywords'] : null,
            'string');
    }
    $data = array();
    $classifier = Classifier::getClassifier($label);
    if ($classifier) {
        /* The call to prepareToLabel is important; it loads all of the
           data required to manage the training set from disk, and also
           determines what will be saved *back* to disk later. */
        $classifier->prepareToLabel();
        switch ($arg) {
            case 'getdocs':
                /* Load documents in from a user-specified index, and
                   find the next best one to label (for 'manual' source
                   type), or label them all with a single label (for
                   either the 'positive' or 'negative' source types). */
                $mix_iterator = $this->buildClassifierCrawlMix(
                    $label, $index, $keywords);
                if ($source_type === 'manual') {
                    $num_docs = $classifier->initBuffer($mix_iterator);
                    $classifier->computeBufferDensities();
                    $data['num_docs'] = $num_docs;
                    list($new_doc, $disagreement) =
                        $classifier->findNextDocumentToLabel();
                    if ($new_doc) {
                        $score = $classifier->classify($new_doc);
                        $data['new_doc'] =
                            $this->prepareUnlabelledDocument($new_doc,
                                $score, $disagreement, $index, $keywords);
                    }
                    Classifier::setClassifier($classifier);
                } elseif ($source_type === 'positive' ||
                    $source_type === 'negative') {
                    $doc_label = ($source_type === 'positive') ? 1 : -1;
                    $add_count = $classifier->addAllDocuments(
                        $mix_iterator, $doc_label);
                    if ($add_count > 0) {
                        /* Pass true to always update accuracy after
                           adding a batch of documents all at once. */
                        $classifier->train(true);
                        Classifier::setClassifier($classifier);
                    }
                    $data['add_count'] = $add_count;
                }
                break;
            case 'addlabel':
                /* First label the last candidate document presented to
                   the user (potentially skipping it instead of actually
                   applying a label), then pick the next best candidate
                   for labeling. When skipping a document instead of
                   adding a label, avoid re-training since the training
                   set hasn't actually changed. */
                $doc = array();
                if (isset($_REQUEST['doc_to_label']) &&
                    is_array($_REQUEST['doc_to_label'])) {
                    // Only index into the field when it really is an
                    // array; it comes straight from the client.
                    $doc = $_REQUEST['doc_to_label'];
                }
                $key = webdecode($this->clean(
                    isset($doc['key']) ? $doc['key'] : null, 'string'));
                $doc_label = $this->clean(
                    isset($doc['label']) ? $doc['label'] : null, 'int');
                $mix_iterator = $this->retrieveClassifierCrawlMix($label);
                $labels_changed = $classifier->labelDocument($key,
                    $doc_label);
                $num_docs = $classifier->refreshBuffer($mix_iterator);
                $classifier->computeBufferDensities();
                $data['num_docs'] = $num_docs;
                if ($labels_changed) {
                    // Accuracy is only refreshed on every tenth example.
                    $update_accuracy = $classifier->total > 0 &&
                        $classifier->total % 10 === 0;
                    $classifier->train($update_accuracy);
                }
                list($new_doc, $disagreement) =
                    $classifier->findNextDocumentToLabel();
                if ($new_doc) {
                    $score = $classifier->classify($new_doc);
                    $data['new_doc'] = $this->prepareUnlabelledDocument(
                        $new_doc, $score, $disagreement, $index,
                        $keywords);
                }
                Classifier::setClassifier($classifier);
                break;
            case 'updateaccuracy':
                /* Don't do anything other than re-compute the accuracy
                   for the current training set. */
                $classifier->updateAccuracy();
                Classifier::setClassifier($classifier);
                break;
        }
        /* No matter which activity we ended up carrying out, always
           include the statistics that *might* have changed so that the
           client can just naively keep them up to date. */
        $data['positive'] = $classifier->positive;
        $data['negative'] = $classifier->negative;
        $data['total'] = $classifier->total;
        $data['accuracy'] = $classifier->accuracy;
    } else {
        /* Nothing is stored under this label, so there is nothing to
           label or train. Report that instead of failing on a call to
           a non-object.
           NOTE(review): the 'error' field is new; confirm how the page
           JavaScript should surface it. */
        $data['error'] = 'No classifier exists for the requested label';
    }
    /* Pass along a new authentication token so that the client can make
       a new authenticated request after this one. */
    $data['authTime'] = strval(time());
    $data['authSession'] = md5($data['authTime'] . AUTH_KEY);
    $response = json_encode($data);
    header("Content-Type: application/json");
    // strlen counts bytes, which is what Content-Length requires.
    header("Content-Length: " . strlen($response));
    echo $response;
}