/**
 * After robot, schedule, and index data have been uploaded and reassembled
 * as one big data string, this function splits that string into
 * each of these data types and then saves the result into the appropriate
 * schedule sub-folder. Any temporary files used during uploading are then
 * deleted.
 *
 * @param string $filename name of the temp file used to upload the big
 *     string. If the uploaded data was small enough to be uploaded in one
 *     go, then this should be "" -- the variable $_REQUEST["part"] will be
 *     used instead
 * @return string $logging diagnostic info to be sent to the fetcher about
 *     what was done
 */
function handleUploadedData($filename = "")
{
    if ($filename == "") {
        $uploaded = $_REQUEST['part'];
    } else {
        $uploaded = file_get_contents($filename);
        unlink($filename);
    }
    $logging = "... Data upload complete\n";
    $address = str_replace(".", "-", $_SERVER['REMOTE_ADDR']);
    $address = str_replace(":", "_", $address);
    $time = time();
    $day = floor($time / ONE_DAY);
    $byte_counts = array();
    if (isset($_REQUEST['byte_counts'])) {
        $byte_counts = unserialize(webdecode($_REQUEST['byte_counts']));
    }
    $robot_data = "";
    $cache_page_validation_data = "";
    $schedule_data = "";
    $index_data = "";
    if (isset($byte_counts["TOTAL"]) && $byte_counts["TOTAL"] > 0) {
        $pos = 0;
        $robot_data = substr($uploaded, $pos, $byte_counts["ROBOT"]);
        $pos += $byte_counts["ROBOT"];
        $cache_page_validation_data = substr($uploaded, $pos,
            $byte_counts["CACHE_PAGE_VALIDATION"]);
        $pos += $byte_counts["CACHE_PAGE_VALIDATION"];
        $schedule_data = substr($uploaded, $pos, $byte_counts["SCHEDULE"]);
        $pos += $byte_counts["SCHEDULE"];
        $index_data = substr($uploaded, $pos);
    }
    if (strlen($robot_data) > 0) {
        $this->addScheduleToScheduleDirectory(self::robot_data_base_name,
            $robot_data);
    }
    if (USE_ETAG_EXPIRES && strlen($cache_page_validation_data) > 0) {
        $this->addScheduleToScheduleDirectory(
            self::etag_expires_data_base_name,
            $cache_page_validation_data);
    }
    if (strlen($schedule_data) > 0) {
        $this->addScheduleToScheduleDirectory(self::schedule_data_base_name,
            $schedule_data);
    }
    if (strlen($index_data) > 0) {
        $this->addScheduleToScheduleDirectory(self::index_data_base_name,
            $index_data);
    }
    return $logging;
}
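/*
 * Illustrative sketch (not part of the original source): how a fetcher-side
 * upload matching the splitting logic above could be packed, assuming
 * webencode() is the inverse of the webdecode() used above and that TOTAL
 * records the length of the assembled blob. Variable names are hypothetical.
 *
 *     $blob = $robot_data . $cache_page_validation_data . $schedule_data .
 *         $index_data;
 *     $byte_counts = array(
 *         "ROBOT" => strlen($robot_data),
 *         "CACHE_PAGE_VALIDATION" => strlen($cache_page_validation_data),
 *         "SCHEDULE" => strlen($schedule_data),
 *         "TOTAL" => strlen($blob)
 *     );
 *     $post_data = array("part" => $blob,
 *         "byte_counts" => webencode(serialize($byte_counts)));
 */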
/**
 * Processes a cache page validation data file. Extracts key-value pairs
 * from the file and inserts them into the B-Tree used for storing cache
 * page validation data.
 *
 * @param string $file the cache page validation data file written by
 *     Fetchers
 */
function processEtagExpiresArchive($file)
{
    crawlLog("Scheduler Processing etag expires http header data in {$file}");
    $start_time = microtime();
    $etag_expires_data =
        unserialize(gzuncompress(webdecode(file_get_contents($file))));
    crawlLog("Scheduler Done uncompressing etag data." .
        " Starting to add to btree");
    $num_entries = count($etag_expires_data);
    $i = 0;
    foreach ($etag_expires_data as $data) {
        crawlTimeoutLog("..Scheduler still etag processing on item %s of %s.",
            $i, $num_entries);
        $i++;
        $link = $data[0];
        $value = $data[1];
        $key = crawlHash($link, true);
        $entry = array($key, $value);
        $this->web_queue->etag_btree->insert($entry);
    }
    crawlLog(" time: " . changeInMicrotime($start_time) . "\n");
    crawlLog("Scheduler Done processing etag expires http" .
        " header data file: {$file}");
    unlink($file);
}
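/*
 * File-format sketch (illustrative, not part of the original source): a
 * fetcher-written etag/expires file in the shape expected by the decode
 * chain above, assuming webencode() is the inverse of webdecode(). The URL
 * and header value shown are hypothetical.
 *
 *     $etag_expires_data = array(
 *         array("http://www.example.com/", array("ETAG" => '"abc123"'))
 *     );
 *     file_put_contents($file,
 *         webencode(gzcompress(serialize($etag_expires_data))));
 */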
/**
 * In a multiple queue server setting, gets summaries for a set of documents
 * by their urls, or by groups of 5-tuples of the form
 * (machine, key, index, generation, offset). This makes an execMachines
 * call to send a network request to the CrawlControllers on each machine,
 * which in turn call getCrawlItems (and thence nonNetworkGetCrawlItems)
 * on that machine. The results are then sent back to networkGetCrawlItems
 * and aggregated.
 *
 * @param array $lookups things whose summaries we are trying to look up
 * @param array $machine_urls an array of urls of yioop queue servers
 * @return array of summary data for the matching documents
 */
function networkGetCrawlItems($lookups, $machine_urls)
{
    //Set-up network request
    $machines = array();
    $indexes = array();
    $num_machines = count($machine_urls);
    foreach ($lookups as $lookup => $lookup_info) {
        if (count($lookup_info) == 2 && ($lookup_info[0][0] === 'h' ||
            $lookup_info[0][0] === 'r' || $lookup_info[0][0] === 'g')) {
            $machines = $machine_urls;
            break;
        } else {
            foreach ($lookup_info as $lookup_item) {
                $out_lookup_info = array();
                if (count($lookup_item) == 5) {
                    list($index, , , , ) = $lookup_item;
                    $machines[$index] = $machine_urls[$index];
                } else {
                    $machines = $machine_urls;
                    break;
                }
            }
        }
    }
    //Make request
    $page_set = $this->execMachines("getCrawlItems", $machines,
        serialize($lookups), $num_machines);
    //Aggregate results
    $summaries = array();
    $elapsed_times = array();
    if (is_array($page_set)) {
        foreach ($page_set as $elt) {
            $description_hash = array();
            $result = @unserialize(webdecode($elt[self::PAGE]));
            if (!is_array($result)) {
                $elapsed_times[] = 0;
                continue;
            }
            $elapsed_times[] = $result["ELAPSED_TIME"];
            unset($result["ELAPSED_TIME"]);
            $ellipsis = "";
            foreach ($result as $lookup => $summary) {
                if (isset($summaries[$lookup])) {
                    if (isset($summary[self::DESCRIPTION])) {
                        $description = trim($summary[self::DESCRIPTION]);
                        if (!isset($summaries[$lookup][self::DESCRIPTION])) {
                            $summaries[$lookup][self::DESCRIPTION] = "";
                        }
                        if (!isset($description_hash[$description])) {
                            $summaries[$lookup][self::DESCRIPTION] .=
                                $ellipsis . $description;
                            $ellipsis = " .. ";
                            $description_hash[$description] = true;
                        }
                    }
                    foreach ($summary as $attr => $value) {
                        if ($attr != self::DESCRIPTION &&
                            !isset($summaries[$lookup][$attr])) {
                            $summaries[$lookup][$attr] = $value;
                        }
                    }
                } else {
                    $summaries[$lookup] = $summary;
                }
            }
        }
        $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
        if ($summary_times_string) {
            $all_elapsed_times = unserialize($summary_times_string);
        } else {
            $all_elapsed_times = array();
        }
        $all_elapsed_times[] = $elapsed_times;
        AnalyticsManager::set("SUMMARY_TIMES",
            serialize($all_elapsed_times));
    }
    return $summaries;
}
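/*
 * Response-format sketch (illustrative, not part of the original source):
 * each element of $page_set returned by execMachines() is expected to carry,
 * in $elt[self::PAGE], a webencode'd serialized array mapping lookup keys to
 * summaries plus an ELAPSED_TIME entry, assuming webencode() is the inverse
 * of webdecode(). The key and field values below are hypothetical.
 *
 *     $result = array(
 *         "ELAPSED_TIME" => 0.0213,
 *         "some_lookup_key" => array(self::TITLE => "A title",
 *             self::DESCRIPTION => "A description")
 *     );
 *     $elt[self::PAGE] = webencode(serialize($result));
 */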
/**
 * During an archive crawl this method is used to get from the name server
 * a collection of pages to process. The fetcher will later process these
 * and send summaries to various queue_servers.
 *
 * @return array|bool array containing archive page data, or false if the
 *     request to the name server failed
 */
function checkArchiveScheduler()
{
    $start_time = microtime();
    /* It's still important to switch queue servers, so that we send new
       data to each server each time we fetch new data from the name
       server.
     */
    $this->selectCurrentServerAndUpdateIfNeeded(false);
    $chunk = false;
    if (generalIsA($this->arc_type . "Iterator",
        "TextArchiveBundleIterator")) {
        $archive_iterator = $this->archive_iterator;
        $chunk = true;
        $info = array();
        $max_offset = TextArchiveBundleIterator::BUFFER_SIZE +
            TextArchiveBundleIterator::MAX_RECORD_SIZE;
        if ($archive_iterator->buffer_fh &&
            $archive_iterator->current_offset < $max_offset) {
            crawlLog("Local Iterator Offset: " .
                $archive_iterator->current_offset);
            crawlLog("Local Max Offset: " . $max_offset);
            $info[self::ARC_DATA] =
                $archive_iterator->nextPages(ARCHIVE_BATCH_SIZE);
            crawlLog("Time to get archive data from local buffer " .
                changeInMicrotime($start_time));
        }
        if ($archive_iterator->buffer_fh &&
            $archive_iterator->current_offset < $max_offset) {
            return $info;
        }
        if (isset($info[self::ARC_DATA]) &&
            count($info[self::ARC_DATA]) > 0) {
            $arc_data = $info[self::ARC_DATA];
        }
        crawlLog("Done processing Local Buffer, requesting more data...");
    }
    crawlLog("Fetching Archive data from name server with request:");
    $name_server = $this->name_server;
    $time = time();
    $session = md5($time . AUTH_KEY);
    $prefix = $this->fetcher_num . "-";
    $request = $name_server . "?c=fetch&a=archiveSchedule&time={$time}" .
        "&session={$session}&robot_instance=" . $prefix . ROBOT_INSTANCE .
        "&machine_uri=" . WEB_URI . "&crawl_time=" . $this->crawl_time .
        "&check_crawl_time=" . $this->check_crawl_time;
    crawlLog($request);
    $response_string = FetchUrl::getPage($request, NULL, true);
    if ($response_string === false) {
        crawlLog("The following request failed:");
        crawlLog($request);
        return false;
    }
    if ($response_string) {
        $info = @unserialize($response_string);
    } else {
        $info = array();
        $info[self::STATUS] = self::NO_DATA_STATE;
    }
    $this->setCrawlParamsFromArray($info);
    if (isset($info[self::DATA])) {
        /* Unpack the archive data and return it in the $info array; also
           write a copy to disk in case something goes wrong.
         */
        $pages = unserialize(gzuncompress(webdecode($info[self::DATA])));
        if ($chunk) {
            if (isset($pages[self::ARC_DATA])) {
                if (isset($pages[self::INI])) {
                    $archive_iterator->setIniInfo($pages[self::INI]);
                }
                if ($pages[self::ARC_DATA]) {
                    $archive_iterator->makeBuffer($pages[self::ARC_DATA]);
                }
                if (isset($pages[self::HEADER]) &&
                    is_array($pages[self::HEADER]) &&
                    $pages[self::HEADER] != array()) {
                    $archive_iterator->header = $pages[self::HEADER];
                }
                if (!$pages[self::START_PARTITION]) {
                    $archive_iterator->nextPages(1);
                }
                if (isset($pages[self::PARTITION_NUM])) {
                    crawlLog(" Done getting data" .
                        " from file {$pages[self::PARTITION_NUM]}");
                }
                if (isset($pages[self::NUM_PARTITIONS])) {
                    crawlLog(" of {$pages[self::NUM_PARTITIONS]} files.");
                }
            }
            if (isset($arc_data)) {
                $info[self::ARC_DATA] = $arc_data;
            }
        } else {
            $info[self::ARC_DATA] = $pages;
        }
    } else if (isset($info['ARCHIVE_BUNDLE_ERROR'])) {
        crawlLog(" " . $info['ARCHIVE_BUNDLE_ERROR']);
    }
    crawlLog("Time to fetch archive data from name server " .
        changeInMicrotime($start_time));
    return $info;
}
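/*
 * Request sketch (illustrative, not part of the original source): the URL
 * assembled above has roughly the shape below, where the session token is
 * the md5 of the request time concatenated with AUTH_KEY. Host and numeric
 * values are hypothetical.
 *
 *     http://name.server.example/?c=fetch&a=archiveSchedule
 *         &time=1397324612&session=<md5 of time . AUTH_KEY>
 *         &robot_instance=2-<ROBOT_INSTANCE>&machine_uri=<WEB_URI>
 *         &crawl_time=1397320000&check_crawl_time=1397321000
 */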
/**
 * Computes for each word in an array of words a count of the total number
 * of times it occurs in this crawl model's default index.
 *
 * @param array $words words to find the counts for
 * @param array $machine_urls machines to invoke this command on
 * @return array associative array of word => counts
 */
function countWords($words, $machine_urls = NULL)
{
    if ($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
        $count_strings = $this->execMachines("countWords", $machine_urls,
            serialize(array($words, $this->index_name)));
        $word_counts = array();
        foreach ($count_strings as $count_string) {
            $a_word_counts = unserialize(webdecode(
                $count_string[self::PAGE]));
            if (is_array($a_word_counts)) {
                foreach ($a_word_counts as $word => $count) {
                    $word_counts[$word] = isset($word_counts[$word]) ?
                        $word_counts[$word] + $count : $count;
                }
            }
        }
        return $word_counts;
    }
    $index_archive = IndexManager::getIndex($this->index_name);
    $hashes = array();
    $lookup = array();
    foreach ($words as $word) {
        $tmp = crawlHash($word);
        $hashes[] = $tmp;
        $lookup[$tmp] = $word;
    }
    $word_key_counts = $index_archive->countWordKeys($hashes);
    $phrases = array();
    $word_counts = array();
    if (is_array($word_key_counts) && count($word_key_counts) > 0) {
        foreach ($word_key_counts as $word_key => $count) {
            $word_counts[$lookup[$word_key]] = $count;
        }
    }
    return $word_counts;
}
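/*
 * Hypothetical usage sketch (illustrative, not part of the original
 * source): counting how often two terms occur in this crawl model's
 * default index. The counts shown are made up.
 *
 *     $counts = $crawl_model->countWords(array("apple", "banana"));
 *     // => array("apple" => 1234, "banana" => 567)
 */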
/**
 * Gets a list of rss news feed info either from the local database or
 * from the name server (in the multiple news updater setting)
 *
 * @param bool $use_cache whether to use a previously cached news list
 * @return array $feeds info about all the news sources this machine
 *     is responsible for
 */
function getNewsSources($use_cache = true)
{
    static $feeds = array();
    if ($use_cache && $feeds != array()) {
        return $feeds;
    }
    if (MULTIPLE_NEWS_UPDATER) {
        $current_machine = file_get_contents(WORK_DIRECTORY .
            "/schedules/current_machine_info.txt");
        $pre_feeds = $this->execMachines("getNewsSources",
            array(NAME_SERVER), $current_machine);
        $feeds = array();
        if (isset($pre_feeds[0][self::PAGE])) {
            $feeds = unserialize(webdecode($pre_feeds[0][self::PAGE]));
        }
    } else {
        $db = $this->db;
        $sql = "SELECT * FROM MEDIA_SOURCE WHERE (TYPE='rss'
            OR TYPE='html')";
        $result = $db->execute($sql);
        $i = 0;
        while ($feeds[$i] = $this->db->fetchArray($result)) {
            if ($feeds[$i]['TYPE'] == 'html') {
                list($feeds[$i]['CHANNEL_PATH'], $feeds[$i]['ITEM_PATH'],
                    $feeds[$i]['TITLE_PATH'],
                    $feeds[$i]['DESCRIPTION_PATH'],
                    $feeds[$i]['LINK_PATH']) = explode("###",
                    html_entity_decode($feeds[$i]['AUX_INFO']));
            }
            $i++;
        }
        unset($feeds[$i]); //last one will be null
    }
    return $feeds;
}
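/*
 * AUX_INFO format sketch (illustrative, not part of the original source):
 * for an 'html' media source, the AUX_INFO column is expected to pack the
 * five extraction paths as one "###"-separated string, for example
 *
 *     //channel/path###//item/path###//title/path###//desc/path###//link/path
 *
 * which the explode() call above splits into CHANNEL_PATH, ITEM_PATH,
 * TITLE_PATH, DESCRIPTION_PATH, and LINK_PATH. The example paths are
 * hypothetical.
 */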
/**
 * Finds the next document for which to request a label, sometimes first
 * recording the label that the user selected for the last document. This
 * method should only be called via an XmlHttpRequest initiated by the edit
 * classifier JavaScript, and consequently it always writes out
 * JSON-encoded data, which is easily decoded by the page JavaScript.
 */
function classify()
{
    $arg = $this->clean($_REQUEST['arg'], 'string');
    $label = $this->clean($_REQUEST['label'], 'string');
    if (isset($_REQUEST['index'])) {
        $index = $this->clean($_REQUEST['index'], 'int');
        if (intval($index) == 1) {
            $index = $this->model("crawl")->getCurrentIndexDatabaseName();
        }
        $source_type = $this->clean($_REQUEST['type'], 'string');
        $keywords = $this->clean($_REQUEST['keywords'], 'string');
    }
    /* The call to prepareToLabel is important; it loads all of the data
       required to manage the training set from disk, and also determines
       what will be saved *back* to disk later.
     */
    $classifier = Classifier::getClassifier($label);
    $classifier->prepareToLabel();
    $data = array();
    switch ($arg) {
        case 'getdocs':
            /* Load documents in from a user-specified index, and find the
               next best one to label (for 'manual' source type), or label
               them all with a single label (for either the 'positive' or
               'negative' source types).
             */
            $mix_iterator = $this->buildClassifierCrawlMix(
                $label, $index, $keywords);
            if ($source_type == 'manual') {
                $num_docs = $classifier->initBuffer($mix_iterator);
                $classifier->computeBufferDensities();
                $data['num_docs'] = $num_docs;
                list($new_doc, $disagreement) =
                    $classifier->findNextDocumentToLabel();
                if ($new_doc) {
                    $score = $classifier->classify($new_doc);
                    $data['new_doc'] = $this->prepareUnlabelledDocument(
                        $new_doc, $score, $disagreement, $index, $keywords);
                }
                Classifier::setClassifier($classifier);
            } else if ($source_type == 'positive' ||
                $source_type == 'negative') {
                $doc_label = $source_type == 'positive' ? 1 : -1;
                $add_count = $classifier->addAllDocuments(
                    $mix_iterator, $doc_label);
                if ($add_count > 0) {
                    /* Pass true to always update accuracy after adding a
                       batch of documents all at once.
                     */
                    $classifier->train(true);
                    Classifier::setClassifier($classifier);
                }
                $data['add_count'] = $add_count;
            }
            break;
        case 'addlabel':
            /* First label the last candidate document presented to the
               user (potentially skipping it instead of actually applying a
               label), then pick the next best candidate for labeling. When
               skipping a document instead of adding a label, avoid
               re-training since the training set hasn't actually changed.
             */
            $doc = $_REQUEST['doc_to_label'];
            $docid = $this->clean($doc['docid'], 'int');
            $key = webdecode($this->clean($doc['key'], 'string'));
            $doc_label = $this->clean($doc['label'], 'int');
            $mix_iterator = $this->retrieveClassifierCrawlMix($label);
            $labels_changed = $classifier->labelDocument($key, $doc_label);
            $num_docs = $classifier->refreshBuffer($mix_iterator);
            $classifier->computeBufferDensities();
            $data['num_docs'] = $num_docs;
            if ($labels_changed) {
                $update_accuracy = $classifier->total > 0 &&
                    $classifier->total % 10 == 0;
                $classifier->train($update_accuracy);
            }
            list($new_doc, $disagreement) =
                $classifier->findNextDocumentToLabel();
            if ($new_doc) {
                $score = $classifier->classify($new_doc);
                $data['new_doc'] = $this->prepareUnlabelledDocument(
                    $new_doc, $score, $disagreement, $index, $keywords);
            }
            Classifier::setClassifier($classifier);
            break;
        case 'updateaccuracy':
            /* Don't do anything other than re-compute the accuracy for the
               current training set.
             */
            $classifier->updateAccuracy();
            Classifier::setClassifier($classifier);
            break;
    }
    /* No matter which activity we ended up carrying out, always include
       the statistics that *might* have changed so that the client can just
       naively keep them up to date.
     */
    $data['positive'] = $classifier->positive;
    $data['negative'] = $classifier->negative;
    $data['total'] = $classifier->total;
    $data['accuracy'] = $classifier->accuracy;
    /* Pass along a new authentication token so that the client can make a
       new authenticated request after this one.
     */
    $data['authTime'] = strval(time());
    $data['authSession'] = md5($data['authTime'] . AUTH_KEY);
    $response = json_encode($data);
    header("Content-Type: application/json");
    header("Content-Length: " . strlen($response));
    echo $response;
}
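/*
 * Response sketch (illustrative, not part of the original source): a
 * possible JSON payload for the 'addlabel' case; the numbers are
 * hypothetical, and the shape of new_doc comes from
 * prepareUnlabelledDocument(), which is not shown here.
 *
 *     {"num_docs": 93, "new_doc": { ... }, "positive": 12, "negative": 30,
 *      "total": 42, "accuracy": 0.85, "authTime": "1397324612",
 *      "authSession": "<md5 of authTime . AUTH_KEY>"}
 */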
/**
 * Handles a request for the array of news feed sources this machine is
 * responsible for. A feed is assigned to the requesting machine when the
 * hash of the feed's name, taken modulo the number of machines, equals the
 * index of that machine's hashed url/name in the array of available
 * machine hashes.
 */
function getNewsSources()
{
    if (!isset($_REQUEST["arg"])) {
        return;
    }
    $source_model = $this->model("source");
    $current_machine = $this->clean(webdecode($_REQUEST["arg"]), "string");
    $machine_hashes = $source_model->getMachineHashUrls();
    $machine_index_match = array_search($current_machine, $machine_hashes);
    if ($machine_index_match === false) {
        echo webencode(serialize(array()));
        return;
    }
    $num_machines = count($machine_hashes);
    $pre_feeds = $source_model->getMediaSources("rss");
    $pre_feeds = array_merge($pre_feeds,
        $source_model->getMediaSources("html"));
    if (!$pre_feeds) {
        return false;
    }
    $feeds = array();
    foreach ($pre_feeds as $pre_feed) {
        if (!isset($pre_feed['NAME'])) {
            continue;
        }
        $hash_int = unpack("N", crawlHash($pre_feed['NAME']));
        if (!isset($hash_int[1])) {
            continue;
        }
        $hash_index = $hash_int[1] % $num_machines;
        if ($machine_index_match != $hash_index) {
            continue;
        }
        if ($pre_feed['TYPE'] == 'html') {
            list($pre_feed['CHANNEL_PATH'], $pre_feed['ITEM_PATH'],
                $pre_feed['TITLE_PATH'], $pre_feed['DESCRIPTION_PATH'],
                $pre_feed['LINK_PATH']) = explode("###",
                html_entity_decode($pre_feed['AUX_INFO']));
        }
        $feeds[] = $pre_feed;
    }
    echo webencode(serialize($feeds));
}
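/*
 * Partitioning sketch (illustrative, not part of the original source): each
 * feed is assigned to exactly one news updater machine by hashing its NAME
 * and reducing modulo the number of machines, mirroring the loop above. The
 * feed name is hypothetical.
 *
 *     $hash_int = unpack("N", crawlHash("Example Feed"));
 *     $responsible_machine = $hash_int[1] % $num_machines;
 */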
/**
 * The dual of loadClassifiersData, this static method reconstitutes a
 * Classifier instance from an array containing the necessary data. This
 * gets called by each fetcher, using the data that it receives from the
 * name server when establishing a new crawl.
 *
 * @param array $data associative array mapping property names to their
 *     serialized and compressed data
 * @return object Classifier instance built from the passed-in data
 */
static function newClassifierFromData($data)
{
    if (!isset($data['classifier'])) {
        return NULL;
    }
    $classifier = unserialize(webdecode($data['classifier']));
    unset($data['classifier']);
    foreach ($data as $field => $field_data) {
        $field_data = webdecode($field_data);
        $serialized_data = gzuncompress($field_data);
        $classifier->$field = unserialize($serialized_data);
    }
    $classifier->loaded_properties = array_keys($data);
    return $classifier;
}
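/*
 * Packing sketch (illustrative, not part of the original source): data in
 * the shape this method expects, assuming webencode() is the inverse of
 * webdecode(). The 'classifier' entry is serialized without compression;
 * every other field is serialized, gzcompress'd, then webencode'd. The
 * field name 'some_field' is hypothetical.
 *
 *     $data = array(
 *         'classifier' => webencode(serialize($classifier)),
 *         'some_field' => webencode(gzcompress(serialize($field_value)))
 *     );
 *     $copy = Classifier::newClassifierFromData($data);
 */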