Example #1
 /**
  * After robot, schedule, and index data have been uploaded and reassembled
  * as one big data file/string, this function splits that string into
  * each of these data types and then saves the result into the appropriate
  * schedule sub-folder. Any temporary files used during uploading are then
  * deleted.
  *
  * @param string $filename name of temp file used to upload big string.
  *     If uploaded data was small enough to be uploaded in one go, then
  *     this should be "" -- the variable $_REQUEST["part"] will be used
  *     instead
  * @return string $logging diagnostic info to be sent to the fetcher about
  *     what was done
  */
 function handleUploadedData($filename = "")
 {
     if ($filename == "") {
         $uploaded = $_REQUEST['part'];
     } else {
         $uploaded = file_get_contents($filename);
         unlink($filename);
     }
     $logging = "... Data upload complete\n";
     $address = str_replace(".", "-", $_SERVER['REMOTE_ADDR']);
     $address = str_replace(":", "_", $address);
     $time = time();
     $day = floor($time / ONE_DAY);
     $byte_counts = array();
     if (isset($_REQUEST['byte_counts'])) {
         $byte_counts = unserialize(webdecode($_REQUEST['byte_counts']));
     }
     $robot_data = "";
     $cache_page_validation_data = "";
     $schedule_data = "";
     $index_data = "";
     if (isset($byte_counts["TOTAL"]) && $byte_counts["TOTAL"] > 0) {
         $pos = 0;
         $robot_data = substr($uploaded, $pos, $byte_counts["ROBOT"]);
         $pos += $byte_counts["ROBOT"];
         $cache_page_validation_data = substr($uploaded, $pos, $byte_counts["CACHE_PAGE_VALIDATION"]);
         $pos += $byte_counts["CACHE_PAGE_VALIDATION"];
         $schedule_data = substr($uploaded, $pos, $byte_counts["SCHEDULE"]);
         $pos += $byte_counts["SCHEDULE"];
         $index_data = substr($uploaded, $pos);
     }
     if (strlen($robot_data) > 0) {
         $this->addScheduleToScheduleDirectory(self::robot_data_base_name, $robot_data);
     }
     if (USE_ETAG_EXPIRES && strlen($cache_page_validation_data) > 0) {
         $this->addScheduleToScheduleDirectory(self::etag_expires_data_base_name, $cache_page_validation_data);
     }
     if (strlen($schedule_data) > 0) {
         $this->addScheduleToScheduleDirectory(self::schedule_data_base_name, $schedule_data);
     }
     if (strlen($index_data) > 0) {
         $this->addScheduleToScheduleDirectory(self::index_data_base_name, $index_data);
     }
     return $logging;
 }
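For context, a minimal fetcher-side sketch of the payload that handleUploadedData() slices apart above. The $robot_info/$etag_info/$schedule_info/$index_info variables and the use of serialize() for the blobs are assumptions for illustration; only the concatenation order, the byte_counts keys, and the webencode(serialize(...)) wrapping mirror the decoding shown in the function.
 // Hypothetical sketch: build the single upload string and the matching
 // byte_counts array the server uses to split it back apart.
 $robot_data = serialize($robot_info);       // assumed robot data blob
 $etag_data = serialize($etag_info);         // cache page validation blob
 $schedule_data = serialize($schedule_info); // schedule blob
 $index_data = serialize($index_info);       // index blob
 $byte_counts = array(
     "ROBOT" => strlen($robot_data),
     "CACHE_PAGE_VALIDATION" => strlen($etag_data),
     "SCHEDULE" => strlen($schedule_data),
     "TOTAL" => strlen($robot_data) + strlen($etag_data) +
         strlen($schedule_data) + strlen($index_data),
 );
 // Concatenation order must match the substr() order used by the server:
 // ROBOT, then CACHE_PAGE_VALIDATION, then SCHEDULE, with the rest as index.
 $upload = $robot_data . $etag_data . $schedule_data . $index_data;
 $post_fields = array(
     "byte_counts" => webencode(serialize($byte_counts)),
     "part" => $upload, // small payloads can travel in a single 'part'
 );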
Example #2
 /**
  * Processes a cache page validation data file. Extracts key-value pairs
  * from the file and inserts them into the B-Tree used for storing cache
  * page validation data.
  * @param string $file is the cache page validation data file written by
  * Fetchers.
  */
 function processEtagExpiresArchive($file)
 {
     crawlLog("Scheduler Processing etag expires http header data in {$file}");
     $start_time = microtime();
     $etag_expires_data = unserialize(gzuncompress(webdecode(file_get_contents($file))));
     crawlLog("Scheduler Done uncompressing etag data." . " Starting to add to btree");
     $num_entries = count($etag_expires_data);
     $i = 0;
     foreach ($etag_expires_data as $data) {
         crawlTimeoutLog("..Scheduler still etag processing on item %s of %s.", $i, $num_entries);
         $i++;
         $link = $data[0];
         $value = $data[1];
         $key = crawlHash($link, true);
         $entry = array($key, $value);
         $this->web_queue->etag_btree->insert($entry);
     }
     crawlLog(" time: " . changeInMicrotime($start_time) . "\n");
     crawlLog("Scheduler Done processing etag expires http" . " header data file: {$file}");
     unlink($file);
 }
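As a rough sketch of the file format the reader above expects, here is a hypothetical writer that applies the reverse of the unserialize(gzuncompress(webdecode(...))) chain. The exact shape of each entry's second element (the cache validation value) is an assumption; only the array-of-pairs layout and the encoding chain come from the code above.
 // Hypothetical writer: an array of array($url, $cache_validation_info)
 // pairs, run through serialize -> gzcompress -> webencode.
 $etag_expires_data = array(
     array("http://www.example.com/a", array("ETAG" => '"abc123"')),
     array("http://www.example.com/b", array("EXPIRES" => 1893456000)),
 );
 file_put_contents("EtagExpires_sketch.txt",
     webencode(gzcompress(serialize($etag_expires_data))));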
Example #3
 /**
  * In a multiple queue server setting, gets summaries for a set of documents
  * by their urls, or by groups of 5-tuples of the form
  * (machine, key, index, generation, offset). This makes an execMachines
  * call to make a network request to the CrawlControllers on each machine
  * which in turn calls getCrawlItems (and thence nonNetworkGetCrawlItems)
  * on each machine. The results are then sent back to networkGetCrawlItems
  * and aggregated.
  *
  * @param array $lookups things whose summaries we are trying to look up
  * @param array $machine_urls an array of urls of yioop queue servers
  * @return array of summary data for the matching documents
  */
 function networkGetCrawlItems($lookups, $machine_urls)
 {
     //Set-up network request
     $machines = array();
     $indexes = array();
     $num_machines = count($machine_urls);
     foreach ($lookups as $lookup => $lookup_info) {
         if (count($lookup_info) == 2 && ($lookup_info[0][0] === 'h' || $lookup_info[0][0] === 'r' || $lookup_info[0][0] === 'g')) {
             $machines = $machine_urls;
             break;
         } else {
             foreach ($lookup_info as $lookup_item) {
                 $out_lookup_info = array();
                 if (count($lookup_item) == 5) {
                     list($index, , , , ) = $lookup_item;
                     $machines[$index] = $machine_urls[$index];
                 } else {
                     $machines = $machine_urls;
                     break;
                 }
             }
         }
     }
     //Make request
     $page_set = $this->execMachines("getCrawlItems", $machines, serialize($lookups), $num_machines);
     //Aggregate results
     $summaries = array();
     $elapsed_times = array();
     if (is_array($page_set)) {
         foreach ($page_set as $elt) {
             $description_hash = array();
             $result = @unserialize(webdecode($elt[self::PAGE]));
             if (!is_array($result)) {
                 $elapsed_times[] = 0;
                 continue;
             }
             $elapsed_times[] = $result["ELAPSED_TIME"];
             unset($result["ELAPSED_TIME"]);
             $ellipsis = "";
             foreach ($result as $lookup => $summary) {
                 if (isset($summaries[$lookup])) {
                     if (isset($summary[self::DESCRIPTION])) {
                         $description = trim($summary[self::DESCRIPTION]);
                         if (!isset($summaries[$lookup][self::DESCRIPTION])) {
                             $summaries[$lookup][self::DESCRIPTION] = "";
                         }
                         if (!isset($description_hash[$description])) {
                             $summaries[$lookup][self::DESCRIPTION] = $ellipsis . $description;
                             $ellipsis = " .. ";
                             $description_hash[$description] = true;
                         }
                     }
                     foreach ($summary as $attr => $value) {
                         if ($attr != self::DESCRIPTION && !isset($summaries[$lookup][$attr])) {
                             $summaries[$lookup][$attr] = $value;
                         }
                     }
                 } else {
                     $summaries[$lookup] = $summary;
                 }
             }
         }
         $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
         if ($summary_times_string) {
             $all_elapsed_times = unserialize($summary_times_string);
         } else {
             $all_elapsed_times = array();
         }
         $all_elapsed_times[] = $elapsed_times;
         AnalyticsManager::set("SUMMARY_TIMES", serialize($all_elapsed_times));
     }
     return $summaries;
 }
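A sketch of the two $lookups shapes the routing loop above distinguishes: a two-element info array whose first entry begins with 'h', 'r', or 'g' (which sends the request to every queue server), versus a list of 5-tuples (machine, key, index, generation, offset) whose machine component selects a single server. The concrete keys and values below are illustrative assumptions, and $crawl_model stands for an instance of the model that defines this method.
 // Shape 1: every machine in $machine_urls is queried.
 $lookups_by_key = array(
     "item1" => array("h" . crawlHash("http://www.example.com/"),
         "1400000000"),
 );
 // Shape 2: 5-tuples; only $machine_urls[1] would be contacted here.
 $lookups_by_tuple = array(
     "item2" => array(
         array(1, crawlHash("http://www.example.com/", true),
             "1400000000", 0, 123456),
     ),
 );
 $summaries = $crawl_model->networkGetCrawlItems($lookups_by_tuple,
     $machine_urls);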
Example #4
 /**
  * During an archive crawl this method is used to get from the name server
  * a collection of pages to process. The fetcher will later process these
  * and send summaries to various queue_servers.
  *
  * @return array containing archive page data
  */
 function checkArchiveScheduler()
 {
     $start_time = microtime();
     /*
         It's still important to switch queue servers, so that we send new
         data to each server each time we fetch
         new data from the name server.
     */
     $this->selectCurrentServerAndUpdateIfNeeded(false);
     $chunk = false;
     if (generalIsA($this->arc_type . "Iterator", "TextArchiveBundleIterator")) {
         $archive_iterator = $this->archive_iterator;
         $chunk = true;
         $info = array();
         $max_offset = TextArchiveBundleIterator::BUFFER_SIZE + TextArchiveBundleIterator::MAX_RECORD_SIZE;
         if ($archive_iterator->buffer_fh && $archive_iterator->current_offset < $max_offset) {
             crawlLog("Local Iterator Offset: " . $archive_iterator->current_offset);
             crawlLog("Local Max Offset: " . $max_offset);
             $info[self::ARC_DATA] = $archive_iterator->nextPages(ARCHIVE_BATCH_SIZE);
             crawlLog("Time to get archive data from local buffer " . changeInMicrotime($start_time));
         }
         if ($archive_iterator->buffer_fh && $archive_iterator->current_offset < $max_offset) {
             return $info;
         }
         if (isset($info[self::ARC_DATA]) && count($info[self::ARC_DATA]) > 0) {
             $arc_data = $info[self::ARC_DATA];
         }
         crawlLog("Done processing Local Buffer, requesting more data...");
     }
     crawlLog("Fetching Archive data from name server with request:");
     $name_server = $this->name_server;
     $time = time();
     $session = md5($time . AUTH_KEY);
     $prefix = $this->fetcher_num . "-";
     $request = $name_server . "?c=fetch&a=archiveSchedule&time={$time}" . "&session={$session}&robot_instance=" . $prefix . ROBOT_INSTANCE . "&machine_uri=" . WEB_URI . "&crawl_time=" . $this->crawl_time . "&check_crawl_time=" . $this->check_crawl_time;
     crawlLog($request);
     $response_string = FetchUrl::getPage($request, NULL, true);
     if ($response_string === false) {
         crawlLog("The following request failed:");
         crawlLog($request);
         return false;
     }
     if ($response_string) {
         $info = @unserialize($response_string);
     } else {
         $info = array();
         $info[self::STATUS] = self::NO_DATA_STATE;
     }
     $this->setCrawlParamsFromArray($info);
     if (isset($info[self::DATA])) {
         /* Unpack the archive data and return it in the $info array; also
            write a copy to disk in case something goes wrong. */
         $pages = unserialize(gzuncompress(webdecode($info[self::DATA])));
         if ($chunk) {
             if (isset($pages[self::ARC_DATA])) {
                 if (isset($pages[self::INI])) {
                     $archive_iterator->setIniInfo($pages[self::INI]);
                 }
                 if ($pages[self::ARC_DATA]) {
                     $archive_iterator->makeBuffer($pages[self::ARC_DATA]);
                 }
                 if (isset($pages[self::HEADER]) && is_array($pages[self::HEADER]) && $pages[self::HEADER] != array()) {
                     $archive_iterator->header = $pages[self::HEADER];
                 }
                 if (!$pages[self::START_PARTITION]) {
                     $archive_iterator->nextPages(1);
                 }
                 if (isset($pages[self::PARTITION_NUM])) {
                     crawlLog("  Done get data" . " from file {$pages[self::PARTITION_NUM]}");
                 }
                 if (isset($pages[self::NUM_PARTITIONS])) {
                     crawlLog("  of {$pages[self::NUM_PARTITIONS]} files.");
                 }
             }
             if (isset($arc_data)) {
                 $info[self::ARC_DATA] = $arc_data;
             }
         } else {
             $info[self::ARC_DATA] = $pages;
         }
     } else {
         if (isset($info['ARCHIVE_BUNDLE_ERROR'])) {
             crawlLog("  " . $info['ARCHIVE_BUNDLE_ERROR']);
         }
     }
     crawlLog("Time to fetch archive data from name server " . changeInMicrotime($start_time));
     return $info;
 }
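The following driver loop is a hypothetical sketch of how checkArchiveScheduler() might be consumed, not the actual fetch loop: $fetcher is assumed to be the object defining the method, and CrawlConstants is assumed to be the class/interface providing the ARC_DATA, STATUS, and NO_DATA_STATE constants referenced as self::... above.
 // Hypothetical driver sketch only.
 while (true) {
     $info = $fetcher->checkArchiveScheduler();
     if ($info === false) {
         sleep(5); // name server request failed; pause before retrying
         continue;
     }
     if (!empty($info[CrawlConstants::ARC_DATA])) {
         // each entry is one archive record to summarize and later send
         // on to the queue servers; here we just log the count
         crawlLog("Sketch: received " .
             count($info[CrawlConstants::ARC_DATA]) . " archive records");
     }
     if (isset($info[CrawlConstants::STATUS]) &&
         $info[CrawlConstants::STATUS] == CrawlConstants::NO_DATA_STATE) {
         break; // nothing more to fetch in this sketch
     }
 }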
Example #5
 /**
  * Computes for each word in an array of words a count of the total number
  * of times it occurs in this crawl model's default index.
  *
  * @param array $words words to find the counts for
  * @param array $machine_urls machines to invoke this command on
  * @return array associative array of word => counts
  */
 function countWords($words, $machine_urls = NULL)
 {
     if ($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
         $count_strings = $this->execMachines("countWords", $machine_urls, serialize(array($words, $this->index_name)));
         $word_counts = array();
         foreach ($count_strings as $count_string) {
             $a_word_counts = unserialize(webdecode($count_string[self::PAGE]));
             if (is_array($a_word_counts)) {
                 foreach ($a_word_counts as $word => $count) {
                     $word_counts[$word] = isset($word_counts[$word]) ? $word_counts[$word] + $count : $count;
                 }
             }
         }
         return $word_counts;
     }
     $index_archive = IndexManager::getIndex($this->index_name);
     $hashes = array();
     $lookup = array();
     foreach ($words as $word) {
         $tmp = crawlHash($word);
         $hashes[] = $tmp;
         $lookup[$tmp] = $word;
     }
     $word_key_counts = $index_archive->countWordKeys($hashes);
     $phrases = array();
     $word_counts = array();
     if (is_array($word_key_counts) && count($word_key_counts) > 0) {
         foreach ($word_key_counts as $word_key => $count) {
             $word_counts[$lookup[$word_key]] = $count;
         }
     }
     return $word_counts;
 }
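A short usage sketch; $crawl_model stands for an instance of the model that defines countWords(), the index_name assignment and the urls are placeholders, and the counts shown are made-up illustrations.
 // Local case: counts come straight from the default index.
 $crawl_model->index_name = "1400000000"; // assumed crawl timestamp
 $local_counts = $crawl_model->countWords(array("chess", "tennis"));
 // Network case: each queue server is queried via execMachines() and the
 // per-word counts are summed across machines.
 $machine_urls = array("http://queue1.example.com/",
     "http://queue2.example.com/");
 $network_counts = $crawl_model->countWords(array("chess", "tennis"),
     $machine_urls);
 // e.g. array("chess" => 1042, "tennis" => 317)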
Example #6
 /**
  * Gets a list of rss news feed info either from the local database or
  * from the name server (in a multiple news feeder setting)
  *
  * @param bool $use_cache whether to use a previously cached news list
  * @return array $feeds info about all the news sources this machine
  *      is responsible for
  */
 function getNewsSources($use_cache = true)
 {
     static $feeds = array();
     if ($use_cache && $feeds != array()) {
         return $feeds;
     }
     if (MULTIPLE_NEWS_UPDATER) {
         $current_machine = file_get_contents(WORK_DIRECTORY . "/schedules/current_machine_info.txt");
         $pre_feeds = $this->execMachines("getNewsSources", array(NAME_SERVER), $current_machine);
         $feeds = array();
         if (isset($pre_feeds[0][self::PAGE])) {
             $feeds = unserialize(webdecode($pre_feeds[0][self::PAGE]));
         }
     } else {
         $db = $this->db;
         $sql = "SELECT * FROM MEDIA_SOURCE WHERE (TYPE='rss'\n                 OR TYPE='html')";
         $result = $db->execute($sql);
         $i = 0;
         while ($feeds[$i] = $this->db->fetchArray($result)) {
             if ($feeds[$i]['TYPE'] == 'html') {
                 list($feeds[$i]['CHANNEL_PATH'], $feeds[$i]['ITEM_PATH'], $feeds[$i]['TITLE_PATH'], $feeds[$i]['DESCRIPTION_PATH'], $feeds[$i]['LINK_PATH']) = explode("###", html_entity_decode($feeds[$i]['AUX_INFO']));
             }
             $i++;
         }
         unset($feeds[$i]);
         //last one will be null
     }
     return $feeds;
 }
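As the explode() call above implies, an 'html' media source is assumed to store its five extraction paths in AUX_INFO as a single "###"-separated string. A minimal sketch of building such a value; the XPath-like strings are purely illustrative.
 // Hypothetical AUX_INFO value for an 'html' news source: five paths
 // joined by "###" in the same order the code above unpacks them.
 $aux_info = implode("###", array(
     "//div[@id='news']",      // CHANNEL_PATH
     "//div[@class='story']",  // ITEM_PATH
     ".//h2",                  // TITLE_PATH
     ".//p[@class='blurb']",   // DESCRIPTION_PATH
     ".//a/@href",             // LINK_PATH
 ));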
Example #7
 /**
  * Finds the next document for which to request a label, sometimes first
  * recording the label that the user selected for the last document. This
  * method should only be called via an XmlHttpRequest initiated by the edit
  * classifier JavaScript, and consequently it always writes out
  * JSON-encoded data, which is easily decoded by the page JavaScript.
  */
 function classify()
 {
     $arg = $this->clean($_REQUEST['arg'], 'string');
     $label = $this->clean($_REQUEST['label'], 'string');
     if (isset($_REQUEST['index'])) {
         $index = $this->clean($_REQUEST['index'], 'int');
         if (intval($index) == 1) {
             $index = $this->model("crawl")->getCurrentIndexDatabaseName();
         }
         $source_type = $this->clean($_REQUEST['type'], 'string');
         $keywords = $this->clean($_REQUEST['keywords'], 'string');
     }
     /*
       The call to prepareToLabel is important; it loads all of the data
       required to manage the training set from disk, and also determines
       what will be saved *back* to disk later.
     */
     $classifier = Classifier::getClassifier($label);
     $classifier->prepareToLabel();
     $data = array();
     switch ($arg) {
         case 'getdocs':
             /*
               Load documents in from a user-specified index, and find the
               next best one to label (for 'manual' source type), or label
               them all with a single label (for either the 'positive' or
               'negative' source types).
             */
             $mix_iterator = $this->buildClassifierCrawlMix($label, $index, $keywords);
             if ($source_type == 'manual') {
                 $num_docs = $classifier->initBuffer($mix_iterator);
                 $classifier->computeBufferDensities();
                 $data['num_docs'] = $num_docs;
                 list($new_doc, $disagreement) = $classifier->findNextDocumentToLabel();
                 if ($new_doc) {
                     $score = $classifier->classify($new_doc);
                     $data['new_doc'] = $this->prepareUnlabelledDocument($new_doc, $score, $disagreement, $index, $keywords);
                 }
                 Classifier::setClassifier($classifier);
             } else {
                 if ($source_type == 'positive' || $source_type == 'negative') {
                     $doc_label = $source_type == 'positive' ? 1 : -1;
                     $add_count = $classifier->addAllDocuments($mix_iterator, $doc_label);
                     if ($add_count > 0) {
                         /*
                           Pass true to always update accuracy after adding a
                           batch of documents all at once.
                         */
                         $classifier->train(true);
                         Classifier::setClassifier($classifier);
                     }
                     $data['add_count'] = $add_count;
                 }
             }
             break;
         case 'addlabel':
             /*
               First label the last candidate document presented to the
               user (potentially skipping it instead of actually applying a
               label), then pick the next best candidate for labeling.
               When skipping a document instead of adding a label, avoid
               re-training since the training set hasn't actually changed.
             */
             $doc = $_REQUEST['doc_to_label'];
             $docid = $this->clean($doc['docid'], 'int');
             $key = webdecode($this->clean($doc['key'], 'string'));
             $doc_label = $this->clean($doc['label'], 'int');
             $mix_iterator = $this->retrieveClassifierCrawlMix($label);
             $labels_changed = $classifier->labelDocument($key, $doc_label);
             $num_docs = $classifier->refreshBuffer($mix_iterator);
             $classifier->computeBufferDensities();
             $data['num_docs'] = $num_docs;
             if ($labels_changed) {
                 $update_accuracy = $classifier->total > 0 && $classifier->total % 10 == 0;
                 $classifier->train($update_accuracy);
             }
             list($new_doc, $disagreement) = $classifier->findNextDocumentToLabel();
             if ($new_doc) {
                 $score = $classifier->classify($new_doc);
                 $data['new_doc'] = $this->prepareUnlabelledDocument($new_doc, $score, $disagreement, $index, $keywords);
             }
             Classifier::setClassifier($classifier);
             break;
         case 'updateaccuracy':
             /*
               Don't do anything other than re-compute the accuracy for the
               current training set.
             */
             $classifier->updateAccuracy();
             Classifier::setClassifier($classifier);
             break;
     }
     /*
       No matter which activity we ended up carrying out, always include
       the statistics that *might* have changed so that the client can just
       naively keep them up to date.
     */
     $data['positive'] = $classifier->positive;
     $data['negative'] = $classifier->negative;
     $data['total'] = $classifier->total;
     $data['accuracy'] = $classifier->accuracy;
     /*
       Pass along a new authentication token so that the client can make a
       new authenticated request after this one.
     */
     $data['authTime'] = strval(time());
     $data['authSession'] = md5($data['authTime'] . AUTH_KEY);
     $response = json_encode($data);
     header("Content-Type: application/json");
     header("Content-Length: " . strlen($response));
     echo $response;
 }
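For reference, a sketch of the request parameters classify() reads from $_REQUEST for the 'addlabel' activity, and of the JSON fields it writes back. Every concrete value is illustrative, and the label conventions noted in the comments are assumptions inferred from the positive/negative handling above.
 // Hypothetical 'addlabel' request parameters.
 $doc_key = crawlHash("http://www.example.com/doc", true); // placeholder key
 $request_params = array(
     "arg" => "addlabel",
     "label" => "spam",           // name of the classifier being edited
     "index" => "1400000000",     // crawl index supplying candidate docs
     "type" => "manual",
     "keywords" => "casino poker",
     "doc_to_label" => array(
         "docid" => "42",
         "key" => webencode($doc_key), // server side webdecode()s this
         "label" => "1",               // assumed: 1 = in class, -1 = not
     ),
 );
 // The JSON reply always carries the running statistics, e.g.:
 // {"num_docs": ..., "new_doc": {...}, "positive": ..., "negative": ...,
 //  "total": ..., "accuracy": ..., "authTime": "...", "authSession": "..."}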
Example #8
 /**
  * Handles a request for the array of news feed sources that hash to a
  * particular value, i.e., those whose name hash maps to the index of the
  * requesting machine's hashed url/name in the array of available machine
  * hashes
  */
 function getNewsSources()
 {
     if (!isset($_REQUEST["arg"])) {
         return;
     }
     $source_model = $this->model("source");
     $current_machine = $this->clean(webdecode($_REQUEST["arg"]), "string");
     $machine_hashes = $source_model->getMachineHashUrls();
     $machine_index_match = array_search($current_machine, $machine_hashes);
     if ($machine_index_match === false) {
         echo webencode(serialize(array()));
         return;
     }
     $num_machines = count($machine_hashes);
     $pre_feeds = $source_model->getMediaSources("rss");
     $pre_feeds = array_merge($pre_feeds, $source_model->getMediaSources("html"));
     if (!$pre_feeds) {
         return false;
     }
     $feeds = array();
     foreach ($pre_feeds as $pre_feed) {
         if (!isset($pre_feed['NAME'])) {
             continue;
         }
         $hash_int = unpack("N", crawlHash($pre_feed['NAME']));
         if (!isset($hash_int[1])) {
             continue;
         }
         $hash_index = $hash_int[1] % $num_machines;
         if ($machine_index_match != $hash_index) {
             continue;
         }
         if ($pre_feed['TYPE'] == 'html') {
             list($pre_feed['CHANNEL_PATH'], $pre_feed['ITEM_PATH'], $pre_feed['TITLE_PATH'], $pre_feed['DESCRIPTION_PATH'], $pre_feed['LINK_PATH']) = explode("###", html_entity_decode($pre_feed['AUX_INFO']));
         }
         $feeds[] = $pre_feed;
     }
     echo webencode(serialize($feeds));
 }
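The partitioning rule above can be isolated into a tiny helper, sketched here; the function name is hypothetical, but the unpack("N", crawlHash(...)) % $num_machines arithmetic mirrors the loop above.
 // Hypothetical helper: maps a feed NAME to a machine slot. A feed is
 // served by this machine only when the value equals the machine's own
 // position in getMachineHashUrls().
 function feedMachineIndexSketch($feed_name, $num_machines)
 {
     $hash_int = unpack("N", crawlHash($feed_name));
     if (!isset($hash_int[1])) {
         return false;
     }
     // first four bytes of the hash as an unsigned big-endian int,
     // reduced modulo the number of machines
     return $hash_int[1] % $num_machines;
 }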
Example #9
 /**
  * The dual of loadClassifiersData, this static method reconstitutes a
  * Classifier instance from an array containing the necessary data. This
  * gets called by each fetcher, using the data that it receives from the
  * name server when establishing a new crawl.
  *
  * @param array $data associative array mapping property names to their
  * serialized and compressed data
  * @return object Classifier instance built from the passed-in data
  */
 static function newClassifierFromData($data)
 {
     if (!isset($data['classifier'])) {
         return NULL;
     }
     $classifier = unserialize(webdecode($data['classifier']));
     unset($data['classifier']);
     foreach ($data as $field => $field_data) {
         $field_data = webdecode($field_data);
         $serialized_data = gzuncompress($field_data);
         $classifier->{$field} = unserialize($serialized_data);
     }
     $classifier->loaded_properties = array_keys($data);
     return $classifier;
 }
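A sketch of the packing this method undoes, i.e. roughly what the loadClassifiersData counterpart is assumed to produce: the bare classifier webencode(serialize(...))'d, and each extra field serialize -> gzcompress -> webencode'd. The property names "weights" and "features" are illustrative only, and the assumption that the static method lives on the Classifier class follows from the docblock.
 // Hypothetical packing sketch, mirroring the decoding chain above.
 $data = array(
     // the bare Classifier object itself: no gzip on this field
     "classifier" => webencode(serialize($classifier)),
 );
 foreach (array("weights", "features") as $field) {
     // each heavyweight property: serialize -> gzcompress -> webencode
     $data[$field] = webencode(gzcompress(serialize($classifier->$field)));
 }
 // A fetcher would then rebuild the object with something like:
 // $new_classifier = Classifier::newClassifierFromData($data);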