Example #1
 /**
  * When a crawl is being shut down, this function is called to write
  * the contents of the web queue bundle back to schedules. This allows
  * crawls to be resumed without losing urls. This function can also be
  * called to reschedule the queue's contents for a later time if the
  * queue gets clogged.
  *
  * @param bool $for_reschedule if true, the call is to reschedule the
  *     urls to be crawled at a later time; if false, the urls are being
  *     saved because the crawl is being halted.
  */
 function dumpQueueToSchedules($for_reschedule = false)
 {
     if (!$for_reschedule) {
         $this->writeAdminMessage("SHUTDOWN_QUEUE");
     }
     if (!isset($this->web_queue->to_crawl_queue)) {
         crawlLog("URL queue appears to be empty or NULL");
         return;
     }
     crawlLog("Writing queue contents back to schedules...");
     $dir = CRAWL_DIR . "/schedules/" . self::schedule_data_base_name . $this->crawl_time;
     if (!file_exists($dir)) {
         mkdir($dir);
         chmod($dir, 0777);
     }
     $now = time();
     if ($for_reschedule) {
         $day = floor($now / ONE_DAY);
         $note_string = "Reschedule";
     } else {
         $day = floor($this->crawl_time / ONE_DAY) - 1;
         //want before all other schedules, so will be reloaded first
         $note_string = "";
     }
     $dir .= "/{$day}";
     if (!file_exists($dir)) {
         mkdir($dir);
         chmod($dir, 0777);
     }
     //get rid of previous restart attempts, if present
     if (!$for_reschedule) {
         $this->db->unlinkRecursive($dir, false);
     }
     $count = $this->web_queue->to_crawl_queue->count;
     $old_time = 1;
     $schedule_data = array();
     $schedule_data[self::SCHEDULE_TIME] = $this->crawl_time;
     $schedule_data[self::TO_CRAWL] = array();
     $fh = $this->web_queue->openUrlArchive();
     for ($time = 1; $time < $count; $time++) {
         crawlTimeoutLog("..have written %s urls of %s urls so far", $time, $count);
         $tmp = $this->web_queue->peekQueue($time, $fh);
         // if there was a queue read error, skip this entry
         if ($tmp === false) {
             continue;
         }
         list($url, $weight, , ) = $tmp;
         if (strcmp($url, "LOOKUP ERROR") == 0) {
             continue;
         }
         /* For the fetcher, hash is a hash of
          * link_num . hash_of_page_link_on in the case below. Either the
          * url or the hash can be used to determine if the page has been
          * seen. In the case of a dump, we choose the hash so that only
          * the url affects deduplication.
          */
         $hash = crawlHash($now . $url);
         if ($for_reschedule) {
             $schedule_time = $time + $now;
         } else {
             $schedule_time = $time;
         }
         $schedule_data[self::TO_CRAWL][] = array($url, $weight, $hash);
         if ($time - $old_time >= MAX_FETCH_SIZE) {
             if (count($schedule_data[self::TO_CRAWL]) > 0) {
                 $data_string = webencode(gzcompress(serialize($schedule_data)));
                 $data_hash = crawlHash($data_string);
                 file_put_contents($dir . "/At" . $schedule_time . "From127-0-0-1" . $note_string . "WithHash{$data_hash}.txt", $data_string);
                 $data_string = "";
                 $schedule_data[self::TO_CRAWL] = array();
             }
             $old_time = $time;
         }
     }
     $this->web_queue->closeUrlArchive($fh);
     if (count($schedule_data[self::TO_CRAWL]) > 0) {
         $data_string = webencode(gzcompress(serialize($schedule_data)));
         $data_hash = crawlHash($data_string);
         if ($for_reschedule) {
             $schedule_time = $time + $now;
         } else {
             $schedule_time = $time;
         }
         file_put_contents($dir . "/At" . $schedule_time . "From127-0-0-1" . $note_string . "WithHash{$data_hash}.txt", $data_string);
     }
     $this->db->setWorldPermissionsRecursive(CRAWL_DIR . '/cache/' . self::queue_base_name . $this->crawl_time);
     $this->db->setWorldPermissionsRecursive($dir);
 }
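A minimal usage sketch (not from the original source) follows; it assumes $queue_server is an already configured QueueServer-style object whose web_queue holds urls.
 // Illustrative sketch only; $queue_server is a hypothetical, fully
 // configured object exposing the method above.
 $queue_server->dumpQueueToSchedules(true);  // clogged queue: reschedule urls
 $queue_server->dumpQueueToSchedules();      // shutdown: also writes SHUTDOWN_QUEUE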
Example #2
 /**
  * Gets the next doc from the iterator
  * @param bool $no_process if true then just return page string found
  *     not any additional meta data.
  * @return mixed associative array for doc or just string of doc
  */
 function nextPage($no_process = false)
 {
     if (!$this->checkFileHandle()) {
         return NULL;
     }
     $matches = array();
     while (preg_match($this->delimiter, $this->buffer, $matches, PREG_OFFSET_CAPTURE) != 1) {
         crawlTimeoutLog("..still looking for a page in local buffer");
         $block = $this->getFileBlock();
         if (!$block || !$this->checkFileHandle() || $this->checkEof()) {
             return NULL;
         }
         $this->buffer .= $block;
     }
     $delim_len = strlen($matches[0][0]);
     $pos = $matches[0][1] + $delim_len;
     $page_pos = $this->start_delimiter == "" ? $pos : $pos - $delim_len;
     $page = substr($this->buffer, 0, $page_pos);
     if ($this->end_delimiter == "") {
         $page = $this->remainder . $page;
         $this->remainder = $matches[0][0];
     }
     $this->buffer = substr($this->buffer, $pos + $delim_len);
     if ($this->start_delimiter != "") {
         $matches = array();
         // trim off any content that appears before the start delimiter
         if (preg_match($this->start_delimiter, $this->buffer, $matches, PREG_OFFSET_CAPTURE) == 1) {
             if (isset($matches[0][1])) {
                 $page = substr($page, $matches[0][1]);
             }
         }
     }
     if ($no_process == true) {
         return $page;
     }
     $site = array();
     $site[self::HEADER] = "text_archive_bundle_iterator extractor";
     $site[self::IP_ADDRESSES] = array("0.0.0.0");
     $site[self::TIMESTAMP] = date("U", time());
     $site[self::TYPE] = "text/plain";
     $site[self::PAGE] = $page;
     $site[self::HASH] = FetchUrl::computePageHash($page);
     $site[self::URL] = "record:" . webencode($site[self::HASH]);
     $site[self::HTTP_CODE] = 200;
     $site[self::ENCODING] = $this->encoding;
     $site[self::SERVER] = "unknown";
     $site[self::SERVER_VERSION] = "unknown";
     $site[self::OPERATING_SYSTEM] = "unknown";
     $site[self::WEIGHT] = 1;
     return $site;
 }
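The loop below is an illustrative sketch of how this iterator might be driven; $archive_iterator is a placeholder for an object of the text archive bundle iterator class named in the HEADER field above.
 // Illustrative sketch only.
 while (($site = $archive_iterator->nextPage()) !== NULL) {
     // $site[self::PAGE] holds the page text, $site[self::HASH] its hash, etc.
     // Calling nextPage(true) instead returns just the raw page string.
 }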
Example #3
 /**
  * Sends the crawl, robot, and index data to the current queue server.
  * If this data is more than post_max_size, it splits it into chunks
  * which are then reassembled by the queue server web app before being
  * put into the appropriate schedule sub-directory.
  *
  * @param string $queue_server url of the current queue server
  * @param array $byte_counts has four fields: TOTAL, ROBOT, SCHEDULE,
  *     INDEX. These give the number of bytes overall for the
  *     'data' field of $post_data and for each of these components.
  * @param array $post_data data to be uploaded to the queue server web app
  */
 function uploadCrawlData($queue_server, $byte_counts, &$post_data)
 {
     $post_data['fetcher_peak_memory'] = memory_get_peak_usage();
     $post_data['byte_counts'] = webencode(serialize($byte_counts));
     $len = strlen($post_data['data']);
     $max_len = $this->post_max_size - 10 * 1024;
     // non-data post vars < 10K
     $post_data['num_parts'] = ceil($len / $max_len);
     $num_parts = $post_data['num_parts'];
     $data =& $post_data['data'];
     unset($post_data['data']);
     $post_data['hash_data'] = crawlHash($data);
     $offset = 0;
     for ($i = 1; $i <= $num_parts; $i++) {
         $time = time();
         $session = md5($time . AUTH_KEY);
         $post_data['time'] = $time;
         $post_data['session'] = $session;
         $post_data['part'] = substr($data, $offset, $max_len);
         $post_data['hash_part'] = crawlHash($post_data['part']);
         $post_data['current_part'] = $i;
         $offset += $max_len;
         $part_len = strlen($post_data['part']);
         crawlLog("Sending Queue Server Part {$i} of {$num_parts}...");
         crawlLog("...sending about {$part_len} bytes.");
         $sleep = false;
         do {
             if ($sleep == true) {
                 crawlLog("Trouble sending to the scheduler at url:");
                 crawlLog($queue_server);
                 crawlLog("Response was:");
                 crawlLog("{$info_string}");
                 $info = @unserialize($info_string);
                 $time = time();
                 $session = md5($time . AUTH_KEY);
                 $post_data['time'] = $time;
                 $post_data['session'] = $session;
                 if (isset($info[self::STATUS]) && $info[self::STATUS] == self::REDO_STATE) {
                     crawlLog("Server requested last item to be re-sent...");
                     if (isset($info[self::SUMMARY])) {
                         crawlLog($info[self::SUMMARY]);
                     }
                     crawlLog("Trying again in 5 seconds...");
                 } else {
                     crawlLog("Trying again in 5 seconds. You might want");
                     crawlLog("to check the queue server url and server");
                     crawlLog("key. Queue Server post_max_size is:" . $this->post_max_size);
                 }
                 if ($i == 1 && !defined('FORCE_SMALL') && $this->post_max_size > 1000000) {
                     /* maybe the server has limited memory
                        and too high a post_max_size */
                     crawlLog("Using smaller post size to see if it helps");
                     define('FORCE_SMALL', true);
                     $this->post_max_size = 1000000;
                     $info[self::POST_MAX_SIZE] = 1000001;
                     // set to a small value before trying again
                 }
                 sleep(5);
             }
             $sleep = true;
             $info_string = FetchUrl::getPage($queue_server, $post_data, true);
             $info = unserialize(trim($info_string));
             if (isset($info[self::LOGGING])) {
                 crawlLog("Messages from Fetch Controller:");
                 crawlLog($info[self::LOGGING]);
             }
             if (isset($info[self::POST_MAX_SIZE]) && $this->post_max_size > $info[self::POST_MAX_SIZE]) {
                 if (!defined('FORCE_SMALL')) {
                     crawlLog("post_max_size has changed was " . "{$this->post_max_size}. Now is " . $info[self::POST_MAX_SIZE] . ".");
                     $this->post_max_size = $info[self::POST_MAX_SIZE];
                 } else {
                     crawlLog("...Using Force Small Rule on Server Posting");
                 }
                 if ($max_len > $this->post_max_size) {
                     crawlLog("Restarting upload...");
                     if (isset($post_data["resized_once"])) {
                         crawlLog("Restart failed");
                         return;
                     }
                     $post_data['data'] = $data;
                     $post_data["resized_once"] = true;
                     return $this->uploadCrawlData($queue_server, $byte_counts, $post_data);
                 }
             }
         } while (!isset($info[self::STATUS]) || $info[self::STATUS] != self::CONTINUE_STATE);
         crawlLog("Queue Server info response code: " . $info[self::STATUS]);
         crawlLog("Queue Server's crawl time is: " . $info[self::CRAWL_TIME]);
         crawlLog("Web Server peak memory usage: " . $info[self::MEMORY_USAGE]);
         crawlLog("This fetcher peak memory usage: " . memory_get_peak_usage());
     }
     crawlLog("Updated Queue Server, sent approximately" . " {$byte_counts['TOTAL']} bytes:");
 }
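As a rough sketch (the length variables are placeholders, not from the original source), a fetcher that has already built $post_data['data'] might call the method from inside its own class like this; the byte-count keys follow the docblock above.
 // Illustrative sketch only; the individual lengths are assumed to have
 // been computed while assembling $post_data['data'].
 $byte_counts = array(
     'TOTAL' => strlen($post_data['data']),
     'ROBOT' => $robot_bytes,
     'SCHEDULE' => $schedule_bytes,
     'INDEX' => $index_bytes,
 );
 $this->uploadCrawlData($queue_server, $byte_counts, $post_data);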
Example #4
 /**
  * Checks to see whether there are more pages to extract from the current
  * archive, and if so returns the next batch to the requesting fetcher. The
  * iteration progress is automatically saved on each call to nextPages, so
  * that the next fetcher will get the next batch of pages. If there is no
  * current archive to iterate over, or the iterator has reached the end of
  * the archive, then the status is set to NO_DATA_STATE to indicate that
  * there is no more data.
  */
 function archiveSchedule()
 {
     $view = "fetch";
     $request_start = time();
     if (isset($_REQUEST['crawl_time'])) {
         $crawl_time = substr($this->clean($_REQUEST['crawl_time'], 'int'), 0, TIMESTAMP_LEN);
     } else {
         $crawl_time = 0;
     }
     $messages_filename = CRAWL_DIR . '/schedules/name_server_messages.txt';
     $lock_filename = WORK_DIRECTORY . "/schedules/name_server_lock.txt";
     if ($crawl_time > 0 && file_exists($messages_filename)) {
         $fetch_pages = true;
         $info = unserialize(file_get_contents($messages_filename));
         if ($info[self::STATUS] == 'STOP_CRAWL') {
             /* The stop crawl message gets created by the admin_controller
                when the "stop crawl" button is pressed.*/
             if (file_exists($messages_filename)) {
                 unlink($messages_filename);
             }
             if (file_exists($lock_filename)) {
                 unlink($lock_filename);
             }
             $fetch_pages = false;
             $info = array();
         }
         $this->checkRestart(self::ARCHIVE_CRAWL);
     } else {
         $fetch_pages = false;
         $info = array();
     }
     $pages = array();
     $got_lock = true;
     if (file_exists($lock_filename)) {
         $lock_time = unserialize(file_get_contents($lock_filename));
         if ($request_start - $lock_time < ini_get('max_execution_time')) {
             $got_lock = false;
         }
     }
     $chunk = false;
     $archive_iterator = NULL;
     if ($fetch_pages && $got_lock) {
         file_put_contents($lock_filename, serialize($request_start));
         if ($info[self::ARC_DIR] == "MIX" || file_exists($info[self::ARC_DIR])) {
             $iterate_timestamp = $info[self::CRAWL_INDEX];
             $result_timestamp = $crawl_time;
             $result_dir = WORK_DIRECTORY . "/schedules/" . self::name_archive_iterator . $crawl_time;
             $arctype = $info[self::ARC_TYPE];
             $iterator_name = $arctype . "Iterator";
             if (!class_exists($iterator_name)) {
                 $info['ARCHIVE_BUNDLE_ERROR'] = "Invalid bundle iterator: '{$iterator_name}'";
             } else {
                 if ($info[self::ARC_DIR] == "MIX") {
                     //recrawl of crawl mix case
                     $archive_iterator = new $iterator_name($iterate_timestamp, $result_timestamp);
                 } else {
                     //any other archive crawl except web archive recrawls
                     $archive_iterator = new $iterator_name($iterate_timestamp, $info[self::ARC_DIR], $result_timestamp, $result_dir);
                 }
             }
         }
         $pages = false;
         if ($archive_iterator && !$archive_iterator->end_of_iterator) {
             if (generalIsA($archive_iterator, "TextArchiveBundleIterator")) {
                 $pages = $archive_iterator->nextChunk();
                 $chunk = true;
             } else {
                 $pages = $archive_iterator->nextPages(ARCHIVE_BATCH_SIZE);
             }
         }
         if (file_exists($lock_filename)) {
             unlink($lock_filename);
         }
     }
     if ($archive_iterator && $archive_iterator->end_of_iterator) {
         $info[self::END_ITERATOR] = true;
     }
     if (($chunk && $pages) || !empty($pages)) {
         $pages_string = webencode(gzcompress(serialize($pages)));
     } else {
         $info[self::STATUS] = self::NO_DATA_STATE;
         $info[self::POST_MAX_SIZE] = metricToInt(ini_get("post_max_size"));
         $pages = array();
         $pages_string = webencode(gzcompress(serialize($pages)));
     }
     $info[self::DATA] = $pages_string;
     $info_string = serialize($info);
     $data['MESSAGE'] = $info_string;
     $this->displayView($view, $data);
 }
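Since the batch is packed with webencode(gzcompress(serialize(...))), the fetcher receiving this view presumably reverses those steps; a sketch under that assumption, with illustrative variable names (self::DATA etc. refer to the same crawl constants used above):
 // Illustrative sketch only: unpack the MESSAGE emitted by archiveSchedule().
 $info = unserialize($message_string);
 $pages = unserialize(gzuncompress(webdecode($info[self::DATA])));
 if (isset($info[self::STATUS]) && $info[self::STATUS] == self::NO_DATA_STATE) {
     // nothing to process this round
 }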
Example #5
 /**
  * Stores in the cache a key-value pair
  *
  * Only when a key is set is there a check for whether to invalidate
  * a cache bin. A bin is deleted as invalid if it was last expired more
  * than MIN_QUERY_CACHE_TIME seconds ago and holds more than
  * self::MAX_FILES_IN_A_BIN items, or if it holds more than
  * 10 * self::MAX_FILES_IN_A_BIN items.
  *
  * @param string $key to associate with value
  * @param mixed $value to store
  */
 function set($key, $value)
 {
     $checksum_block = $this->checksum($key);
     $checksum_dir = $this->dir_name . "/{$checksum_block}";
     if (file_exists("{$checksum_dir}/last_expired.txt")) {
         $data = unserialize(file_get_contents("{$checksum_dir}/last_expired.txt"));
     }
     if (!isset($data['last_expired'])) {
         $data = array('last_expired' => time(), 'count' => 0);
     }
     if ((time() - $data['last_expired'] > MIN_QUERY_CACHE_TIME && $data['count'] > self::MAX_FILES_IN_A_BIN) || $data['count'] > 10 * self::MAX_FILES_IN_A_BIN) {
         $db_class = ucfirst(DBMS) . "Manager";
         $db = new $db_class();
         $db->unlinkRecursive($checksum_dir);
     }
     $data['count']++;
     if (!file_exists($checksum_dir)) {
         mkdir($checksum_dir);
         $data['last_expired'] = time();
     }
     file_put_contents("{$checksum_dir}/last_expired.txt", serialize($data));
     $cache_file = "{$checksum_dir}/" . webencode($key);
     file_put_contents($cache_file, serialize($value));
 }
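A short, illustrative use of the cache; the key and value are made up, and $cache stands for an existing instance of the class this method belongs to.
 // Illustrative sketch only.
 $cache->set("query:recent news", $result_rows);
 // A get() counterpart (not shown here) would read back
 // "{$checksum_dir}/" . webencode($key) and unserialize its contents.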
Example #6
 /**
  * Gets the next at most $num many docs from the iterator. It might return
  * less than $num many documents if the partition changes or the end of the
  * bundle is reached.
  *
  * @param int $num number of docs to get
  * @param bool $no_process do not do any processing on page data
  * @return array associative arrays for $num pages
  */
 function nextPages($num, $no_process = false)
 {
     $pages = array();
     $page_count = 0;
     $db = $this->db;
     $query = "{$this->sql} " . $db->limitOffset($this->limit, $num);
     $result = $db->execute($query);
     $i = 0;
     while ($row = $db->fetchArray($result)) {
         crawlTimeoutLog("..Still getting pages from archive iterator. At %s" . " of %s", $i, $num);
         $page = "";
         foreach ($row as $key => $value) {
             $page .= "{$key}{$this->field_value_separator}" . "{$value}{$this->column_separator}";
         }
         if ($no_process) {
             $pages[] = $page;
         } else {
             $site = array();
             $site[self::HEADER] = "database_bundle_iterator extractor";
             $site[self::IP_ADDRESSES] = array("0.0.0.0");
             $site[self::TIMESTAMP] = date("U", time());
             $site[self::TYPE] = "text/plain";
             $site[self::PAGE] = $page;
             $site[self::HASH] = FetchUrl::computePageHash($page);
             $site[self::URL] = "record:" . webencode($site[self::HASH]);
             $site[self::HTTP_CODE] = 200;
             $site[self::ENCODING] = $this->encoding;
             $site[self::SERVER] = "unknown";
             $site[self::SERVER_VERSION] = "unknown";
             $site[self::OPERATING_SYSTEM] = "unknown";
             $site[self::WEIGHT] = 1;
             $pages[] = $site;
         }
         $page_count++;
         $i++;
     }
     $this->limit += $page_count;
     if ($page_count < $num) {
         $this->end_of_iterator = true;
     }
     $this->saveCheckpoint();
     return $pages;
 }
Example #7
 /**
  * This method is invoked by other ParallelModel (@see CrawlModel
  * for examples) methods when they want to have their method performed
  * on an array of other  Yioop instances. The results returned can then
  * be aggregated. The invocation sequence is:
  * crawlModelMethodA invokes execMachines with a list of
  * urls of other Yioop instances. execMachines makes REST requests to
  * those instances with the given command and optional arguments.
  * Each request is handled by a CrawlController, which in turn
  * calls crawlModelMethodA on that Yioop instance, serializes the
  * result, and gives it back to execMachines and then back to the
  * originally calling function.
  *
  * @param string $command the ParallelModel method to invoke on the remote
  *     Yioop instances
  * @param array $machine_urls machines to invoke this command on
  * @param string $arg additional arguments to be passed to the remote
  *      machine
  * @param int $num_machines the number of machines to use when computing
  *     the partition; if 0, the count of $machine_urls is used
  * @return array a list of outputs from each machine that was called.
  */
 function execMachines($command, $machine_urls, $arg = NULL, $num_machines = 0)
 {
     if ($num_machines == 0) {
         $num_machines = count($machine_urls);
     }
     $time = time();
     $session = md5($time . AUTH_KEY);
     $query = "c=crawl&a={$command}&time={$time}&session={$session}" . "&num={$num_machines}";
     if ($arg != NULL) {
         $arg = webencode($arg);
         $query .= "&arg={$arg}";
     }
     $sites = array();
     $post_data = array();
     $i = 0;
     foreach ($machine_urls as $index => $machine_url) {
         $sites[$i][CrawlConstants::URL] = $machine_url;
         $post_data[$i] = $query . "&i={$index}";
         $i++;
     }
     $outputs = array();
     if (count($sites) > 0) {
         $outputs = FetchUrl::getPages($sites, false, 0, NULL, self::URL, self::PAGE, true, $post_data);
     }
     return $outputs;
 }
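A sketch of the calling pattern described in the docblock: a ParallelModel method fans its work out and then unserializes each machine's PAGE field. Apart from execMachines itself, the method name, argument variable, and decoding step are placeholders.
 // Illustrative sketch only, from inside a hypothetical crawlModelMethodA().
 $outputs = $this->execMachines("crawlModelMethodA", $machine_urls,
     serialize($lookup_args));
 $results = array();
 foreach ($outputs as $output) {
     if (isset($output[self::PAGE])) {
         // the remote controller serialized its answer (see docblock)
         $results[] = @unserialize($output[self::PAGE]);
     }
 }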
Example #8
 /**
  * Add the provided urls to the schedule directory of URLs that will
  * be crawled
  *
  * @param string $timestamp Unix timestamp of crawl to add to schedule of
  * @param array $inject_urls urls to be added to the schedule of
  *     the active crawl
  * @param array $machine_urls an array of urls of yioop queue servers
  * @return bool true if urls were added to the schedule; false if
  *     $inject_urls was empty (nothing is returned when the call is
  *     forwarded to remote queue servers)
  */
 function injectUrlsCurrentCrawl($timestamp, $inject_urls, $machine_urls = NULL)
 {
     if ($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls, $timestamp)) {
         $this->execMachines("injectUrlsCurrentCrawl", $machine_urls, serialize(array($timestamp, $inject_urls)));
         return;
     }
     $dir = CRAWL_DIR . "/schedules/" . self::schedule_data_base_name . $timestamp;
     if (!file_exists($dir)) {
         mkdir($dir);
         chmod($dir, 0777);
     }
     $day = floor($timestamp / ONE_DAY) - 1;
     /* want this day value to come before all other schedules,
        so it is executed next */
     $dir .= "/{$day}";
     if (!file_exists($dir)) {
         mkdir($dir);
         chmod($dir, 0777);
     }
     $count = count($inject_urls);
     if ($count > 0) {
         $now = time();
         $schedule_data = array();
         $schedule_data[self::SCHEDULE_TIME] = $timestamp;
         $schedule_data[self::TO_CRAWL] = array();
         for ($i = 0; $i < $count; $i++) {
             $url = $inject_urls[$i];
             $hash = crawlHash($now . $url);
             $schedule_data[self::TO_CRAWL][] = array($url, 1, $hash);
         }
         $data_string = webencode(gzcompress(serialize($schedule_data)));
         $data_hash = crawlHash($data_string);
         file_put_contents($dir . "/At1From127-0-0-1" . "WithHash{$data_hash}.txt", $data_string);
         return true;
     }
     return false;
 }
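For illustration (the urls and variable names are invented), a local call might look like the sketch below; when $machine_urls is given and the crawl is not on a single localhost, the work is instead forwarded via execMachines and nothing is returned.
 // Illustrative sketch only; $crawl_model is an instance of the model class
 // defining the method above.
 $added = $crawl_model->injectUrlsCurrentCrawl($crawl_timestamp,
     array("https://www.example.com/", "https://www.example.org/docs"));
 if ($added) {
     // the urls were written to a schedule file that sorts before all others
 }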
Example #9
 /**
  * Creates a fresh array from an existing page summary array, and augments
  * it with extra data relevant to the labeling interface on the client.
  *
  * @param array $page original page summary array
  * @param float $score classification score (estimated by the Naive Bayes
  *     text classification algorithm) for $page
  * @param float $disagreement disagreement score computed for $page
  * @param int $crawl_time index the page came from
  * @param string $keywords query supplied to the crawl mix used to find
  *     $page
  * @return array reduced page summary structure containing only the
  *     information that the client needs to display a summary of the page
  */
 function prepareUnlabelledDocument($page, $score, $disagreement, $crawl_time, $keywords)
 {
     $phrase_model = $this->model("phrase");
     // Highlight the query keywords, if any.
     $disjunct_phrases = explode("|", $keywords);
     $words = array();
     foreach ($disjunct_phrases as $disjunct_phrase) {
         list($word_struct, $format_words) = $phrase_model->parseWordStructConjunctiveQuery($disjunct_phrase);
         $words = array_merge($words, $format_words);
     }
     $title = $phrase_model->boldKeywords($page[self::TITLE], $words);
     $description = $phrase_model->getSnippets(strip_tags($page[self::DESCRIPTION]), $words, 400);
     $description = $phrase_model->boldKeywords($description, $words);
     $cache_link = "?c=search&amp;a=cache" . "&amp;q=" . urlencode($keywords) . "&amp;arg=" . urlencode($page[self::URL]) . "&amp;its=" . $crawl_time;
     /*
       Note that the confidence is a transformation of the score that
       converts it into a value between 0 and 1, where it's 0 if the score
       was exactly 0.5, and increases toward 1 as the score either
       increases toward 1 or decreases toward 0.
     */
     return array(
         'title' => $title,
         'url' => $page[self::URL],
         'key' => webencode(Classifier::makeKey($page)),
         'cache_link' => $cache_link,
         'description' => $description,
         'score' => $score,
         'positive' => $score >= 0.5 ? 1 : 0,
         'confidence' => abs($score - 0.5) / 0.5,
         'disagreement' => $disagreement
     );
 }
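To make the comment above concrete, here is the confidence transform applied to a few sample scores; this is a sketch, not part of the original code.
 // Illustrative sketch only.
 // $score = 0.50  =>  confidence 0.0  (classifier is unsure)
 // $score = 0.75  =>  confidence 0.5
 // $score = 0.05  =>  confidence 0.9  (confidently negative)
 $confidence = abs($score - 0.5) / 0.5;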
Example #10
 /**
  * Handles a request to get the array of news feed sources assigned to the
  * requesting machine, i.e., those sources whose name hashes to the index
  * of the requesting machine's hashed url/name in the array of available
  * machine hashes.
  */
 function getNewsSources()
 {
     if (!isset($_REQUEST["arg"])) {
         return;
     }
     $source_model = $this->model("source");
     $current_machine = $this->clean(webdecode($_REQUEST["arg"]), "string");
     $machine_hashes = $source_model->getMachineHashUrls();
     $machine_index_match = array_search($current_machine, $machine_hashes);
     if ($machine_index_match === false) {
         echo webencode(serialize(array()));
         return;
     }
     $num_machines = count($machine_hashes);
     $pre_feeds = $source_model->getMediaSources("rss");
     $pre_feeds = array_merge($pre_feeds, $source_model->getMediaSources("html"));
     if (!$pre_feeds) {
         return false;
     }
     $feeds = array();
     foreach ($pre_feeds as $pre_feed) {
         if (!isset($pre_feed['NAME'])) {
             continue;
         }
         $hash_int = unpack("N", crawlHash($pre_feed['NAME']));
         if (!isset($hash_int[1])) {
             continue;
         }
         $hash_index = $hash_int[1] % $num_machines;
         if ($machine_index_match != $hash_index) {
             continue;
         }
         if ($pre_feed['TYPE'] == 'html') {
             list($pre_feed['CHANNEL_PATH'], $pre_feed['ITEM_PATH'], $pre_feed['TITLE_PATH'], $pre_feed['DESCRIPTION_PATH'], $pre_feed['LINK_PATH']) = explode("###", html_entity_decode($pre_feed['AUX_INFO']));
         }
         $feeds[] = $pre_feed;
     }
     echo webencode(serialize($feeds));
 }
Example #11
 /**
  * Given a list of class labels, returns an array mapping each class label
  * to an array of data necessary for initializing a classifier for that
  * label. This static method is used to prepare a collection of classifiers
  * for distribution to fetchers, so that each fetcher can classify pages as
  * it downloads them. The only extra properties passed along in addition to
  * the base classification data are the final features and final algorithm,
  * both necessary for classifying new documents.
  *
  * @param array $labels flat array of class labels for which to load data
  * @return array associative array mapping class labels to arrays of data
  *     necessary for initializing the associated classifier
  */
 static function loadClassifiersData($labels)
 {
     $fields = array('classifier', 'final_features', 'final_algorithm');
     $classifiers_data = array();
     foreach ($labels as $label) {
         $basedir = WORK_DIRECTORY . "/classifiers/{$label}";
         $classifier_data = array();
         foreach ($fields as $field) {
             $filename = "{$basedir}/{$field}.txt";
             if (file_exists($filename)) {
                 /*
                   The data is web-encoded because it will be sent in an
                   HTTP response to each fetcher as it prepares for a new
                   crawl.
                 */
                 $classifier_data[$field] = webencode(file_get_contents($filename));
             } else {
                 $classifier_data = false;
                 break;
             }
         }
         $classifiers_data[$label] = $classifier_data;
     }
     return $classifiers_data;
 }
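A usage sketch, assuming this static method lives on the Classifier class referenced in Example #9 and that the label directories exist under WORK_DIRECTORY/classifiers; the label names are made up.
 // Illustrative sketch only.
 $classifiers_data = Classifier::loadClassifiersData(array("spam", "recipes"));
 foreach ($classifiers_data as $label => $data) {
     if ($data === false) {
         continue; // one of the label's data files was missing
     }
     // $data['classifier'], $data['final_features'], $data['final_algorithm']
     // hold webencoded file contents, ready to be sent to fetchers.
 }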