Code Example #1
File: queue_server.php Project: yakar/yioop
 /**
  * Main runtime loop of the queue_server.
  *
  * Loops until a stop message is received; checks for start, stop, and
  * resume crawl messages; deletes any WebQueueBundle for which an
  * IndexArchiveBundle does not exist; and processes incoming crawl data.
  */
 function loop()
 {
     $info[self::STATUS] = self::WAITING_START_MESSAGE_STATE;
     crawlLog("In queue loop!! {$this->server_name}", "queue_server");
     if ($this->isAIndexer()) {
         $this->deleteOrphanedBundles();
     }
     while (CrawlDaemon::processHandler()) {
         crawlLog("{$this->server_name} peak memory usage so far: " . memory_get_peak_usage() . "!!");
         $info = $this->handleAdminMessages($info);
         if ($info[self::STATUS] == self::WAITING_START_MESSAGE_STATE) {
             crawlLog("{$this->server_name} is waiting for start message\n");
             sleep(QUEUE_SLEEP_TIME);
             continue;
         }
         if ($info[self::STATUS] == self::STOP_STATE) {
             continue;
         }
         crawlLog("{$this->server_name} active crawl is " . "{$this->crawl_time}.");
         if ($this->isAScheduler()) {
             crawlLog("Current queue size is:" . $this->web_queue->to_crawl_queue->count);
         }
         $start_loop_time = time();
         //check and update if necessary the crawl params of current crawl
         $this->checkUpdateCrawlParameters();
         $this->updateMostRecentFetcher();
         $this->processCrawlData();
         $time_diff = time() - $start_loop_time;
         if ($time_diff < QUEUE_SLEEP_TIME) {
             crawlLog("Sleeping...");
             sleep(QUEUE_SLEEP_TIME - $time_diff);
         }
     }
     crawlLog("{$this->server_name} shutting down!!");
 }
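The tail of this loop paces the daemon: each pass measures how long its work took and sleeps off whatever remains of QUEUE_SLEEP_TIME, so the queue_server never spins faster than one pass per sleep period. A minimal sketch of that pacing pattern as a standalone helper (runPaced() and its callable argument are illustrative names, not part of the project):
 // Illustrative sketch of the pacing pattern at the end of loop() above: do
 // one unit of work, then sleep for whatever is left of the target period.
 // runPaced() is a hypothetical helper, not a function from the yioop project.
 function runPaced($period_in_seconds, callable $work)
 {
     while (true) {
         $start = time();
         $work();
         $elapsed = time() - $start;
         if ($elapsed < $period_in_seconds) {
             sleep($period_in_seconds - $elapsed);
         }
     }
 }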
Code Example #2
File: mirror.php Project: yakar/yioop
 /**
  * Main loop for the mirror script.
  *
  */
 function loop()
 {
     crawlLog("In Sync Loop");
     $info[self::STATUS] = self::CONTINUE_STATE;
     while (CrawlDaemon::processHandler()) {
         $syncer_message_file = CRAWL_DIR . "/schedules/mirror_messages.txt";
         if (file_exists($syncer_message_file)) {
             $info = unserialize(file_get_contents($syncer_message_file));
             unlink($syncer_message_file);
             if (isset($info[self::STATUS]) && $info[self::STATUS] == self::STOP_STATE) {
                 continue;
             }
         }
         $info = $this->checkScheduler();
         if ($info === false) {
             crawlLog("Cannot connect to queue server..." . " will try again in " . MIRROR_NOTIFY_FREQUENCY . " seconds.");
             sleep(MIRROR_NOTIFY_FREQUENCY);
             continue;
         }
         if ($info[self::STATUS] == self::NO_DATA_STATE) {
             crawlLog("No data from queue server. Sleeping...");
             sleep(MIRROR_SYNC_FREQUENCY);
             continue;
         }
         $this->copyNextSyncFile();
     }
     //end while
     crawlLog("Mirror shutting down!!");
 }
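The first half of this loop implements a one-shot message file: if mirror_messages.txt exists, its serialized contents become $info and the file is deleted so the same message is never processed twice. The fetcher below uses the same pattern. A minimal sketch of it as a reusable helper (consumeMessageFile() is an assumed name, not a project function):
 // Illustrative helper matching the message-file handling in loop() above:
 // read the serialized array if the file exists, then unlink it so the
 // message is consumed exactly once. consumeMessageFile() is hypothetical.
 function consumeMessageFile($path)
 {
     if (!file_exists($path)) {
         return false;
     }
     $info = unserialize(file_get_contents($path));
     unlink($path);
     return $info;
 }
With such a helper, the block at the top of the loop would reduce to a single call followed by the STOP_STATE check.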
Code Example #3
File: fetcher.php Project: yakar/yioop
 /**
  * Main loop for the fetcher.
  *
  * Checks for a stop message, and checks with the queue server whether the
  * crawl has changed and whether there are new pages to crawl. Each pass
  * gets a group of next pages to crawl if pages are left (otherwise it
  * sleeps for five seconds). It downloads these pages, deduplicates them,
  * and updates the found-site info with the result before looping again.
  */
 function loop()
 {
     crawlLog("In Fetch Loop");
     $prefix = $this->fetcher_num . "-";
     if (!file_exists(CRAWL_DIR . "/{$prefix}temp")) {
         mkdir(CRAWL_DIR . "/{$prefix}temp");
     }
     $info[self::STATUS] = self::CONTINUE_STATE;
     $local_archives = array("");
     while (CrawlDaemon::processHandler()) {
         $start_time = microtime();
         $fetcher_message_file = CRAWL_DIR . "/schedules/{$prefix}fetcher_messages.txt";
         if (file_exists($fetcher_message_file)) {
             $info = unserialize(file_get_contents($fetcher_message_file));
             unlink($fetcher_message_file);
             if (isset($info[self::STATUS]) && $info[self::STATUS] == self::STOP_STATE) {
                 continue;
             }
         }
         $switch_fetch_or_no_current = $this->checkCrawlTime();
         if ($switch_fetch_or_no_current) {
             /* case(1) */
             crawlLog("MAIN LOOP CASE 1 --" . " SWITCH CRAWL OR NO CURRENT CRAWL");
             $info[self::CRAWL_TIME] = $this->crawl_time;
             if ($info[self::CRAWL_TIME] == 0) {
                 $info[self::STATUS] = self::NO_DATA_STATE;
                 $this->to_crawl = array();
             }
         } else {
             if ($this->crawl_type == self::ARCHIVE_CRAWL && $this->arc_type != "WebArchiveBundle" && $this->arc_type != "") {
                 /* case(2) */
                 // An archive crawl with data coming from the name server.
                 crawlLog("MAIN LOOP CASE 2 -- ARCHIVE SCHEDULER (NOT RECRAWL)");
                 $info = $this->checkArchiveScheduler();
                 if ($info === false) {
                     crawlLog("No Archive Schedule Data..." . " will try again in " . FETCH_SLEEP_TIME . " seconds.");
                     sleep(FETCH_SLEEP_TIME);
                     continue;
                 }
             } else {
                 if ($this->crawl_time > 0) {
                     /* case(3) */
                     // Either a web crawl or a recrawl of a previous web crawl.
                     if ($this->crawl_type == self::ARCHIVE_CRAWL) {
                         crawlLog("MAIN LOOP CASE 3 -- RECRAWL SCHEDULER");
                     } else {
                         crawlLog("MAIN LOOP CASE 4 -- WEB SCHEDULER");
                     }
                     $info = $this->checkScheduler();
                     if ($info === false) {
                         crawlLog("Cannot connect to name server..." . " will try again in " . FETCH_SLEEP_TIME . " seconds.");
                         sleep(FETCH_SLEEP_TIME);
                         continue;
                     }
                 } else {
                     crawlLog("MAIN LOOP CASE 5 -- NO CURRENT CRAWL");
                     $info[self::STATUS] = self::NO_DATA_STATE;
                 }
             }
         }
          /* case(2) and case(3) might have set $info without
             $info[self::STATUS] being set */
         if (!isset($info[self::STATUS])) {
             if ($info === true) {
                 $info = array();
             }
             $info[self::STATUS] = self::CONTINUE_STATE;
         }
         if ($info[self::STATUS] == self::NO_DATA_STATE) {
             crawlLog("No data. Sleeping...");
             sleep(FETCH_SLEEP_TIME);
             continue;
         }
         $tmp_base_name = isset($info[self::CRAWL_TIME]) ? CRAWL_DIR . "/cache/{$prefix}" . self::archive_base_name . $info[self::CRAWL_TIME] : "";
         if (isset($info[self::CRAWL_TIME]) && ($this->web_archive == NULL || $this->web_archive->dir_name != $tmp_base_name)) {
             if (isset($this->web_archive->dir_name)) {
                 crawlLog("Old name: " . $this->web_archive->dir_name);
             }
             if (is_object($this->web_archive)) {
                 $this->web_archive = NULL;
             }
             $this->to_crawl_again = array();
             $this->found_sites = array();
             gc_collect_cycles();
             $this->web_archive = new WebArchiveBundle($tmp_base_name, false);
             $this->crawl_time = $info[self::CRAWL_TIME];
             $this->sum_seen_title_length = 0;
             $this->sum_seen_description_length = 0;
             $this->sum_seen_site_link_length = 0;
             $this->num_seen_sites = 0;
             crawlLog("New name: " . $this->web_archive->dir_name);
             crawlLog("Switching archive...");
             if (!isset($info[self::ARC_DATA])) {
                 continue;
             }
         }
         switch ($this->crawl_type) {
             case self::WEB_CRAWL:
                 $downloaded_pages = $this->downloadPagesWebCrawl();
                 break;
             case self::ARCHIVE_CRAWL:
                 if (isset($info[self::ARC_DATA])) {
                     $downloaded_pages = $info[self::ARC_DATA];
                 } else {
                     $downloaded_pages = $this->downloadPagesArchiveCrawl();
                 }
                 break;
         }
         if (isset($downloaded_pages["NO_PROCESS"])) {
             unset($downloaded_pages["NO_PROCESS"]);
             $summarized_site_pages = array_values($downloaded_pages);
             $this->no_process_links = true;
         } else {
             $summarized_site_pages = $this->processFetchPages($downloaded_pages);
             $this->no_process_links = false;
         }
         crawlLog("Number of summarized pages " . count($summarized_site_pages));
         $force_send = isset($info[self::END_ITERATOR]) && $info[self::END_ITERATOR] ? true : false;
         $this->updateFoundSites($summarized_site_pages, $force_send);
         $sleep_time = max(0, ceil(MINIMUM_FETCH_LOOP_TIME - changeInMicrotime($start_time)));
         if ($sleep_time > 0) {
             crawlLog("Ensure minimum loop time by sleeping..." . $sleep_time);
             sleep($sleep_time);
         }
     }
     //end while
     crawlLog("Fetcher shutting down!!");
 }
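This loop times each pass with microtime() and changeInMicrotime() so that an iteration never finishes in less than MINIMUM_FETCH_LOOP_TIME. changeInMicrotime() itself is not shown in these examples; an assumed implementation, written for the "microseconds seconds" string that microtime() returns when called without arguments (the project's real version may differ), looks like:
 // Assumed sketch of a changeInMicrotime()-style helper. microtime() with no
 // argument returns a "microseconds seconds" string, so both parts are
 // combined before taking the difference in seconds.
 function changeInMicrotime($start, $end = null)
 {
     if ($end === null) {
         $end = microtime();
     }
     list($start_micro, $start_sec) = explode(" ", $start);
     list($end_micro, $end_sec) = explode(" ", $end);
     return ((float) $end_sec - (float) $start_sec)
         + ((float) $end_micro - (float) $start_micro);
 }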
Code Example #4
File: news_updater.php Project: yakar/yioop
 /**
  * Main loop for the news updater.
  */
 function loop()
 {
     crawlLog("In News Update Loop");
     $info[self::STATUS] = self::CONTINUE_STATE;
     $local_archives = array("");
     while (CrawlDaemon::processHandler()) {
         $start_time = microtime();
         crawlLog("Checking if news feeds should be updated...");
         $this->newsUpdate();
         $sleep_time = max(0, ceil(MINIMUM_UPDATE_LOOP_TIME - changeInMicrotime($start_time)));
         if ($sleep_time > 0) {
             crawlLog("Ensure minimum loop time by sleeping..." . $sleep_time);
             sleep($sleep_time);
         }
     }
     //end while
     crawlLog("News Updater shutting down!!");
 }