/**
 * Main runtime loop of the queue_server.
 *
 * Loops until a stop message is received; checks for start, stop, and resume
 * crawl messages; deletes any WebQueueBundle for which an
 * IndexArchiveBundle does not exist; and processes crawl data for the
 * active crawl via processCrawlData(). Each iteration is padded with
 * sleep() so it lasts at least QUEUE_SLEEP_TIME seconds.
 */
function loop()
{
    // Start in the waiting state until handleAdminMessages() reports a crawl.
    $info[self::STATUS] = self::WAITING_START_MESSAGE_STATE;
    crawlLog("In queue loop!! {$this->server_name}", "queue_server");
    if ($this->isAIndexer()) {
        // Clean up bundles whose index archive is gone (indexer role only).
        $this->deleteOrphanedBundles();
    }
    // processHandler() returns false when the daemon has been told to stop.
    while (CrawlDaemon::processHandler()) {
        crawlLog("{$this->server_name} peak memory usage so far: " .
            memory_get_peak_usage() . "!!");
        // Pick up any admin start/stop/resume messages and update state.
        $info = $this->handleAdminMessages($info);
        if ($info[self::STATUS] == self::WAITING_START_MESSAGE_STATE) {
            // No crawl yet -- idle until a start message arrives.
            crawlLog("{$this->server_name} is waiting for start message\n");
            sleep(QUEUE_SLEEP_TIME);
            continue;
        }
        if ($info[self::STATUS] == self::STOP_STATE) {
            // Crawl stopped; skip processing but keep polling for messages.
            continue;
        }
        crawlLog("{$this->server_name} active crawl is " .
            "{$this->crawl_time}.");
        if ($this->isAScheduler()) {
            crawlLog("Current queue size is:" .
                $this->web_queue->to_crawl_queue->count);
        }
        $start_loop_time = time();
        //check and update if necessary the crawl params of current crawl
        $this->checkUpdateCrawlParameters();
        $this->updateMostRecentFetcher();
        $this->processCrawlData();
        // Pad the iteration so each pass takes at least QUEUE_SLEEP_TIME.
        $time_diff = time() - $start_loop_time;
        if ($time_diff < QUEUE_SLEEP_TIME) {
            crawlLog("Sleeping...");
            sleep(QUEUE_SLEEP_TIME - $time_diff);
        }
    }
    crawlLog("{$this->server_name} shutting down!!");
}
/**
 * Main loop for the mirror script.
 *
 * Runs until the daemon process handler signals a stop. Each pass first
 * consumes any on-disk mirror message (honoring a stop request), then asks
 * the queue server for work: on connection failure it sleeps
 * MIRROR_NOTIFY_FREQUENCY seconds, on no data it sleeps
 * MIRROR_SYNC_FREQUENCY seconds, otherwise it copies the next sync file.
 */
function loop()
{
    crawlLog("In Sync Loop");
    $info[self::STATUS] = self::CONTINUE_STATE;
    // Location where an external process drops control messages for us.
    $message_path = CRAWL_DIR . "/schedules/mirror_messages.txt";
    while (CrawlDaemon::processHandler()) {
        if (file_exists($message_path)) {
            // NOTE(review): unserialize() of a local message file --
            // assumes only trusted processes write here; verify.
            $info = unserialize(file_get_contents($message_path));
            unlink($message_path);
            if (isset($info[self::STATUS]) &&
                $info[self::STATUS] == self::STOP_STATE) {
                continue;
            }
        }
        $info = $this->checkScheduler();
        if (false === $info) {
            crawlLog("Cannot connect to queue server..." .
                " will try again in " . MIRROR_NOTIFY_FREQUENCY .
                " seconds.");
            sleep(MIRROR_NOTIFY_FREQUENCY);
        } else if ($info[self::STATUS] == self::NO_DATA_STATE) {
            crawlLog("No data from queue server. Sleeping...");
            sleep(MIRROR_SYNC_FREQUENCY);
        } else {
            // Work available: pull over the next file to keep in sync.
            $this->copyNextSyncFile();
        }
    } //end while
    crawlLog("Mirror shutting down!!");
}
/**
 * Main loop for the fetcher.
 *
 * Checks for stop message, checks queue server if crawl has changed and
 * for new pages to crawl. Loop gets a group of next pages to crawl if
 * there are pages left to crawl (otherwise sleep 5 seconds). It downloads
 * these pages, deduplicates them, and updates the found site info with the
 * result before looping again.
 */
function loop()
{
    crawlLog("In Fetch Loop");
    // Per-fetcher prefix so multiple fetchers can share CRAWL_DIR.
    $prefix = $this->fetcher_num . "-";
    if (!file_exists(CRAWL_DIR . "/{$prefix}temp")) {
        mkdir(CRAWL_DIR . "/{$prefix}temp");
    }
    $info[self::STATUS] = self::CONTINUE_STATE;
    // NOTE(review): $local_archives appears unused in this method -- confirm.
    $local_archives = array("");
    while (CrawlDaemon::processHandler()) {
        $start_time = microtime();
        // Consume any control message dropped on disk for this fetcher.
        $fetcher_message_file = CRAWL_DIR .
            "/schedules/{$prefix}fetcher_messages.txt";
        if (file_exists($fetcher_message_file)) {
            $info = unserialize(file_get_contents($fetcher_message_file));
            unlink($fetcher_message_file);
            if (isset($info[self::STATUS]) &&
                $info[self::STATUS] == self::STOP_STATE) {
                continue;
            }
        }
        // Decide which of five scheduling cases applies this iteration.
        $switch_fetch_or_no_current = $this->checkCrawlTime();
        if ($switch_fetch_or_no_current) { /* case(1) */
            // Crawl switched or there is no current crawl.
            crawlLog("MAIN LOOP CASE 1 --" .
                " SWITCH CRAWL OR NO CURRENT CRAWL");
            $info[self::CRAWL_TIME] = $this->crawl_time;
            if ($info[self::CRAWL_TIME] == 0) {
                // crawl_time of 0 means nothing to do; drop pending URLs.
                $info[self::STATUS] = self::NO_DATA_STATE;
                $this->to_crawl = array();
            }
        } else {
            if ($this->crawl_type == self::ARCHIVE_CRAWL &&
                $this->arc_type != "WebArchiveBundle" &&
                $this->arc_type != "") { /* case(2) */
                // An archive crawl with data coming from the name server.
                crawlLog("MAIN LOOP CASE 2 -- ARCHIVE SCHEDULER (NOT RECRAWL)");
                $info = $this->checkArchiveScheduler();
                if ($info === false) {
                    crawlLog("No Archive Schedule Data..." .
                        " will try again in " . FETCH_SLEEP_TIME .
                        " seconds.");
                    sleep(FETCH_SLEEP_TIME);
                    continue;
                }
            } else {
                if ($this->crawl_time > 0) { /* case(3) */
                    // Either a web crawl or a recrawl of a previous web crawl.
                    if ($this->crawl_type == self::ARCHIVE_CRAWL) {
                        crawlLog("MAIN LOOP CASE 3 -- RECRAWL SCHEDULER");
                    } else {
                        crawlLog("MAIN LOOP CASE 4 -- WEB SCHEDULER");
                    }
                    $info = $this->checkScheduler();
                    if ($info === false) {
                        crawlLog("Cannot connect to name server..." .
                            " will try again in " . FETCH_SLEEP_TIME .
                            " seconds.");
                        sleep(FETCH_SLEEP_TIME);
                        continue;
                    }
                } else {
                    crawlLog("MAIN LOOP CASE 5 -- NO CURRENT CRAWL");
                    $info[self::STATUS] = self::NO_DATA_STATE;
                }
            }
        }
        /* case(2), case(3) might have set info without
           $info[self::STATUS] being set */
        if (!isset($info[self::STATUS])) {
            // checkScheduler()/checkArchiveScheduler() may return bare true.
            if ($info === true) {
                $info = array();
            }
            $info[self::STATUS] = self::CONTINUE_STATE;
        }
        if ($info[self::STATUS] == self::NO_DATA_STATE) {
            crawlLog("No data. Sleeping...");
            sleep(FETCH_SLEEP_TIME);
            continue;
        }
        // Directory name the current crawl's web archive should live under.
        $tmp_base_name = isset($info[self::CRAWL_TIME]) ?
            CRAWL_DIR . "/cache/{$prefix}" . self::archive_base_name .
            $info[self::CRAWL_TIME] : "";
        if (isset($info[self::CRAWL_TIME]) &&
            ($this->web_archive == NULL ||
            $this->web_archive->dir_name != $tmp_base_name)) {
            // Crawl changed: tear down old archive state and start fresh.
            if (isset($this->web_archive->dir_name)) {
                crawlLog("Old name: " . $this->web_archive->dir_name);
            }
            if (is_object($this->web_archive)) {
                $this->web_archive = NULL;
            }
            $this->to_crawl_again = array();
            $this->found_sites = array();
            // Free memory held by the old archive before opening a new one.
            gc_collect_cycles();
            $this->web_archive = new WebArchiveBundle($tmp_base_name, false);
            $this->crawl_time = $info[self::CRAWL_TIME];
            // Reset running page-statistics accumulators for the new crawl.
            $this->sum_seen_title_length = 0;
            $this->sum_seen_description_length = 0;
            $this->sum_seen_site_link_length = 0;
            $this->num_seen_sites = 0;
            crawlLog("New name: " . $this->web_archive->dir_name);
            crawlLog("Switching archive...");
            if (!isset($info[self::ARC_DATA])) {
                // No inline archive data came with the switch; fetch next pass.
                continue;
            }
        }
        // Obtain the next batch of pages for this iteration.
        switch ($this->crawl_type) {
            case self::WEB_CRAWL:
                $downloaded_pages = $this->downloadPagesWebCrawl();
                break;
            case self::ARCHIVE_CRAWL:
                if (isset($info[self::ARC_DATA])) {
                    // Pages were pushed to us with the schedule; use them.
                    $downloaded_pages = $info[self::ARC_DATA];
                } else {
                    $downloaded_pages = $this->downloadPagesArchiveCrawl();
                }
                break;
        }
        if (isset($downloaded_pages["NO_PROCESS"])) {
            // Pages marked NO_PROCESS skip summarization entirely.
            unset($downloaded_pages["NO_PROCESS"]);
            $summarized_site_pages = array_values($downloaded_pages);
            $this->no_process_links = true;
        } else {
            $summarized_site_pages =
                $this->processFetchPages($downloaded_pages);
            $this->no_process_links = false;
        }
        crawlLog("Number of summarized pages " .
            count($summarized_site_pages));
        // END_ITERATOR forces results to be sent even if the batch is small.
        $force_send = isset($info[self::END_ITERATOR]) &&
            $info[self::END_ITERATOR] ? true : false;
        $this->updateFoundSites($summarized_site_pages, $force_send);
        // Pad the iteration so each pass takes at least
        // MINIMUM_FETCH_LOOP_TIME seconds.
        $sleep_time = max(0, ceil(MINIMUM_FETCH_LOOP_TIME -
            changeInMicrotime($start_time)));
        if ($sleep_time > 0) {
            crawlLog("Ensure minimum loop time by sleeping..." . $sleep_time);
            sleep($sleep_time);
        }
    } //end while
    crawlLog("Fetcher shutting down!!");
}
/**
 * Main loop for the news updater.
 *
 * Runs until the daemon process handler signals a stop. Each iteration
 * calls newsUpdate() to refresh news feeds if needed, then sleeps so the
 * pass takes at least MINIMUM_UPDATE_LOOP_TIME seconds. (The unused
 * $info/$local_archives initializers, apparently copied from the fetcher
 * loop, have been removed -- nothing in this method read them.)
 */
function loop()
{
    crawlLog("In News Update Loop");
    while (CrawlDaemon::processHandler()) {
        $start_time = microtime();
        crawlLog("Checking if news feeds should be updated...");
        $this->newsUpdate();
        // Pad the iteration so the loop does not spin faster than
        // MINIMUM_UPDATE_LOOP_TIME seconds per pass.
        $sleep_time = max(0, ceil(MINIMUM_UPDATE_LOOP_TIME -
            changeInMicrotime($start_time)));
        if ($sleep_time > 0) {
            crawlLog("Ensure minimum loop time by sleeping..." . $sleep_time);
            sleep($sleep_time);
        }
    } //end while
    crawlLog("News Updater shutting down!!");
}