Пример #1
0
 /**
  * Checks whether the parameters under which the active crawl is being
  * conducted have been modified since the last time their values were
  * copied into this queue server's field variables. If so, it updates the
  * fields to their new values.
  *
  * The stored parameters are read from the unserialized DESCRIPTION entry
  * of the archive info for the index bundle of the current crawl time.
  * When the disallowed sites entry is present, the quota sites are
  * re-derived from it; when any of the url restriction fields changed and
  * this process is a scheduler, the queue is culled of urls that are no
  * longer crawlable. Nothing is done if the bundle's parameter modified
  * time equals the one remembered from the previous check.
  */
 function checkUpdateCrawlParameters()
 {
     crawlLog("Check for update in crawl parameters...");
     $dir = CRAWL_DIR . '/cache/' . self::index_data_base_name . $this->crawl_time;
     $modified_time = IndexArchiveBundle::getParamModifiedTime($dir);
     if ($this->archive_modified_time == $modified_time) {
         crawlLog("...none.");
         return;
     }
     // Maps each field of this object to the key under which its value is
     // stored in the archive's description array.
     $updatable_info = array(
         "page_range_request" => self::PAGE_RANGE_REQUEST,
         "max_description_len" => self::MAX_DESCRIPTION_LEN,
         "page_recrawl_frequency" => self::PAGE_RECRAWL_FREQUENCY,
         "restrict_sites_by_url" => self::RESTRICT_SITES_BY_URL,
         "cache_pages" => self::CACHE_PAGES,
         "allowed_sites" => self::ALLOWED_SITES,
         "disallowed_sites" => self::DISALLOWED_SITES,
         "page_rules" => self::PAGE_RULES,
         "indexed_file_types" => self::INDEXED_FILE_TYPES,
         "indexing_plugins" => self::INDEXING_PLUGINS,
         "indexing_plugins_data" => self::INDEXING_PLUGINS_DATA,
         "video_sources" => self::VIDEO_SOURCES
     );
     $archive_info = IndexArchiveBundle::getArchiveInfo($dir);
     $index_info = unserialize($archive_info['DESCRIPTION']);
     // A change in any of these fields may make already queued urls
     // non-crawlable.
     $check_cull_fields = array("restrict_sites_by_url", "allowed_sites", "disallowed_sites");
     $cull_now_non_crawlable = false;
     // Must be initialized here: it is only set to true inside the loop when
     // the description carries a disallowed sites entry, but it is always
     // read after the loop.
     $update_disallow = false;
     foreach ($updatable_info as $index_field => $info_key) {
         if (!isset($index_info[$info_key])) {
             continue;
         }
         $new_value = $index_info[$info_key];
         if ($index_field == "disallowed_sites") {
             $update_disallow = true;
         }
         // Compare against the old value before it is overwritten below.
         if (in_array($index_field, $check_cull_fields) && (!isset($this->{$index_field}) || $this->{$index_field} != $new_value)) {
             $cull_now_non_crawlable = true;
         }
         $this->{$index_field} = $new_value;
         if ($this->isOnlyScheduler()) {
             crawlLog("Scheduler Updating ...{$index_field}.");
         } else {
             crawlLog("Updating ...{$index_field}.");
         }
     }
     /* We now do further processing of disallowed sites to see if any
          of them are really quota sites
        */
     if ($update_disallow) {
         $this->updateDisallowedQuotaSites();
     }
     if ($this->isAScheduler() && $cull_now_non_crawlable) {
         crawlLog("Scheduler: Allowed/Disallowed Urls have changed");
         crawlLog("Scheduler: Checking if urls in queue need to be culled");
         $this->cullNoncrawlableSites();
     }
     // Remember the timestamp so the next call can short-circuit.
     $this->archive_modified_time = $modified_time;
 }