Esempio n. 1
0
 /**
  * Sets parameters for fetching based on provided info struct
  * ($info typically would come from the queue server)
  *
  * Fields present in $info are copied onto the corresponding properties of
  * this object. When QUEUE_SERVERS or CURRENT_SERVER are missing from $info
  * they are filled in from this object's current values, which is why $info
  * is taken by reference. If any of the site restriction fields
  * (restrict_sites_by_url, allowed_sites, disallowed_sites) changes value,
  * cullNoncrawlableSites() is invoked after the fields have been updated.
  *
  * @param array& $info struct with info about the kind of crawl, timestamp
  * of index, crawl order, etc.
  */
 function setCrawlParamsFromArray(&$info)
 {
     /* QUEUE_SERVERS and CURRENT_SERVER might not be set if info came
           from a queue_server rather than from name server
        */
     if (isset($info[self::QUEUE_SERVERS])) {
         $this->queue_servers = $info[self::QUEUE_SERVERS];
     } else {
         $info[self::QUEUE_SERVERS] = $this->queue_servers;
     }
     if (isset($info[self::CURRENT_SERVER])) {
         $this->current_server = $info[self::CURRENT_SERVER];
     } else {
         $info[self::CURRENT_SERVER] = $this->current_server;
     }
     // Maps a key of $info to the name of the property it is copied into
     $update_fields = array(
         self::ALLOWED_SITES => 'allowed_sites',
         self::CACHE_PAGES => 'cache_pages',
         self::CRAWL_INDEX => "crawl_index",
         self::CRAWL_ORDER => 'crawl_order',
         self::CRAWL_TYPE => "crawl_type",
         self::DISALLOWED_SITES => 'disallowed_sites',
         self::INDEXED_FILE_TYPES => 'indexed_file_types',
         self::PROXY_SERVERS => 'proxy_servers',
         self::RESTRICT_SITES_BY_URL => 'restrict_sites_by_url',
         self::SUMMARIZER_OPTION => "summarizer_option",
         self::TOR_PROXY => 'tor_proxy'
     );
     // Property names whose change means the to-crawl lists need re-checking
     $check_cull_fields = array("restrict_sites_by_url", "allowed_sites",
         "disallowed_sites");
     $cull_now_non_crawlable = false;
     foreach ($update_fields as $info_field => $field) {
         if (!isset($info[$info_field])) {
             continue;
         }
         /* $check_cull_fields holds property names, so it has to be
            compared with $field (the property name) rather than with
            $info_field (the key used in $info). The comparison is strict
            so a non-string key can never loosely match a name.
          */
         if (in_array($field, $check_cull_fields, true) &&
             (!isset($this->{$field}) ||
             $this->{$field} != $info[$info_field])) {
             $cull_now_non_crawlable = true;
         }
         $this->{$field} = $info[$info_field];
     }
     // Cull only after every field is updated so the new rules are in effect
     if ($cull_now_non_crawlable) {
         crawlLog("Allowed/Disallowed Urls have changed");
         crawlLog("Checking if urls in to crawl lists need to be culled");
         $this->cullNoncrawlableSites();
     }
     if (!empty($info[self::ACTIVE_CLASSIFIERS_DATA])) {
         $this->active_classifiers =
             isset($info[self::ACTIVE_CLASSIFIERS]) &&
             is_array($info[self::ACTIVE_CLASSIFIERS]) ?
             $info[self::ACTIVE_CLASSIFIERS] : array();
         $this->active_rankers =
             isset($info[self::ACTIVE_RANKERS]) &&
             is_array($info[self::ACTIVE_RANKERS]) ?
             $info[self::ACTIVE_RANKERS] : array();
         /*
           The classifier data is set by the fetch controller for each
           active classifier, and is a compressed, serialized structure
           containing all of the objects needed for classification.
         */
         $classifiers_data = $info[self::ACTIVE_CLASSIFIERS_DATA];
         $this->classifiers = array();
         foreach ($classifiers_data as $label => $classifier_data) {
             if ($classifier_data) {
                 $classifier =
                     Classifier::newClassifierFromData($classifier_data);
                 $this->classifiers[] = $classifier;
                 crawlLog("Loading '{$label}' classifier/ranker.");
                 /* Loose in_array is kept on purpose here: a numeric
                    looking label becomes an integer array key and would
                    not strictly equal its string form.
                  */
                 if (in_array($label, $this->active_classifiers)) {
                     crawlLog("  Using '{$label}' as a classifier.");
                 }
                 if (in_array($label, $this->active_rankers)) {
                     crawlLog("  Using '{$label}' as a ranker.");
                 }
             } else {
                 crawlLog("Skipping classifier '{$label}'; missing " .
                     "finalized data.");
             }
         }
     }
     if (isset($info[self::PAGE_RULES])) {
         // Rules arrive as a list of entity-encoded lines
         $rule_string = implode("\n", $info[self::PAGE_RULES]);
         $rule_string = html_entity_decode($rule_string, ENT_QUOTES);
         $this->page_rule_parser = new PageRuleParser($rule_string);
     }
     if (isset($info[self::VIDEO_SOURCES])) {
         $this->video_sources = $info[self::VIDEO_SOURCES];
     }
     if (isset($info[self::INDEXING_PLUGINS])) {
         /* First pass: build processor name => (plugin class name =>
            plugin object) from what each plugin says it handles
          */
         $this->plugin_processors = array();
         foreach ($info[self::INDEXING_PLUGINS] as $plugin) {
             $plugin_name = $plugin . "Plugin";
             $processors = $plugin_name::getProcessors();
             $plugin_object = new $plugin_name();
             if (method_exists($plugin_name, "setConfiguration") &&
                 isset($info[self::INDEXING_PLUGINS_DATA][$plugin])) {
                 $plugin_object->setConfiguration(
                     $info[self::INDEXING_PLUGINS_DATA][$plugin]);
             }
             foreach ($processors as $processor) {
                 $this->plugin_processors[$processor][$plugin_name] =
                     $plugin_object;
             }
         }
         /* Second pass: a processor for an indexed file type also gets the
            plugins registered for its ancestor classes, stopping before
            PageProcessor. The += keeps an entry already present for the
            same plugin class name.
          */
         foreach ($this->indexed_file_types as $file_type) {
             $processor = ucfirst($file_type) . "Processor";
             if (!class_exists($processor)) {
                 continue;
             }
             if (!isset($this->plugin_processors[$processor])) {
                 $this->plugin_processors[$processor] = array();
             }
             $parent_processor = $processor;
             while (($parent_processor =
                 get_parent_class($parent_processor)) &&
                 $parent_processor != "PageProcessor") {
                 if (isset($this->plugin_processors[$parent_processor])) {
                     $this->plugin_processors[$processor] +=
                         $this->plugin_processors[$parent_processor];
                 }
             }
         }
         // Final pass: drop the plugin-name keys, leaving plain lists
         foreach ($this->plugin_processors as $processor => $plugins) {
             $this->plugin_processors[$processor] = array_values($plugins);
         }
     }
     // Only ever lower post_max_size, or set it if it has no value yet
     if (isset($info[self::POST_MAX_SIZE]) &&
         ($this->post_max_size > $info[self::POST_MAX_SIZE] ||
         !$this->post_max_size)) {
         $this->post_max_size = $info[self::POST_MAX_SIZE];
     }
     if (isset($info[self::SCHEDULE_TIME])) {
         $this->schedule_time = $info[self::SCHEDULE_TIME];
     }
     if (isset($info[self::PAGE_RANGE_REQUEST])) {
         $this->page_range_request = $info[self::PAGE_RANGE_REQUEST];
     }
     if (isset($info[self::MAX_DESCRIPTION_LEN])) {
         $this->max_description_len = $info[self::MAX_DESCRIPTION_LEN];
     }
 }