/**
 * Sets parameters for fetching based on provided info struct
 * ($info typically would come from the queue server)
 *
 * Fields present in $info are copied onto the corresponding properties of
 * this object. If QUEUE_SERVERS or CURRENT_SERVER are absent from $info,
 * they are filled in from this object's current values, so $info is
 * modified in place (hence the pass by reference). If any of the
 * allowed/disallowed/restrict-by-url settings change, the to-crawl lists
 * are culled via cullNoncrawlableSites().
 *
 * @param array& $info struct with info about the kind of crawl, timestamp
 *     of index, crawl order, etc.
 */
function setCrawlParamsFromArray(&$info)
{
    /* QUEUE_SERVERS and CURRENT_SERVER might not be set if info came
       from a queue_server rather than from name server
     */
    if (isset($info[self::QUEUE_SERVERS])) {
        $this->queue_servers = $info[self::QUEUE_SERVERS];
    } else {
        $info[self::QUEUE_SERVERS] = $this->queue_servers;
    }
    if (isset($info[self::CURRENT_SERVER])) {
        $this->current_server = $info[self::CURRENT_SERVER];
    } else {
        $info[self::CURRENT_SERVER] = $this->current_server;
    }
    /* Maps keys of $info to the name of the property of this object that
       should receive the value
     */
    $update_fields = array(
        self::ALLOWED_SITES => 'allowed_sites',
        self::CACHE_PAGES => 'cache_pages',
        self::CRAWL_INDEX => "crawl_index",
        self::CRAWL_ORDER => 'crawl_order',
        self::CRAWL_TYPE => "crawl_type",
        self::DISALLOWED_SITES => 'disallowed_sites',
        self::INDEXED_FILE_TYPES => 'indexed_file_types',
        self::PROXY_SERVERS => 'proxy_servers',
        self::RESTRICT_SITES_BY_URL => 'restrict_sites_by_url',
        self::SUMMARIZER_OPTION => "summarizer_option",
        self::TOR_PROXY => 'tor_proxy'
    );
    /* Property names whose change means already scheduled urls may no
       longer be crawlable
     */
    $check_cull_fields = array("restrict_sites_by_url", "allowed_sites",
        "disallowed_sites");
    $cull_now_non_crawlable = false;
    foreach ($update_fields as $info_field => $field) {
        if (!isset($info[$info_field])) {
            continue;
        }
        /* $check_cull_fields holds property names, so membership has to
           be tested with $field (the property name) and not with
           $info_field (the key used in $info); otherwise a change in
           these settings would go undetected whenever the key differs
           from the property name.
         */
        if (in_array($field, $check_cull_fields, true)
            && (!isset($this->{$field})
            || $this->{$field} != $info[$info_field])) {
            $cull_now_non_crawlable = true;
        }
        $this->{$field} = $info[$info_field];
    }
    if ($cull_now_non_crawlable) {
        crawlLog("Allowed/Disallowed Urls have changed");
        crawlLog("Checking if urls in to crawl lists need to be culled");
        $this->cullNoncrawlableSites();
    }
    if (!empty($info[self::ACTIVE_CLASSIFIERS_DATA])) {
        $this->active_classifiers = isset($info[self::ACTIVE_CLASSIFIERS])
            && is_array($info[self::ACTIVE_CLASSIFIERS]) ?
            $info[self::ACTIVE_CLASSIFIERS] : array();
        $this->active_rankers = isset($info[self::ACTIVE_RANKERS])
            && is_array($info[self::ACTIVE_RANKERS]) ?
            $info[self::ACTIVE_RANKERS] : array();
        /*
           The classifier data is set by the fetch controller for each
           active classifier, and is a compressed, serialized structure
           containing all of the objects needed for classification.
         */
        $classifiers_data = $info[self::ACTIVE_CLASSIFIERS_DATA];
        $this->classifiers = array();
        foreach ($classifiers_data as $label => $classifier_data) {
            if ($classifier_data) {
                $classifier = Classifier::newClassifierFromData(
                    $classifier_data);
                $this->classifiers[] = $classifier;
                crawlLog("Loading '{$label}' classifier/ranker.");
                if (in_array($label, $this->active_classifiers)) {
                    crawlLog(" Using '{$label}' as a classifier.");
                }
                if (in_array($label, $this->active_rankers)) {
                    crawlLog(" Using '{$label}' as a ranker.");
                }
            } else {
                crawlLog("Skipping classifier '{$label}'; missing " .
                    "finalized data.");
            }
        }
    }
    if (isset($info[self::PAGE_RULES])) {
        /* Rules arrive as an array of lines; they are joined and have
           their HTML entities decoded before being handed to the parser
         */
        $rule_string = implode("\n", $info[self::PAGE_RULES]);
        $rule_string = html_entity_decode($rule_string, ENT_QUOTES);
        $this->page_rule_parser = new PageRuleParser($rule_string);
    }
    if (isset($info[self::VIDEO_SOURCES])) {
        $this->video_sources = $info[self::VIDEO_SOURCES];
    }
    if (isset($info[self::INDEXING_PLUGINS])) {
        $this->plugin_processors = array();
        /* First pass: instantiate each plugin (configuring it if it
           supports configuration and data was supplied) and register it
           under every processor class name it reports
         */
        foreach ($info[self::INDEXING_PLUGINS] as $plugin) {
            $plugin_name = $plugin . "Plugin";
            $processors = $plugin_name::getProcessors();
            $plugin_object = new $plugin_name();
            if (method_exists($plugin_name, "setConfiguration")
                && isset($info[self::INDEXING_PLUGINS_DATA][$plugin])) {
                $plugin_object->setConfiguration(
                    $info[self::INDEXING_PLUGINS_DATA][$plugin]);
            }
            foreach ($processors as $processor) {
                $this->plugin_processors[$processor][$plugin_name] =
                    $plugin_object;
            }
        }
        /* Second pass: for each indexed file type with an existing
           processor class, also pick up the plugins registered for its
           ancestor classes, walking up until PageProcessor (exclusive).
           The += array union keeps entries already present for the
           processor, so a plugin is never listed twice.
         */
        foreach ($this->indexed_file_types as $file_type) {
            $processor = ucfirst($file_type) . "Processor";
            if (!class_exists($processor)) {
                continue;
            }
            if (!isset($this->plugin_processors[$processor])) {
                $this->plugin_processors[$processor] = array();
            }
            $parent_processor = $processor;
            while (($parent_processor =
                get_parent_class($parent_processor))
                && $parent_processor != "PageProcessor") {
                if (isset($this->plugin_processors[$parent_processor])) {
                    $this->plugin_processors[$processor] +=
                        $this->plugin_processors[$parent_processor];
                }
            }
        }
        /* Drop the plugin-name keys so each processor maps to a plain
           list of plugin objects
         */
        foreach ($this->plugin_processors as $processor => $plugins) {
            $this->plugin_processors[$processor] = array_values($plugins);
        }
    }
    /* Only ever lower post_max_size, or set it if it has no value yet */
    if (isset($info[self::POST_MAX_SIZE])
        && ($this->post_max_size > $info[self::POST_MAX_SIZE]
        || !$this->post_max_size)) {
        $this->post_max_size = $info[self::POST_MAX_SIZE];
    }
    if (isset($info[self::SCHEDULE_TIME])) {
        $this->schedule_time = $info[self::SCHEDULE_TIME];
    }
    if (isset($info[self::PAGE_RANGE_REQUEST])) {
        $this->page_range_request = $info[self::PAGE_RANGE_REQUEST];
    }
    if (isset($info[self::MAX_DESCRIPTION_LEN])) {
        $this->max_description_len = $info[self::MAX_DESCRIPTION_LEN];
    }
}