/**
 * Handles admin requests related to controlling the page options used
 * in a crawl.
 *
 * This activity lets a user specify the page range size to be used during
 * a crawl, the recrawl frequency, the summarizer, the maximum description
 * length, whether pages are cached, which file types can be downloaded,
 * which indexing plugins, classifiers and rankers are active, and the page
 * rules. It also handles the search-time weights stored in the profile and
 * the "test options" tab, which runs a pasted test page through the page
 * processor, the page rules and the phrase extractor.
 *
 * Reads its input from $_REQUEST; saves the resulting seed info through the
 * crawl model and, when a weight changed, the profile through the profile
 * model.
 *
 * @return array $data fields used by the view to draw the page options
 *     form (and, on the test tab, the results of the test run)
 */
function pageOptions()
{
    global $INDEXED_FILE_TYPES;
    /* get processors for different file types
       (populating $INDEXED_FILE_TYPES) */
    foreach (glob(BASE_DIR . "/lib/processors/*_processor.php")
        as $filename) {
        require_once $filename;
    }
    $parent = $this->parent;
    $crawl_model = $parent->model("crawl");
    $profile_model = $parent->model("profile");
    $data["ELEMENT"] = "pageoptions";
    $data['SCRIPT'] = "";
    $machine_urls = $parent->model("machine")->getQueueServerUrls();
    $num_machines = count($machine_urls);
    // parentheses only make the existing || / && precedence explicit
    if ($num_machines < 1 || ($num_machines == 1 &&
        UrlParser::isLocalhostUrl($machine_urls[0]))) {
        $machine_urls = NULL;
    }
    $data['available_options'] = array(
        tl('crawl_component_use_below'),
        tl('crawl_component_use_defaults'));
    $crawls = $crawl_model->getCrawlList(false, true, $machine_urls);
    $data['options_default'] = tl('crawl_component_use_below');
    foreach ($crawls as $crawl) {
        if (strlen($crawl['DESCRIPTION']) > 0) {
            $data['available_options'][$crawl['CRAWL_TIME']] =
                $crawl['DESCRIPTION'];
        }
    }
    $seed_info = $crawl_model->getSeedInfo();
    $data['RECRAWL_FREQS'] = array(
        -1 => tl('crawl_component_recrawl_never'),
        1 => tl('crawl_component_recrawl_1day'),
        2 => tl('crawl_component_recrawl_2day'),
        3 => tl('crawl_component_recrawl_3day'),
        7 => tl('crawl_component_recrawl_7day'),
        14 => tl('crawl_component_recrawl_14day'));
    $data['SIZE_VALUES'] = array(10000 => 10000, 50000 => 50000,
        100000 => 100000, 500000 => 500000, 1000000 => 1000000,
        5000000 => 5000000, 10000000 => 10000000);
    $data['LEN_VALUES'] = array(2000 => 2000, 10000 => 10000,
        50000 => 50000, 100000 => 100000, 500000 => 500000,
        1000000 => 1000000, 5000000 => 5000000, 10000000 => 10000000);
    $data['available_summarizers'] = array(
        self::BASIC_SUMMARIZER => tl('crawl_component_basic'),
        self::CENTROID_SUMMARIZER => tl('crawl_component_centroid'));
    $loaded = false;
    if (isset($_REQUEST['load_option']) &&
        $_REQUEST['load_option'] > 0) {
        /* load_option 1 asks for the default seed info, anything larger
           is treated as the timestamp of a previous crawl */
        if ($_REQUEST['load_option'] == 1) {
            $seed_loaded = $crawl_model->getSeedInfo(true);
        } else {
            $timestamp = substr($parent->clean(
                $_REQUEST['load_option'], "int"), 0, TIMESTAMP_LEN);
            $seed_loaded = $crawl_model->getCrawlSeedInfo(
                $timestamp, $machine_urls);
        }
        $copy_options = array(
            "general" => array("page_recrawl_frequency",
                "page_range_request", "max_description_len",
                "cache_pages", 'summarizer_option'),
            "indexed_file_types" => array("extensions"),
            "indexing_plugins" => array("plugins", "plugins_data"));
        foreach ($copy_options as $main_option => $sub_options) {
            foreach ($sub_options as $sub_option) {
                if (isset($seed_loaded[$main_option][$sub_option])) {
                    $seed_info[$main_option][$sub_option] =
                        $seed_loaded[$main_option][$sub_option];
                }
            }
        }
        if (isset($seed_loaded['page_rules'])) {
            $seed_info['page_rules'] = $seed_loaded['page_rules'];
        }
        if (isset($seed_loaded['active_classifiers'])) {
            $seed_info['active_classifiers'] =
                $seed_loaded['active_classifiers'];
        } else {
            $seed_info['active_classifiers'] = array();
            $seed_info['active_classifiers']['label'] = array();
        }
        $loaded = true;
    } else {
        $seed_info = $crawl_model->getSeedInfo();
        if (isset($_REQUEST["page_recrawl_frequency"]) &&
            in_array($_REQUEST["page_recrawl_frequency"],
            array_keys($data['RECRAWL_FREQS']))) {
            $seed_info["general"]["page_recrawl_frequency"] =
                $_REQUEST["page_recrawl_frequency"];
        }
        if (isset($_REQUEST["page_range_request"]) &&
            in_array($_REQUEST["page_range_request"],
            $data['SIZE_VALUES'])) {
            $seed_info["general"]["page_range_request"] =
                $_REQUEST["page_range_request"];
        }
        if (isset($_REQUEST['summarizer_option']) &&
            in_array($_REQUEST['summarizer_option'],
            array_keys($data['available_summarizers']))) {
            $seed_info['general']['summarizer_option'] =
                $_REQUEST['summarizer_option'];
        }
        if (isset($_REQUEST["max_description_len"]) &&
            in_array($_REQUEST["max_description_len"],
            $data['LEN_VALUES'])) {
            $seed_info["general"]["max_description_len"] =
                $_REQUEST["max_description_len"];
        }
        if (isset($_REQUEST["cache_pages"])) {
            $seed_info["general"]["cache_pages"] = true;
        } else if (isset($_REQUEST['posted'])) {
            //form sent but check box unchecked
            $seed_info["general"]["cache_pages"] = false;
        }
        if (isset($_REQUEST['page_rules'])) {
            $seed_info['page_rules']['rule'] =
                $parent->convertStringCleanArray(
                $_REQUEST['page_rules'], 'rule');
        }
    }
    /* Default the indexed extensions only after the branch above: the
       non-load branch re-reads the seed info, so a default applied before
       it would be thrown away and the file type loop below would then read
       a missing index. */
    if (!isset($seed_info["indexed_file_types"]["extensions"])) {
        $seed_info["indexed_file_types"]["extensions"] =
            $INDEXED_FILE_TYPES;
    }
    if (!isset($seed_info["general"]["page_recrawl_frequency"])) {
        $seed_info["general"]["page_recrawl_frequency"] =
            PAGE_RECRAWL_FREQUENCY;
    }
    /* summarizer_option gets a default like the other general options; it
       is read here and handed to the page processor on the test tab */
    if (!isset($seed_info["general"]["summarizer_option"])) {
        $seed_info["general"]["summarizer_option"] =
            self::BASIC_SUMMARIZER;
    }
    $data['summarizer_option'] =
        $seed_info['general']['summarizer_option'];
    $data['PAGE_RECRAWL_FREQUENCY'] =
        $seed_info["general"]["page_recrawl_frequency"];
    if (!isset($seed_info["general"]["cache_pages"])) {
        $seed_info["general"]["cache_pages"] = false;
    }
    $data["CACHE_PAGES"] = $seed_info["general"]["cache_pages"];
    if (!isset($seed_info["general"]["page_range_request"])) {
        $seed_info["general"]["page_range_request"] = PAGE_RANGE_REQUEST;
    }
    $data['PAGE_SIZE'] = $seed_info["general"]["page_range_request"];
    if (!isset($seed_info["general"]["max_description_len"])) {
        $seed_info["general"]["max_description_len"] =
            MAX_DESCRIPTION_LEN;
    }
    $data['MAX_LEN'] = $seed_info["general"]["max_description_len"];

    $data['INDEXING_PLUGINS'] = array();
    if (isset($_REQUEST["posted"]) && !$loaded) {
        $seed_info['indexing_plugins']['plugins'] =
            isset($_REQUEST["INDEXING_PLUGINS"]) ?
            $_REQUEST["INDEXING_PLUGINS"] : array();
    }
    $included_plugins = isset($seed_info['indexing_plugins']['plugins']) ?
        $seed_info['indexing_plugins']['plugins'] : array();
    foreach ($parent->indexing_plugins as $plugin) {
        $plugin_name = ucfirst($plugin);
        $data['INDEXING_PLUGINS'][$plugin_name]['checked'] =
            in_array($plugin_name, $included_plugins) ?
            "checked='checked'" : "";
        /* to use method_exists we want that the require_once for the
           plugin class has occurred so we instantiate the object via the
           plugin method call which will also do the require if needed.
         */
        $plugin_object = $parent->plugin(lcfirst($plugin_name));
        $class_name = $plugin_name . "Plugin";
        if ($loaded && method_exists($class_name, 'setConfiguration') &&
            method_exists($class_name, 'loadDefaultConfiguration')) {
            if (isset($seed_info['indexing_plugins']['plugins_data'][
                $plugin_name])) {
                $plugin_object->setConfiguration(
                    $seed_info['indexing_plugins']['plugins_data'][
                    $plugin_name]);
            } else {
                $plugin_object->loadDefaultConfiguration();
            }
            $plugin_object->saveConfiguration();
        }
        if (method_exists($class_name, 'configureHandler') &&
            method_exists($class_name, 'configureView')) {
            $data['INDEXING_PLUGINS'][$plugin_name]['configure'] = true;
            $plugin_object->configureHandler($data);
        } else {
            $data['INDEXING_PLUGINS'][$plugin_name]['configure'] = false;
        }
    }

    $profile = $profile_model->getProfile(WORK_DIRECTORY);
    if (!isset($_REQUEST['load_option'])) {
        $data = array_merge($data, $profile);
    } else {
        $parent->updateProfileFields($data, $profile,
            array('IP_LINK', 'CACHE_LINK', 'SIMILAR_LINK', 'IN_LINK',
            'SIGNIN_LINK', 'SUBSEARCH_LINK', 'WORD_SUGGEST'));
    }
    // search-time weights and the values used when the profile has none
    $weights = array('TITLE_WEIGHT' => 4, 'DESCRIPTION_WEIGHT' => 1,
        'LINK_WEIGHT' => 2, 'MIN_RESULTS_TO_GROUP' => 200,
        'SERVER_ALPHA' => 1.6);
    $change = false;
    foreach ($weights as $weight => $value) {
        if (isset($_REQUEST[$weight])) {
            $data[$weight] = $parent->clean($_REQUEST[$weight], 'float', 1);
            $profile[$weight] = $data[$weight];
            $change = true;
        } else if (isset($profile[$weight]) && $profile[$weight] != "") {
            $data[$weight] = $profile[$weight];
        } else {
            $data[$weight] = $value;
            $profile[$weight] = $data[$weight];
            $change = true;
        }
    }
    if ($change == true) {
        $profile_model->updateProfile(WORK_DIRECTORY, array(), $profile);
    }

    $data['INDEXED_FILE_TYPES'] = array();
    $filetypes = array();
    foreach ($INDEXED_FILE_TYPES as $filetype) {
        $ison = false;
        if (isset($_REQUEST["filetype"]) && !$loaded) {
            if (isset($_REQUEST["filetype"][$filetype])) {
                $filetypes[] = $filetype;
                $ison = true;
                $change = true;
            }
        } else if (in_array($filetype,
            $seed_info["indexed_file_types"]["extensions"])) {
            $filetypes[] = $filetype;
            $ison = true;
        }
        $data['INDEXED_FILE_TYPES'][$filetype] = $ison ?
            "checked='checked'" : '';
    }
    $seed_info["indexed_file_types"]["extensions"] = $filetypes;

    $data['CLASSIFIERS'] = array();
    $data['RANKERS'] = array();
    $active_classifiers = array();
    $active_rankers = array();
    foreach (Classifier::getClassifierList() as $classifier) {
        $label = $classifier->class_label;
        $ison = false;
        if (isset($_REQUEST['classifier']) && !$loaded) {
            if (isset($_REQUEST['classifier'][$label])) {
                $ison = true;
            }
        } else if ($loaded || !isset($_REQUEST['posted']) &&
            isset($seed_info['active_classifiers']['label'])) {
            /* isset guard mirrors the ranker code below: loaded seed info
               may carry active_classifiers without a label list */
            if (isset($seed_info['active_classifiers']['label']) &&
                in_array($label,
                $seed_info['active_classifiers']['label'])) {
                $ison = true;
            }
        }
        if ($ison) {
            $data['CLASSIFIERS'][$label] = 'checked="checked"';
            $active_classifiers[] = $label;
        } else {
            $data['CLASSIFIERS'][$label] = '';
        }
        $ison = false;
        if (isset($_REQUEST['ranker']) && !$loaded) {
            if (isset($_REQUEST['ranker'][$label])) {
                $ison = true;
            }
        } else if ($loaded || !isset($_REQUEST['posted']) &&
            isset($seed_info['active_rankers']['label'])) {
            if (isset($seed_info['active_rankers']['label']) &&
                in_array($label, $seed_info['active_rankers']['label'])) {
                $ison = true;
            }
        }
        if ($ison) {
            $data['RANKERS'][$label] = 'checked="checked"';
            $active_rankers[] = $label;
        } else {
            $data['RANKERS'][$label] = '';
        }
    }
    $parent->pagingLogic($data, 'CLASSIFIERS', 'CLASSIFIERS',
        DEFAULT_ADMIN_PAGING_NUM / 5, array(), "",
        array('name' => 'class_label'));
    $seed_info['active_classifiers']['label'] = $active_classifiers;
    $seed_info['active_rankers']['label'] = $active_rankers;

    /* The rules may be stored either directly under ['rule'] or one level
       deeper under ['rule']['rule']; normalise once so the textarea and
       the test run below agree on which list they use. */
    $page_rule_lines = array();
    if (isset($seed_info['page_rules']['rule'])) {
        $page_rule_lines =
            isset($seed_info['page_rules']['rule']['rule']) ?
            $seed_info['page_rules']['rule']['rule'] :
            $seed_info['page_rules']['rule'];
        $data['page_rules'] = $parent->convertArrayLines($page_rule_lines);
    } else {
        $data['page_rules'] = "";
    }

    $allowed_options = array('crawl_time', 'search_time', 'test_options');
    if (isset($_REQUEST['option_type']) &&
        in_array($_REQUEST['option_type'], $allowed_options)) {
        $data['option_type'] = $_REQUEST['option_type'];
    } else {
        $data['option_type'] = 'crawl_time';
    }
    if ($data['option_type'] == 'crawl_time') {
        $data['crawl_time_active'] = "active";
        $data['search_time_active'] = "";
        $data['test_options_active'] = "";
        $data['SCRIPT'] .= "\nswitchTab('crawltimetab'," .
            "'searchtimetab', 'testoptionstab')\n";
    } else if ($data['option_type'] == 'search_time') {
        $data['search_time_active'] = "active";
        $data['crawl_time_active'] = "";
        $data['test_options_active'] = "";
        $data['SCRIPT'] .= "\nswitchTab('searchtimetab'," .
            "'crawltimetab', 'testoptionstab')\n";
    } else {
        $data['search_time_active'] = "";
        $data['crawl_time_active'] = "";
        $data['test_options_active'] = "active";
        $data['SCRIPT'] .= "\nswitchTab('testoptionstab'," .
            "'crawltimetab', 'searchtimetab');\n";
    }

    $crawl_model->setSeedInfo($seed_info);
    if ($change == true && $data['option_type'] != 'test_options') {
        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
            tl('crawl_component_page_options_updated') . "</h1>')";
    }

    // mime types offered on the test tab => processor name prefix
    $test_processors = array(
        "text/html" => "html",
        "text/asp" => "html",
        "text/xml" => "xml",
        "text/robot" => "robot",
        "application/xml" => "xml",
        "application/xhtml+xml" => "html",
        "application/rss+xml" => "rss",
        "application/atom+xml" => "rss",
        "text/csv" => "text",
        "text/gopher" => "gopher",
        "text/plain" => "text",
        "text/rtf" => "rtf",
        "text/tab-separated-values" => "text");
    $data['MIME_TYPES'] = array_keys($test_processors);
    $data['page_type'] = "text/html";
    if (isset($_REQUEST['page_type']) &&
        in_array($_REQUEST['page_type'], $data['MIME_TYPES'])) {
        $data['page_type'] = $_REQUEST['page_type'];
    }
    $data['TESTPAGE'] = isset($_REQUEST['TESTPAGE']) ?
        $parent->clean($_REQUEST['TESTPAGE'], 'string') : "";

    if ($data['option_type'] == 'test_options' &&
        $data['TESTPAGE'] != "") {
        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
            tl('crawl_component_page_options_running_tests') . "</h1>')";
        // fake download info for the pasted test page
        $site = array();
        $site[self::ENCODING] = "UTF-8";
        $site[self::URL] = "http://test-site.yioop.com/";
        $site[self::IP_ADDRESSES] = array("1.1.1.1");
        $site[self::HTTP_CODE] = 200;
        $site[self::MODIFIED] = date("U", time());
        $site[self::TIMESTAMP] = time();
        $site[self::TYPE] = "text/html";
        $site[self::HEADER] = "page options test extractor";
        $site[self::SERVER] = "unknown";
        $site[self::SERVER_VERSION] = "unknown";
        $site[self::OPERATING_SYSTEM] = "unknown";
        $site[self::LANG] = 'en';
        $site[self::JUST_METAS] = false;
        if (isset($_REQUEST['page_type']) &&
            in_array($_REQUEST['page_type'], $data['MIME_TYPES'])) {
            $site[self::TYPE] = $_REQUEST['page_type'];
        }
        if ($site[self::TYPE] == 'text/html') {
            $site[self::ENCODING] =
                guessEncodingHtml($_REQUEST['TESTPAGE']);
        }
        $prefix_name = $test_processors[$site[self::TYPE]];
        $processor_name = ucfirst($prefix_name) . "Processor";
        /* collect the active plugins that support this processor or one
           of its ancestors below PageProcessor */
        $plugin_processors = array();
        if (isset($seed_info['indexing_plugins']['plugins'])) {
            foreach ($seed_info['indexing_plugins']['plugins'] as $plugin) {
                $plugin_name = $plugin . "Plugin";
                $supported_processors = $plugin_name::getProcessors();
                foreach ($supported_processors as $supported_processor) {
                    $parent_processor = $processor_name;
                    do {
                        if ($supported_processor == $parent_processor) {
                            $plugin_object =
                                $parent->plugin(lcfirst($plugin));
                            if (method_exists($plugin_name,
                                "loadConfiguration")) {
                                $plugin_object->loadConfiguration();
                            }
                            $plugin_processors[] = $plugin_object;
                            break;
                        }
                    } while (($parent_processor =
                        get_parent_class($parent_processor)) &&
                        $parent_processor != "PageProcessor");
                }
            }
        }
        $page_processor = new $processor_name($plugin_processors,
            $seed_info["general"]["max_description_len"],
            $seed_info["general"]["summarizer_option"]);
        /* the processor runs with the previous error handler in place;
           the yioop handler is put back right after */
        restore_error_handler();
        $data["PAGE_RANGE_REQUEST"] =
            $seed_info["general"]["page_range_request"];
        $doc_info = $page_processor->handle(
            substr($_REQUEST['TESTPAGE'], 0, $data["PAGE_RANGE_REQUEST"]),
            $site[self::URL]);
        set_error_handler("yioop_error_handler");
        if (!$doc_info) {
            $data["AFTER_PAGE_PROCESS"] = "";
            $data["AFTER_RULE_PROCESS"] = "";
            $data["EXTRACTED_WORDS"] = "";
            $data["EXTRACTED_META_WORDS"] = "";
            return $data;
        }
        if ($processor_name != "RobotProcessor" &&
            !isset($doc_info[self::JUST_METAS])) {
            $doc_info[self::LINKS] =
                UrlParser::pruneLinks($doc_info[self::LINKS]);
        }
        foreach ($doc_info as $key => $value) {
            $site[$key] = $value;
        }
        if (isset($site[self::PAGE])) {
            unset($site[self::PAGE]);
        }
        if (isset($site[self::ROBOT_PATHS])) {
            $site[self::JUST_METAS] = true;
        }
        /* map constant values back to their CrawlConstants names so the
           printed arrays have readable keys */
        $reflect = new ReflectionClass("CrawlConstants");
        $crawl_constants = $reflect->getConstants();
        $crawl_keys = array_keys($crawl_constants);
        $crawl_values = array_values($crawl_constants);
        $inverse_constants = array_combine($crawl_values, $crawl_keys);
        $after_process = array();
        foreach ($site as $key => $value) {
            $out_key = isset($inverse_constants[$key]) ?
                $inverse_constants[$key] : $key;
            $after_process[$out_key] = $value;
        }
        $data["AFTER_PAGE_PROCESS"] = wordwrap($parent->clean(
            print_r($after_process, true), "string"), 75, "\n", true);
        /* use the normalised rule list so a missing or nested
           ['page_rules']['rule'] no longer breaks implode() */
        $rule_string = implode("\n", (array)$page_rule_lines);
        $rule_string = html_entity_decode($rule_string, ENT_QUOTES);
        $page_rule_parser = new PageRuleParser($rule_string);
        $page_rule_parser->executeRuleTrees($site);
        $after_process = array();
        foreach ($site as $key => $value) {
            $out_key = isset($inverse_constants[$key]) ?
                $inverse_constants[$key] : $key;
            $after_process[$out_key] = $value;
        }
        $data["AFTER_RULE_PROCESS"] = wordwrap($parent->clean(
            print_r($after_process, true), "string"), 75, "\n", true);
        $lang = NULL;
        if (isset($site[self::LANG])) {
            $lang = $site[self::LANG];
        }
        $meta_ids = PhraseParser::calculateMetas($site);
        if (!$site[self::JUST_METAS]) {
            $host_words = UrlParser::getWordsIfHostUrl($site[self::URL]);
            $path_words = UrlParser::getWordsLastPathPartUrl(
                $site[self::URL]);
            $phrase_string = $host_words . " " . $site[self::TITLE] .
                " " . $path_words . " " . $site[self::DESCRIPTION];
            if ($site[self::TITLE] != "") {
                $lang = guessLocaleFromString($site[self::TITLE], $lang);
            } else {
                $lang = guessLocaleFromString(
                    substr($site[self::DESCRIPTION], 0,
                    AD_HOC_TITLE_LENGTH), $lang);
            }
            $word_lists = PhraseParser::extractPhrasesInLists(
                $phrase_string, $lang);
            $len = strlen($phrase_string);
            if (PhraseParser::computeSafeSearchScore($word_lists, $len) <
                0.012) {
                $meta_ids[] = "safe:true";
            } else {
                $meta_ids[] = "safe:false";
            }
        }
        if (!isset($word_lists)) {
            $word_lists = array();
        }
        $data["EXTRACTED_WORDS"] = wordwrap($parent->clean(
            print_r($word_lists, true), "string"), 75, "\n", true);
        $data["EXTRACTED_META_WORDS"] = wordwrap($parent->clean(
            print_r($meta_ids, true), "string"), 75, "\n", true);
    }
    return $data;
}
/**
 * Splits an http response document into the http headers sent
 * and the web page returned. Parses out useful information from
 * the header and returns an array of these two parts and the useful info.
 *
 * Header blocks that contain a Location: or Refresh: line are treated as
 * redirects: the target is appended to the LOCATION list and the next
 * header block is examined, so the HEADER field ends up containing all
 * redirect headers followed by the final one.
 *
 * @param string& $header_and_page reference to string of downloaded data
 *     (it is only read here)
 * @param string $value field to store the page portion of page
 * @return array info array consisting of a header, page for an http
 *     response, as well as parsed from the header the server, server
 *     version, operating system, encoding, and date information.
 */
static function parseHeaderPage(&$header_and_page,
    $value = CrawlConstants::PAGE)
{
    // -1 marks a validator that was not found in the headers
    $cache_page_validators = array();
    $cache_page_validators['etag'] = -1;
    $cache_page_validators['expires'] = -1;
    $new_offset = 0;
    // header will include all redirect headers
    $site = array();
    $site[CrawlConstants::LOCATION] = array();
    do {
        $continue = false;
        $CRLFCRLF = strpos($header_and_page, "\r\n\r\n", $new_offset);
        $LFLF = strpos($header_and_page, "\n\n", $new_offset);
        //either two CRLF (what spec says) or two LF's to be safe
        $old_offset = $new_offset;
        /* A match at position 0 is treated like "not found". When both
           separators occur, the header ends at whichever comes first;
           otherwise a CRLFCRLF inside the body of an LF-only response
           would drag part of the body into the header. */
        $crlf_found = $CRLFCRLF > 0;
        $lf_found = $LFLF > 0;
        if ($crlf_found && (!$lf_found || $CRLFCRLF < $LFLF)) {
            $header_offset = $CRLFCRLF;
            $new_offset = $header_offset + 4;
        } else {
            $header_offset = $lf_found ? $LFLF : 0;
            $new_offset = $header_offset + 2;
        }
        $redirect_pos = stripos($header_and_page, 'Location:',
            $old_offset);
        $redirect_str = "Location:";
        /* Fall back to Refresh: also when the only Location: match lies
           beyond the current header block (e.g. in the page body), so
           such a match cannot hide a Refresh header. */
        if ($redirect_pos === false || $redirect_pos >= $new_offset) {
            $redirect_pos = stripos($header_and_page, 'Refresh:',
                $old_offset);
            $redirect_str = "Refresh:";
        }
        /* A match glued to preceding non-blank text is the tail of some
           other header name (e.g. X-XRDS-Location) and is ignored. The
           explicit checks avoid indexing the string at offset -1. */
        $part_of_other_header = $redirect_pos !== false &&
            $redirect_pos > 0 &&
            ord($header_and_page[$redirect_pos - 1]) > 32;
        if ($redirect_pos !== false && !$part_of_other_header &&
            $redirect_pos < $new_offset) {
            $redirect_pos += strlen($redirect_str);
            $line_end = strpos($header_and_page, "\n", $redirect_pos);
            if ($line_end === false) {
                // no newline after the header: value runs to the end
                $line_end = strlen($header_and_page);
            }
            $loc = trim(substr($header_and_page, $redirect_pos,
                $line_end - $redirect_pos));
            if (strlen($loc) > 0) {
                $site[CrawlConstants::LOCATION][] = $loc;
            }
            $continue = true;
        }
    } while ($continue);
    if ($header_offset > 0) {
        $site[CrawlConstants::HEADER] =
            substr($header_and_page, 0, $header_offset);
        $site[$value] = ltrim(substr($header_and_page, $header_offset));
    } else {
        //header message no body; maybe 301?
        $site[CrawlConstants::HEADER] = $header_and_page;
        $site[$value] = " ";
    }
    $lines = explode("\n", $site[CrawlConstants::HEADER]);
    $first_line = array_shift($lines);
    // status line: second whitespace separated token is the status code
    $response = preg_split("/(\\s+)/", $first_line);
    $site[CrawlConstants::HTTP_CODE] = isset($response[1]) ?
        trim($response[1]) : "";
    $site[CrawlConstants::ROBOT_METAS] = array();
    // does not depend on the line, so it is built once
    $canonical_regex = "/Link\\:\\s*\\<\\s*(http.*)\\s*\\>\\s*\\;\\s*" .
        "rel\\s*\\=\\s*(\"|')?canonical(\"|')?/";
    foreach ($lines as $line) {
        $line = trim($line);
        if (stristr($line, 'Server:')) {
            // expected shape: Server: name/version (operating system)
            $server_parts = preg_split("/Server\\:/i", $line);
            $server_name_parts = explode("/", $server_parts[1]);
            $site[CrawlConstants::SERVER] = trim($server_name_parts[0]);
            if (isset($server_name_parts[1])) {
                $version_parts = explode("(", $server_name_parts[1]);
                $site[CrawlConstants::SERVER_VERSION] =
                    trim($version_parts[0]);
                if (isset($version_parts[1])) {
                    $os_parts = explode(")", $version_parts[1]);
                    $site[CrawlConstants::OPERATING_SYSTEM] =
                        trim($os_parts[0]);
                }
            }
        }
        if (stristr($line, 'Content-type:')) {
            // text between the first ':' and the next ':' or ';'
            $type_parts = preg_split("/:|;/i", $line);
            $site[CrawlConstants::TYPE] = trim($type_parts[1]);
        }
        if (stristr($line, 'charset=')) {
            $line_parts = preg_split("/charset\\=/i", $line);
            $site[CrawlConstants::ENCODING] =
                strtoupper(trim($line_parts[1]));
        }
        if (stristr($line, 'Last-Modified:')) {
            $line_parts = preg_split("/Last\\-Modified\\:/i", $line);
            $site[CrawlConstants::MODIFIED] =
                strtotime(trim($line_parts[1]));
        }
        if (stristr($line, 'X-Robots-Tag:')) {
            // robot directives pdfs etc
            $line_parts = preg_split("/X\\-Robots\\-Tag\\:/i", $line);
            $robot_metas = explode(",", $line_parts[1]);
            foreach ($robot_metas as $robot_meta) {
                $site[CrawlConstants::ROBOT_METAS][] =
                    strtoupper(trim($robot_meta));
            }
        }
        // levenshtein gives notices on strings longer than 255
        if (preg_match($canonical_regex, $line, $matches) &&
            isset($site[CrawlConstants::URL]) &&
            strlen($matches[1]) < 252 &&
            (strlen($site[CrawlConstants::URL]) >= 255 ||
            levenshtein($matches[1], $site[CrawlConstants::URL]) > 3)) {
            // for rel canonical headers
            $site[CrawlConstants::LOCATION][] = $matches[1];
            $site[CrawlConstants::ROBOT_METAS][] = 'NOFOLLOW';
        }
        if (USE_ETAG_EXPIRES && stristr($line, 'ETag:')) {
            $line_parts = preg_split("/ETag\\:/i", $line);
            if (isset($line_parts[1])) {
                /* trim rather than split on a single space so that
                   "ETag:value" and "ETag:  value" are handled too */
                $etag = trim($line_parts[1]);
                if ($etag != "") {
                    $cache_page_validators['etag'] = $etag;
                }
            }
        }
        if (USE_ETAG_EXPIRES && stristr($line, 'Expires:')) {
            $line_parts = preg_split("/Expires\\:/i", $line);
            /* A date looks like "Thu, 01 Dec 1994 16:00:00 GMT", so after
               splitting on commas the odd numbered parts hold the
               parseable portion of each date. */
            $date_parts = explode(",", $line_parts[1]);
            $num_date_parts = count($date_parts);
            $timestamps = array();
            /*Encountered some pages with more than one Expires date :O
              hence the loop over all odd parts */
            if ($num_date_parts >= 2) {
                for ($i = 1; $i < $num_date_parts; $i += 2) {
                    $timestamp = strtotime(trim($date_parts[$i]));
                    // strtotime gives false for unparseable dates
                    if ($timestamp !== false) {
                        $timestamps[] = $timestamp;
                    }
                }
            }
            if (count($timestamps) > 0) {
                $cache_page_validators['expires'] = min($timestamps);
            }
        }
    }
    // only report validators if at least one was actually found
    if (USE_ETAG_EXPIRES && !($cache_page_validators['etag'] == -1 &&
        $cache_page_validators['expires'] == -1)) {
        $site[CrawlConstants::CACHE_PAGE_VALIDATORS] =
            $cache_page_validators;
    }
    /*
       If the doc is HTML and it uses a http-equiv to set the encoding
       then we override what the server says (if anything). As we
       are going to convert to UTF-8 we remove the charset info
       from the meta tag so cached pages will display correctly and
       redirects without char encoding won't be given a different hash.
     */
    $encoding_info = guessEncodingHtml($site[$value], true);
    if (is_array($encoding_info)) {
        list($site[CrawlConstants::ENCODING], $start_charset, $len_c) =
            $encoding_info;
        $site[$value] = substr_replace($site[$value], "", $start_charset,
            $len_c);
    } else {
        $site[CrawlConstants::ENCODING] = $encoding_info;
    }
    if (!isset($site[CrawlConstants::SERVER])) {
        $site[CrawlConstants::SERVER] = "unknown";
    }
    return $site;
}