/**
 * Handles admin requests related to controlling the page options used
 * in a crawl.
 *
 * This activity lets a user choose the page range size, recrawl
 * frequency, maximum description length, summarizer, indexed file types,
 * indexing plugins, classifiers/rankers and page rules for a crawl. It
 * also handles the search-time weights stored in the profile and the
 * "test options" tab, which runs a pasted page through the selected page
 * processor and page rules and reports what would be extracted.
 *
 * The (possibly modified) seed info is written back with
 * $crawl_model->setSeedInfo() on every call.
 *
 * @return array $data fields used by the view to draw the page options
 *      form (and, on the test tab, the results of the test extraction)
 */
function pageOptions()
{
    global $INDEXED_FILE_TYPES;
    /* get processors for different file types
       (populating $INDEXED_FILE_TYPES) */
    $processor_files = glob(BASE_DIR . "/lib/processors/*_processor.php");
    if (is_array($processor_files)) { // glob() returns false on error
        foreach ($processor_files as $filename) {
            require_once $filename;
        }
    }
    $parent = $this->parent;
    $crawl_model = $parent->model("crawl");
    $profile_model = $parent->model("profile");
    $data = array();
    $data["ELEMENT"] = "pageoptions";
    $data['SCRIPT'] = "";
    $machine_urls = $parent->model("machine")->getQueueServerUrls();
    $num_machines = count($machine_urls);
    if ($num_machines < 1 || $num_machines == 1 &&
        UrlParser::isLocalhostUrl($machine_urls[0])) {
        $machine_urls = NULL;
    }
    $data['available_options'] = array(
        tl('crawl_component_use_below'),
        tl('crawl_component_use_defaults'));
    $crawls = $crawl_model->getCrawlList(false, true, $machine_urls);
    $data['options_default'] = tl('crawl_component_use_below');
    foreach ($crawls as $crawl) {
        if (strlen($crawl['DESCRIPTION']) > 0) {
            $data['available_options'][$crawl['CRAWL_TIME']] =
                $crawl['DESCRIPTION'];
        }
    }
    $seed_info = $crawl_model->getSeedInfo();
    $data['RECRAWL_FREQS'] = array(
        -1 => tl('crawl_component_recrawl_never'),
        1 => tl('crawl_component_recrawl_1day'),
        2 => tl('crawl_component_recrawl_2day'),
        3 => tl('crawl_component_recrawl_3day'),
        7 => tl('crawl_component_recrawl_7day'),
        14 => tl('crawl_component_recrawl_14day'));
    $data['SIZE_VALUES'] = array(10000 => 10000, 50000 => 50000,
        100000 => 100000, 500000 => 500000, 1000000 => 1000000,
        5000000 => 5000000, 10000000 => 10000000);
    $data['LEN_VALUES'] = array(2000 => 2000, 10000 => 10000,
        50000 => 50000, 100000 => 100000, 500000 => 500000,
        1000000 => 1000000, 5000000 => 5000000, 10000000 => 10000000);
    $data['available_summarizers'] = array(
        self::BASIC_SUMMARIZER => tl('crawl_component_basic'),
        self::CENTROID_SUMMARIZER => tl('crawl_component_centroid'));
    if (!isset($seed_info["indexed_file_types"]["extensions"])) {
        $seed_info["indexed_file_types"]["extensions"] =
            $INDEXED_FILE_TYPES;
    }
    $loaded = false;
    if (isset($_REQUEST['load_option']) && $_REQUEST['load_option'] > 0) {
        // copy the page options of the default seed or of an older crawl
        if ($_REQUEST['load_option'] == 1) {
            $seed_loaded = $crawl_model->getSeedInfo(true);
        } else {
            $timestamp = substr($parent->clean($_REQUEST['load_option'],
                "int"), 0, TIMESTAMP_LEN);
            $seed_loaded = $crawl_model->getCrawlSeedInfo($timestamp,
                $machine_urls);
        }
        $copy_options = array(
            "general" => array("page_recrawl_frequency",
                "page_range_request", "max_description_len",
                "cache_pages", 'summarizer_option'),
            "indexed_file_types" => array("extensions"),
            "indexing_plugins" => array("plugins", "plugins_data"));
        foreach ($copy_options as $main_option => $sub_options) {
            foreach ($sub_options as $sub_option) {
                if (isset($seed_loaded[$main_option][$sub_option])) {
                    $seed_info[$main_option][$sub_option] =
                        $seed_loaded[$main_option][$sub_option];
                }
            }
        }
        if (isset($seed_loaded['page_rules'])) {
            $seed_info['page_rules'] = $seed_loaded['page_rules'];
        }
        if (isset($seed_loaded['active_classifiers'])) {
            $seed_info['active_classifiers'] =
                $seed_loaded['active_classifiers'];
        } else {
            $seed_info['active_classifiers'] = array();
            $seed_info['active_classifiers']['label'] = array();
        }
        $loaded = true;
    } else {
        /* $seed_info already holds the current seed info fetched above.
           It is deliberately not re-fetched here: doing so would discard
           the indexed file type default that was just filled in and is
           read again in the file type loop below. */
        if (isset($_REQUEST["page_recrawl_frequency"]) &&
            in_array($_REQUEST["page_recrawl_frequency"],
            array_keys($data['RECRAWL_FREQS']))) {
            $seed_info["general"]["page_recrawl_frequency"] =
                $_REQUEST["page_recrawl_frequency"];
        }
        if (isset($_REQUEST["page_range_request"]) &&
            in_array($_REQUEST["page_range_request"],
            $data['SIZE_VALUES'])) {
            $seed_info["general"]["page_range_request"] =
                $_REQUEST["page_range_request"];
        }
        if (isset($_REQUEST['summarizer_option']) &&
            in_array($_REQUEST['summarizer_option'],
            array_keys($data['available_summarizers']))) {
            $seed_info['general']['summarizer_option'] =
                $_REQUEST['summarizer_option'];
        }
        if (isset($_REQUEST["max_description_len"]) &&
            in_array($_REQUEST["max_description_len"],
            $data['LEN_VALUES'])) {
            $seed_info["general"]["max_description_len"] =
                $_REQUEST["max_description_len"];
        }
        if (isset($_REQUEST["cache_pages"])) {
            $seed_info["general"]["cache_pages"] = true;
        } else {
            if (isset($_REQUEST['posted'])) {
                //form sent but check box unchecked
                $seed_info["general"]["cache_pages"] = false;
            }
        }
        if (isset($_REQUEST['page_rules'])) {
            $seed_info['page_rules']['rule'] =
                $parent->convertStringCleanArray(
                $_REQUEST['page_rules'], 'rule');
        }
    }
    // fill in defaults for any general option that is still missing
    if (!isset($seed_info["general"]["page_recrawl_frequency"])) {
        $seed_info["general"]["page_recrawl_frequency"] =
            PAGE_RECRAWL_FREQUENCY;
    }
    if (!isset($seed_info['general']['summarizer_option'])) {
        $seed_info['general']['summarizer_option'] =
            self::BASIC_SUMMARIZER;
    }
    $data['summarizer_option'] = $seed_info['general']['summarizer_option'];
    $data['PAGE_RECRAWL_FREQUENCY'] =
        $seed_info["general"]["page_recrawl_frequency"];
    if (!isset($seed_info["general"]["cache_pages"])) {
        $seed_info["general"]["cache_pages"] = false;
    }
    $data["CACHE_PAGES"] = $seed_info["general"]["cache_pages"];
    if (!isset($seed_info["general"]["page_range_request"])) {
        $seed_info["general"]["page_range_request"] = PAGE_RANGE_REQUEST;
    }
    $data['PAGE_SIZE'] = $seed_info["general"]["page_range_request"];
    if (!isset($seed_info["general"]["max_description_len"])) {
        $seed_info["general"]["max_description_len"] = MAX_DESCRIPTION_LEN;
    }
    $data['MAX_LEN'] = $seed_info["general"]["max_description_len"];

    /* Names of the plugins this installation knows about, in the
       capitalized form used as keys of $data['INDEXING_PLUGINS'] */
    $known_plugins = array();
    foreach ($parent->indexing_plugins as $plugin) {
        $known_plugins[] = ucfirst($plugin);
    }
    $data['INDEXING_PLUGINS'] = array();
    if (isset($_REQUEST["posted"]) && !$loaded) {
        /* Plugin names later get turned into class names, so only accept
           request values that name a known plugin. */
        $requested_plugins = array();
        if (isset($_REQUEST["INDEXING_PLUGINS"]) &&
            is_array($_REQUEST["INDEXING_PLUGINS"])) {
            foreach ($_REQUEST["INDEXING_PLUGINS"] as $requested) {
                if (is_string($requested) &&
                    in_array($requested, $known_plugins, true)) {
                    $requested_plugins[] = $requested;
                }
            }
        }
        $seed_info['indexing_plugins']['plugins'] = $requested_plugins;
    }
    $included_plugins =
        (isset($seed_info['indexing_plugins']['plugins']) &&
        is_array($seed_info['indexing_plugins']['plugins'])) ?
        $seed_info['indexing_plugins']['plugins'] : array();
    foreach ($known_plugins as $plugin_name) {
        $data['INDEXING_PLUGINS'][$plugin_name]['checked'] =
            in_array($plugin_name, $included_plugins) ?
            "checked='checked'" : "";
        /* to use method_exists we want that the require_once for the
           plugin class has occurred so we instantiate the object via
           the plugin method call which will also do the require if
           needed. */
        $plugin_object = $parent->plugin(lcfirst($plugin_name));
        $class_name = $plugin_name . "Plugin";
        if ($loaded && method_exists($class_name, 'setConfiguration') &&
            method_exists($class_name, 'loadDefaultConfiguration')) {
            if (isset($seed_info['indexing_plugins']['plugins_data'][
                $plugin_name])) {
                $plugin_object->setConfiguration($seed_info[
                    'indexing_plugins']['plugins_data'][$plugin_name]);
            } else {
                $plugin_object->loadDefaultConfiguration();
            }
            $plugin_object->saveConfiguration();
        }
        if (method_exists($class_name, 'configureHandler') &&
            method_exists($class_name, 'configureView')) {
            $data['INDEXING_PLUGINS'][$plugin_name]['configure'] = true;
            $plugin_object->configureHandler($data);
        } else {
            $data['INDEXING_PLUGINS'][$plugin_name]['configure'] = false;
        }
    }

    // search time options live in the profile rather than the seed info
    $profile = $profile_model->getProfile(WORK_DIRECTORY);
    if (!isset($_REQUEST['load_option'])) {
        $data = array_merge($data, $profile);
    } else {
        $parent->updateProfileFields($data, $profile,
            array('IP_LINK', 'CACHE_LINK', 'SIMILAR_LINK', 'IN_LINK',
            'SIGNIN_LINK', 'SUBSEARCH_LINK', 'WORD_SUGGEST'));
    }
    $weights = array('TITLE_WEIGHT' => 4, 'DESCRIPTION_WEIGHT' => 1,
        'LINK_WEIGHT' => 2, 'MIN_RESULTS_TO_GROUP' => 200,
        'SERVER_ALPHA' => 1.6);
    $change = false;
    foreach ($weights as $weight => $value) {
        if (isset($_REQUEST[$weight])) {
            $data[$weight] = $parent->clean($_REQUEST[$weight], 'float', 1);
            $profile[$weight] = $data[$weight];
            $change = true;
        } else {
            if (isset($profile[$weight]) && $profile[$weight] != "") {
                $data[$weight] = $profile[$weight];
            } else {
                $data[$weight] = $value;
                $profile[$weight] = $data[$weight];
                $change = true;
            }
        }
    }
    if ($change == true) {
        $profile_model->updateProfile(WORK_DIRECTORY, array(), $profile);
    }

    // which file types are to be indexed
    $data['INDEXED_FILE_TYPES'] = array();
    $filetypes = array();
    $seed_extensions =
        (isset($seed_info["indexed_file_types"]["extensions"]) &&
        is_array($seed_info["indexed_file_types"]["extensions"])) ?
        $seed_info["indexed_file_types"]["extensions"] : array();
    foreach ($INDEXED_FILE_TYPES as $filetype) {
        $ison = false;
        if (isset($_REQUEST["filetype"]) && !$loaded) {
            if (isset($_REQUEST["filetype"][$filetype])) {
                $filetypes[] = $filetype;
                $ison = true;
                $change = true;
            }
        } else {
            if (in_array($filetype, $seed_extensions)) {
                $filetypes[] = $filetype;
                $ison = true;
            }
        }
        $data['INDEXED_FILE_TYPES'][$filetype] = $ison ?
            "checked='checked'" : '';
    }
    $seed_info["indexed_file_types"]["extensions"] = $filetypes;

    // which classifiers are used to label and/or rank documents
    $data['CLASSIFIERS'] = array();
    $data['RANKERS'] = array();
    $active_classifiers = array();
    $active_rankers = array();
    foreach (Classifier::getClassifierList() as $classifier) {
        $label = $classifier->class_label;
        $ison = false;
        if (isset($_REQUEST['classifier']) && !$loaded) {
            if (isset($_REQUEST['classifier'][$label])) {
                $ison = true;
            }
        } else {
            if ($loaded || !isset($_REQUEST['posted']) &&
                isset($seed_info['active_classifiers']['label'])) {
                // guard mirrors the ranker case below
                if (isset($seed_info['active_classifiers']['label']) &&
                    in_array($label,
                    $seed_info['active_classifiers']['label'])) {
                    $ison = true;
                }
            }
        }
        if ($ison) {
            $data['CLASSIFIERS'][$label] = 'checked="checked"';
            $active_classifiers[] = $label;
        } else {
            $data['CLASSIFIERS'][$label] = '';
        }
        $ison = false;
        if (isset($_REQUEST['ranker']) && !$loaded) {
            if (isset($_REQUEST['ranker'][$label])) {
                $ison = true;
            }
        } else {
            if ($loaded || !isset($_REQUEST['posted']) &&
                isset($seed_info['active_rankers']['label'])) {
                if (isset($seed_info['active_rankers']['label']) &&
                    in_array($label,
                    $seed_info['active_rankers']['label'])) {
                    $ison = true;
                }
            }
        }
        if ($ison) {
            $data['RANKERS'][$label] = 'checked="checked"';
            $active_rankers[] = $label;
        } else {
            $data['RANKERS'][$label] = '';
        }
    }
    $parent->pagingLogic($data, 'CLASSIFIERS', 'CLASSIFIERS',
        DEFAULT_ADMIN_PAGING_NUM / 5, array(), "",
        array('name' => 'class_label'));
    $seed_info['active_classifiers']['label'] = $active_classifiers;
    $seed_info['active_rankers']['label'] = $active_rankers;

    /* page rules may be stored either directly under 'rule' or nested
       one level deeper under 'rule' => 'rule' */
    $page_rules = array();
    if (isset($seed_info['page_rules']['rule'])) {
        if (isset($seed_info['page_rules']['rule']['rule'])) {
            $page_rules = $seed_info['page_rules']['rule']['rule'];
        } else {
            $page_rules = $seed_info['page_rules']['rule'];
        }
        $data['page_rules'] = $parent->convertArrayLines($page_rules);
    } else {
        $data['page_rules'] = "";
    }

    // work out which tab of the page options form is active
    $allowed_options = array('crawl_time', 'search_time', 'test_options');
    if (isset($_REQUEST['option_type']) &&
        in_array($_REQUEST['option_type'], $allowed_options)) {
        $data['option_type'] = $_REQUEST['option_type'];
    } else {
        $data['option_type'] = 'crawl_time';
    }
    if ($data['option_type'] == 'crawl_time') {
        $data['crawl_time_active'] = "active";
        $data['search_time_active'] = "";
        $data['test_options_active'] = "";
        $data['SCRIPT'] .= "\nswitchTab('crawltimetab'," .
            "'searchtimetab', 'testoptionstab')\n";
    } else {
        if ($data['option_type'] == 'search_time') {
            $data['search_time_active'] = "active";
            $data['crawl_time_active'] = "";
            $data['test_options_active'] = "";
            $data['SCRIPT'] .= "\nswitchTab('searchtimetab'," .
                "'crawltimetab', 'testoptionstab')\n";
        } else {
            $data['search_time_active'] = "";
            $data['crawl_time_active'] = "";
            $data['test_options_active'] = "active";
            $data['SCRIPT'] .= "\nswitchTab('testoptionstab'," .
                "'crawltimetab', 'searchtimetab');\n";
        }
    }
    $crawl_model->setSeedInfo($seed_info);
    if ($change == true && $data['option_type'] != 'test_options') {
        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
            tl('crawl_component_page_options_updated') . "</h1>')";
    }

    // mime types the test tab can handle => processor class name prefix
    $test_processors = array(
        "text/html" => "html",
        "text/asp" => "html",
        "text/xml" => "xml",
        "text/robot" => "robot",
        "application/xml" => "xml",
        "application/xhtml+xml" => "html",
        "application/rss+xml" => "rss",
        "application/atom+xml" => "rss",
        "text/csv" => "text",
        "text/gopher" => "gopher",
        "text/plain" => "text",
        "text/rtf" => "rtf",
        "text/tab-separated-values" => "text");
    $data['MIME_TYPES'] = array_keys($test_processors);
    $data['page_type'] = "text/html";
    if (isset($_REQUEST['page_type']) &&
        in_array($_REQUEST['page_type'], $data['MIME_TYPES'])) {
        $data['page_type'] = $_REQUEST['page_type'];
    }
    $data['TESTPAGE'] = isset($_REQUEST['TESTPAGE']) ?
        $parent->clean($_REQUEST['TESTPAGE'], 'string') : "";
    if ($data['option_type'] == 'test_options' && $data['TESTPAGE'] != "") {
        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
            tl('crawl_component_page_options_running_tests') . "</h1>')";
        // fake the site info a fetcher would have for a downloaded page
        $site = array();
        $site[self::ENCODING] = "UTF-8";
        $site[self::URL] = "http://test-site.yioop.com/";
        $site[self::IP_ADDRESSES] = array("1.1.1.1");
        $site[self::HTTP_CODE] = 200;
        $site[self::MODIFIED] = date("U", time());
        $site[self::TIMESTAMP] = time();
        // $data['page_type'] was validated against MIME_TYPES above
        $site[self::TYPE] = $data['page_type'];
        $site[self::HEADER] = "page options test extractor";
        $site[self::SERVER] = "unknown";
        $site[self::SERVER_VERSION] = "unknown";
        $site[self::OPERATING_SYSTEM] = "unknown";
        $site[self::LANG] = 'en';
        $site[self::JUST_METAS] = false;
        if ($site[self::TYPE] == 'text/html') {
            $site[self::ENCODING] =
                guessEncodingHtml($_REQUEST['TESTPAGE']);
        }
        $prefix_name = $test_processors[$site[self::TYPE]];
        $processor_name = ucfirst($prefix_name) . "Processor";
        /* collect the selected plugins that apply to this processor or
           to one of its ancestors below PageProcessor */
        $plugin_processors = array();
        if (isset($seed_info['indexing_plugins']['plugins']) &&
            is_array($seed_info['indexing_plugins']['plugins'])) {
            foreach ($seed_info['indexing_plugins']['plugins'] as $plugin) {
                if (!in_array($plugin, $known_plugins, true)) {
                    /* unknown name (e.g. loaded from an old crawl): its
                       class was never required, so skip it */
                    continue;
                }
                $plugin_name = $plugin . "Plugin";
                $supported_processors = $plugin_name::getProcessors();
                foreach ($supported_processors as $supported_processor) {
                    $parent_processor = $processor_name;
                    do {
                        if ($supported_processor == $parent_processor) {
                            $plugin_object =
                                $parent->plugin(lcfirst($plugin));
                            if (method_exists($plugin_name,
                                "loadConfiguration")) {
                                $plugin_object->loadConfiguration();
                            }
                            $plugin_processors[] = $plugin_object;
                            break;
                        }
                    } while (($parent_processor =
                        get_parent_class($parent_processor)) &&
                        $parent_processor != "PageProcessor");
                }
            }
        }
        $page_processor = new $processor_name($plugin_processors,
            $seed_info["general"]["max_description_len"],
            $seed_info["general"]["summarizer_option"]);
        /* the processor is run with the default PHP error handler and
           the yioop handler is put back afterwards */
        restore_error_handler();
        $data["PAGE_RANGE_REQUEST"] =
            $seed_info["general"]["page_range_request"];
        $doc_info = $page_processor->handle(
            substr($_REQUEST['TESTPAGE'], 0, $data["PAGE_RANGE_REQUEST"]),
            $site[self::URL]);
        set_error_handler("yioop_error_handler");
        if (!$doc_info) {
            $data["AFTER_PAGE_PROCESS"] = "";
            $data["AFTER_RULE_PROCESS"] = "";
            $data["EXTRACTED_WORDS"] = "";
            $data["EXTRACTED_META_WORDS"] = "";
            return $data;
        }
        if ($processor_name != "RobotProcessor" &&
            !isset($doc_info[self::JUST_METAS])) {
            $doc_info[self::LINKS] =
                UrlParser::pruneLinks($doc_info[self::LINKS]);
        }
        foreach ($doc_info as $key => $value) {
            $site[$key] = $value;
        }
        if (isset($site[self::PAGE])) {
            unset($site[self::PAGE]);
        }
        if (isset($site[self::ROBOT_PATHS])) {
            $site[self::JUST_METAS] = true;
        }
        /* map CrawlConstants values back to their names so the dumps
           below show readable keys */
        $reflect = new ReflectionClass("CrawlConstants");
        $crawl_constants = $reflect->getConstants();
        $crawl_keys = array_keys($crawl_constants);
        $crawl_values = array_values($crawl_constants);
        $inverse_constants = array_combine($crawl_values, $crawl_keys);
        $after_process = array();
        foreach ($site as $key => $value) {
            $out_key = isset($inverse_constants[$key]) ?
                $inverse_constants[$key] : $key;
            $after_process[$out_key] = $value;
        }
        $data["AFTER_PAGE_PROCESS"] = wordwrap($parent->clean(
            print_r($after_process, true), "string"), 75, "\n", true);
        // $page_rules was resolved above (empty array if none are set)
        $rule_string = is_array($page_rules) ?
            implode("\n", $page_rules) : "";
        $rule_string = html_entity_decode($rule_string, ENT_QUOTES);
        $page_rule_parser = new PageRuleParser($rule_string);
        $page_rule_parser->executeRuleTrees($site);
        $after_process = array();
        foreach ($site as $key => $value) {
            $out_key = isset($inverse_constants[$key]) ?
                $inverse_constants[$key] : $key;
            $after_process[$out_key] = $value;
        }
        $data["AFTER_RULE_PROCESS"] = wordwrap($parent->clean(
            print_r($after_process, true), "string"), 75, "\n", true);
        $lang = NULL;
        if (isset($site[self::LANG])) {
            $lang = $site[self::LANG];
        }
        $meta_ids = PhraseParser::calculateMetas($site);
        $word_lists = array();
        if (!$site[self::JUST_METAS]) {
            // a processor or page rule may leave these fields unset
            $title = isset($site[self::TITLE]) ? $site[self::TITLE] : "";
            $description = isset($site[self::DESCRIPTION]) ?
                $site[self::DESCRIPTION] : "";
            $host_words = UrlParser::getWordsIfHostUrl($site[self::URL]);
            $path_words =
                UrlParser::getWordsLastPathPartUrl($site[self::URL]);
            $phrase_string = $host_words . " " . $title . " " .
                $path_words . " " . $description;
            if ($title != "") {
                $lang = guessLocaleFromString($title, $lang);
            } else {
                $lang = guessLocaleFromString(
                    substr($description, 0, AD_HOC_TITLE_LENGTH), $lang);
            }
            $word_lists = PhraseParser::extractPhrasesInLists(
                $phrase_string, $lang);
            $len = strlen($phrase_string);
            if (PhraseParser::computeSafeSearchScore($word_lists, $len) <
                0.012) {
                $meta_ids[] = "safe:true";
            } else {
                $meta_ids[] = "safe:false";
            }
        }
        $data["EXTRACTED_WORDS"] = wordwrap($parent->clean(
            print_r($word_lists, true), "string"), 75, "\n", true);
        $data["EXTRACTED_META_WORDS"] = wordwrap($parent->clean(
            print_r($meta_ids, true), "string"), 75, "\n", true);
    }
    return $data;
}
/**
 * Used to check if a line contains a word associated with a province,
 * state or major city.
 *
 * The line is split on single spaces, unless its guessed locale is
 * Chinese, Japanese or Korean, in which case it is split into overlapping
 * character pairs (one starting at every character position). Each piece
 * is upper-cased and looked up in $this->regions.
 *
 * @param string $line line from an address to check
 * @return bool whether some piece of the line is a known region term
 */
function checkRegion($line)
{
    $char_gram_locales = array("zh-CN", "ja", "ko");
    if (in_array(guessLocaleFromString($line), $char_gram_locales)) {
        //Chinese, Japanese or Korean so chargram size 2.
        $tokens = array();
        $num_chars = mb_strlen($line);
        $position = 0;
        while ($position < $num_chars) {
            $tokens[] = mb_substr($line, $position, 2);
            $position++;
        }
    } else {
        $tokens = explode(" ", $line);
    }
    foreach ($tokens as $token) {
        if (in_array(mb_strtoupper($token), $this->regions)) {
            return true;
        }
    }
    return false;
}
/**
 * Builds an inverted index shard (word --> {docs it appears in})
 * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages.
 * This inverted index shard is then merged by a queue_server
 * into the inverted index of the current generation of the crawl.
 * The complete inverted index for the whole crawl is built out of these
 * inverted indexes for generations. The point of computing a partial
 * inverted index on the fetcher is to reduce some of the computational
 * burden on the queue server. The resulting mini index computed by
 * buildMiniInvertedIndex() is stored in
 * $this->found_sites[self::INVERTED_INDEX]
 *
 * Besides the documents themselves, each outgoing link of a document is
 * added as its own "link" item to
 * $this->found_sites[self::LINK_SEEN_URLS] and to the index shard of the
 * partition its host is assigned to by calculatePartition().
 */
function buildMiniInvertedIndex()
{
    $start_time = microtime();
    /* Padding appended when the packed user rank scores are not a
       multiple of 8 bytes long. NOTE(review): four NUL bytes assumes
       packInt() emits 4 bytes per score -- confirm against packInt(). */
    $keypad = "\x00\x00\x00\x00";
    crawlLog(" Start building mini inverted index ... Current Memory:" .
        memory_get_usage());
    $num_seen = isset($this->found_sites[self::SEEN_URLS]) ?
        count($this->found_sites[self::SEEN_URLS]) : 0;
    $this->num_seen_sites += $num_seen;
    /* for the fetcher we are not saving the index shards so
       name doesn't matter. */
    if (!isset($this->found_sites[self::INVERTED_INDEX][
        $this->current_server])) {
        $this->found_sites[self::INVERTED_INDEX][$this->current_server] =
            new IndexShard("fetcher_shard_{$this->current_server}");
    }
    // does not change while indexing, so compute it once
    $num_queue_servers = count($this->queue_servers);
    for ($i = 0; $i < $num_seen; $i++) {
        $interim_time = microtime();
        $site = $this->found_sites[self::SEEN_URLS][$i];
        if (!isset($site[self::HASH]) ||
            isset($site[self::ROBOT_METAS]) &&
            in_array("JUSTFOLLOW", $site[self::ROBOT_METAS])) {
            continue;
        }
        $doc_rank = false;
        if ($this->crawl_type == self::ARCHIVE_CRAWL &&
            isset($this->archive_iterator)) {
            $doc_rank = $this->archive_iterator->weight($site);
        }
        if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
            /* link items carry their index keys in HTTP_CODE and the
               url linked to in TITLE (see how $summary is built below) */
            $is_link = true;
            $doc_keys = $site[self::HTTP_CODE];
            $site_url = $site[self::TITLE];
            $host = UrlParser::getHost($site_url);
            $link_parts = explode('|', $site[self::HASH]);
            if (isset($link_parts[5])) {
                $link_origin = $link_parts[5];
            } else {
                $link_origin = $site_url;
            }
            $meta_ids = PhraseParser::calculateLinkMetas($site_url,
                $host, $site[self::DESCRIPTION], $link_origin);
        } else {
            $is_link = false;
            $site_url = str_replace('|', "%7C", $site[self::URL]);
            $host = UrlParser::getHost($site_url);
            $doc_keys = crawlHash($site_url, true) .
                $site[self::HASH] . "d" .
                substr(crawlHash($host . "/", true), 1);
            $meta_ids = PhraseParser::calculateMetas($site,
                $this->video_sources);
        }
        $word_lists = array();
        /*
            self::JUST_METAS check to avoid getting sitemaps in results
            for popular words
         */
        $lang = NULL;
        if (!isset($site[self::JUST_METAS])) {
            $host_words = UrlParser::getWordsIfHostUrl($site_url);
            $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
            if ($is_link) {
                $phrase_string = $site[self::DESCRIPTION];
            } else {
                if (isset($site[self::LANG])) {
                    if (isset($this->programming_language_extension[
                        $site[self::LANG]])) {
                        $phrase_string = $site[self::DESCRIPTION];
                    } else {
                        $phrase_string = $host_words . " " .
                            $site[self::TITLE] . " " . $path_words . " " .
                            $site[self::DESCRIPTION];
                    }
                } else {
                    $phrase_string = $host_words . " " .
                        $site[self::TITLE] . " " . $path_words . " " .
                        $site[self::DESCRIPTION];
                }
            }
            if (isset($site[self::LANG])) {
                $lang = guessLocaleFromString(
                    mb_substr($site[self::DESCRIPTION], 0,
                    AD_HOC_TITLE_LENGTH), $site[self::LANG]);
            }
            $word_lists = PhraseParser::extractPhrasesInLists(
                $phrase_string, $lang);
            $len = strlen($phrase_string);
            if (isset($this->programming_language_extension[$lang]) ||
                PhraseParser::computeSafeSearchScore($word_lists, $len) <
                0.012) {
                $meta_ids[] = "safe:true";
            } else {
                $meta_ids[] = "safe:false";
            }
        }
        /* Links out of a ranked document get a rank one less than the
           document's (but at least 1). Always defined, so the link loop
           below never reads an unset variable. */
        $link_rank = false;
        $has_links = isset($site[self::LINKS]) &&
            is_array($site[self::LINKS]);
        if (!$is_link && $has_links && count($site[self::LINKS]) > 0 &&
            $doc_rank !== false) {
            $link_rank = max($doc_rank - 1, 1);
        }
        if (isset($site[self::USER_RANKS]) &&
            count($site[self::USER_RANKS]) > 0) {
            $score_keys = "";
            foreach ($site[self::USER_RANKS] as $label => $score) {
                $score_keys .= packInt($score);
            }
            if (strlen($score_keys) % 8 != 0) {
                $score_keys .= $keypad;
            }
            $doc_keys .= $score_keys;
        }
        $this->found_sites[self::INVERTED_INDEX][$this->current_server
            ]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG,
            $word_lists, $meta_ids, PhraseParser::$materialized_metas,
            true, $doc_rank);
        /*
            $this->no_process_links is set when doing things like
            mix recrawls. In this case links likely already will appear
            in what indexing, so don't index again. $site[self::JUST_META]
            is set when have a sitemap or robots.txt (this case set later).
            In this case link info is not particularly useful for indexing
            and can greatly slow building inverted index.
         */
        if ($has_links && !$this->no_process_links &&
            !isset($site[self::JUST_METAS]) &&
            !isset($this->programming_language_extension[$lang])) {
            foreach ($site[self::LINKS] as $url => $link_text) {
                /* this mysterious check means won't index links from
                   robots.txt. Sitemap will still be in TO_CRAWL, but
                   that's done elsewhere
                 */
                if (strlen($url) == 0 || is_numeric($url)) {
                    continue;
                }
                $link_host = UrlParser::getHost($url);
                if (strlen($link_host) == 0) {
                    continue;
                }
                $part_num = calculatePartition($link_host,
                    $num_queue_servers);
                $summary = array();
                if (!isset($this->found_sites[self::LINK_SEEN_URLS][
                    $part_num])) {
                    $this->found_sites[self::LINK_SEEN_URLS][$part_num] =
                        array();
                }
                // external link if it leaves the document's host
                $elink_flag = ($link_host != $host) ? true : false;
                // stripping html to be on the safe side
                $link_text = strip_tags($link_text);
                $ref = ($elink_flag) ? "eref" : "iref";
                $url = str_replace('|', "%7C", $url);
                $link_id = "url|" . $url . "|text|" .
                    urlencode($link_text) . "|{$ref}|" . $site_url;
                $elink_flag_string = ($elink_flag) ? "e" : "i";
                $link_keys = crawlHash($url, true) .
                    crawlHash($link_id, true) .
                    $elink_flag_string .
                    substr(crawlHash($host . "/", true), 1);
                $summary[self::URL] = $link_id;
                $summary[self::TITLE] = $url;
                $summary[self::DESCRIPTION] = $link_text;
                $summary[self::TIMESTAMP] = $site[self::TIMESTAMP];
                $summary[self::ENCODING] = $site[self::ENCODING];
                $summary[self::HASH] = $link_id;
                $summary[self::TYPE] = "link";
                $summary[self::HTTP_CODE] = $link_keys;
                $summary[self::LANG] = $lang;
                $this->found_sites[self::LINK_SEEN_URLS][$part_num][] =
                    $summary;
                $link_word_lists = PhraseParser::extractPhrasesInLists(
                    $link_text, $lang);
                $link_meta_ids = PhraseParser::calculateLinkMetas($url,
                    $link_host, $link_text, $site_url);
                if (!isset($this->found_sites[self::INVERTED_INDEX][
                    $part_num])) {
                    $this->found_sites[self::INVERTED_INDEX][$part_num] =
                        new IndexShard("fetcher_shard_{$part_num}");
                }
                $this->found_sites[self::INVERTED_INDEX][
                    $part_num]->addDocumentWords($link_keys,
                    self::NEEDS_OFFSET_FLAG, $link_word_lists,
                    $link_meta_ids, PhraseParser::$materialized_metas,
                    false, $link_rank);
            }
        }
        $interim_elapse = changeInMicrotime($interim_time);
        if ($interim_elapse > 5) {
            crawlLog("..Inverting " . $site[self::URL] .
                "...took > 5s.");
        }
        crawlTimeoutLog("..Still building inverted index. Have processed " .
            "%s of %s documents.\nLast url processed was %s.",
            $i, $num_seen, $site[self::URL]);
    }
    if ($this->crawl_type == self::ARCHIVE_CRAWL) {
        $this->recrawl_check_scheduler = true;
    }
    crawlLog(" Build mini inverted index time " .
        changeInMicrotime($start_time));
}
/**
 * Gets doc summaries of documents containing given words and meeting the
 * additional provided criteria
 *
 * @param array $word_structs an array of word_structs. Here a word_struct
 *      is an associative array with at least the following fields
 *      KEYS -- an array of word keys
 *      QUOTE_POSITIONS -- an array of positions of words that appeared in
 *          quotes (so need to be matched exactly)
 *      DISALLOW_PHRASES -- an array of words the document must not contain
 *      WEIGHT -- a weight to multiply scores returned from this iterator by
 *      INDEX_NAME -- an index timestamp to get results from
 * @param int $limit number of first document in order to return
 * @param int $num number of documents to return summaries of
 * @param array& $filter an array of hashes of domains to filter from
 *      results. Its 'time' field (when the filter last changed) is read
 *      and then removed from the passed array, as the iterators don't
 *      expect it
 * @param bool $use_cache_if_allowed if true and USE_CACHE is true then
 *      an attempt will be made to look up the results in either
 *      the file cache or memcache. Otherwise, items will be recomputed
 *      and then potentially restored in cache
 * @param int $raw ($raw == 0) normal grouping, ($raw > 0)
 *      no grouping done on data. if ($raw == 1) no lookups of summaries
 *      done
 * @param array $queue_servers a list of urls of yioop machines which might
 *      be used during lookup
 * @param string $original_query if set, the original query that corresponds
 *      to $word_structs
 * @param string $save_timestamp_name if this timestamp is not empty, then
 *      save iterate position, so can resume on future queries that make
 *      use of the timestamp. If used then $limit ignored and get next $num
 *      docs after $save_timestamp 's previous iterate position.
 * @param bool $limit_news if true the number of media:news items to
 *      allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT
 *
 * @return array document summaries. The array has a TOTAL_ROWS field, a
 *      PAGES field and, depending on the path taken, TIME, SAVE_POINT
 *      and HARD_QUERY fields
 */
function getSummariesByHash($word_structs, $limit, $num, &$filter,
    $use_cache_if_allowed = true, $raw = 0, $queue_servers = array(),
    $original_query = "", $save_timestamp_name = "", $limit_news = true)
{
    global $CACHE;
    $indent = " ";
    $in2 = $indent . $indent;
    $in3 = $in2 . $indent;
    $in4 = $in2 . $in2;
    if (QUERY_STATISTICS) {
        $lookup_time = microtime();
    }
    $use_proximity = false;
    $time = time();
    // proximity only makes sense when more than one term is involved
    if (count($word_structs) > 1 || (isset($word_structs[0]["KEYS"])
        && count($word_structs[0]["KEYS"]) > 1) ||
        ($word_structs == array() &&
        substr_count($original_query, " ") > 1)) {
        $use_proximity = true;
    }
    if (!isset($filter['time'])) {
        $filter['time'] = 0;
    }
    $filter_time = $filter['time'];
    unset($filter['time']); //iterators don't expect time field
    /* retrieve whole multiples of NUM_CACHE_PAGES so that one cached
       computation can serve neighbouring result pages */
    $to_retrieve = ceil(($limit + $num) / self::NUM_CACHE_PAGES) *
        self::NUM_CACHE_PAGES;
    $start_slice = floor($limit / self::NUM_CACHE_PAGES) *
        self::NUM_CACHE_PAGES;
    if ($save_timestamp_name != "") {
        $to_retrieve = $num;
        $limit = 0;
        $start_slice = 0;
    }
    if (USE_CACHE && $save_timestamp_name == "") {
        $mem_tmp = serialize($raw) . serialize($word_structs) .
            $original_query . $this->index_name;
        $summary_hash = crawlHash($mem_tmp . ":" . $limit . ":" . $num);
        if ($use_cache_if_allowed) {
            $results = $CACHE->get($summary_hash);
            if (!isset($results['TIME']) ||
                $filter_time > $results['TIME']) {
                //if filter has changed since cached, then invalidate cache
                $results = false;
            }
            if (isset($results['TIME'])) {
                $cached_time = $time - $results['TIME'];
            } else {
                $cached_time = $time;
            }
            if ($cached_time > MAX_QUERY_CACHE_TIME) {
                $results = false;
            }
            if (isset($results['PAGES'])) {
                /* a crawl counts as closed if its index closed marker
                   file exists in the schedules folder */
                $close_prefix = WORK_DIRECTORY . "/schedules/" .
                    self::index_closed_name;
                $has_changeable_results = false;
                $seen_times = array();
                foreach ($results['PAGES'] as $page) {
                    if (!isset($page[self::CRAWL_TIME]) ||
                        in_array($page[self::CRAWL_TIME], $seen_times)) {
                        continue;
                    }
                    $seen_times[] = $page[self::CRAWL_TIME];
                    $current_closed = $close_prefix .
                        $page[self::CRAWL_TIME] . ".txt";
                    if (!file_exists($current_closed)) {
                        //either feed result or from active crawl
                        $has_changeable_results = true;
                        break;
                    }
                }
                if ($has_changeable_results) {
                    if ($cached_time > MIN_QUERY_CACHE_TIME) {
                        $results = false;
                    }
                }
            }
            if (QUERY_STATISTICS) {
                $this->query_info['QUERY'] .=
                    "{$in2}<b>Cache Lookup Time</b>: " .
                    changeInMicrotime($lookup_time) . "<br />";
            }
            if ($results !== false) {
                return $results;
            }
        }
    }
    /* No usable cached answer: start from a real array rather than
       writing fields into false or an unset variable. */
    $results = array();
    $old_to_retrieve = $to_retrieve;
    $query_iterator = $this->getQueryIterator($word_structs, $filter,
        $raw, $to_retrieve, $queue_servers, $original_query,
        $save_timestamp_name, $limit_news);
    $has_iterator = is_object($query_iterator);
    $num_retrieved = 0;
    $pages = array();
    if ($has_iterator) {
        while ($num_retrieved < $to_retrieve &&
            is_array($next_docs = $query_iterator->nextDocsWithWord())) {
            $pages += $next_docs;
            $num_retrieved = count($pages);
        }
    }
    if ($save_timestamp_name != "" && ($queue_servers == array() ||
        $this->isSingleLocalhost($queue_servers))) {
        // used for archive crawls of crawl mixes
        $save_file = CRAWL_DIR . '/schedules/' . self::save_point .
            $save_timestamp_name . ".txt";
        // without an iterator there are no positions to record
        $iterators = ($has_iterator) ?
            $query_iterator->save_iterators : array();
        $cnt_iterators = count($iterators);
        $save_point = array();
        for ($i = 0; $i < $cnt_iterators; $i++) {
            $save_point[$i] =
                $iterators[$i]->currentGenDocOffsetWithWord();
        }
        $results["SAVE_POINT"] = $save_point;
        file_put_contents($save_file, serialize($save_point));
        $this->db->setWorldPermissionsRecursive($save_file);
    }
    $pages = array_values($pages);
    $result_count = count($pages);
    $sort_time = 0;
    if ($raw == 0) {
        // initialize scores
        $sort_start = microtime();
        $max_user_ranks = 0;
        for ($i = 0; $i < $result_count; $i++) {
            $pages[$i]["OUT_SCORE"] = 0;
            if (isset($pages[$i][self::USER_RANKS])) {
                $j = count($pages[$i][self::USER_RANKS]);
                if ($max_user_ranks < $j) {
                    $max_user_ranks = $j;
                }
            }
        }
        if ($max_user_ranks > 0) {
            // give every page the same set of USCORE fields
            for ($i = 0; $i < $result_count; $i++) {
                for ($j = 0; $j < $max_user_ranks; $j++) {
                    if (isset($pages[$i][self::USER_RANKS][$j])) {
                        $pages[$i]["USCORE{$j}"] =
                            $pages[$i][self::USER_RANKS][$j];
                    } else {
                        $pages[$i]["USCORE{$j}"] = 0;
                    }
                }
            }
        }
        $subscore_fields = array(self::DOC_RANK, self::RELEVANCE);
        if ($use_proximity) {
            $subscore_fields[] = self::PROXIMITY;
        }
        if ($max_user_ranks > 0) {
            for ($j = 0; $j < $max_user_ranks; $j++) {
                $subscore_fields[] = "USCORE{$j}";
            }
        }
        $num_fields = count($subscore_fields);
        // Compute Reciprocal Rank Fusion Score
        $alpha = 600 / $num_fields;
        if (isset($pages[0])) {
            foreach ($subscore_fields as $field) {
                /* the three argument call selects the field that the
                   following usort() orders by */
                orderCallback($pages[0], $pages[0], $field);
                usort($pages, "orderCallback");
                $score = 0;
                for ($i = 0; $i < $result_count; $i++) {
                    // pages tied on a field share the same rank
                    if ($i > 0) {
                        if ($pages[$i - 1][$field] != $pages[$i][$field]) {
                            $score++;
                        }
                    }
                    $pages[$i]["OUT_SCORE"] += $alpha / (59 + $score);
                }
            }
            orderCallback($pages[0], $pages[0], "OUT_SCORE");
        }
        usort($pages, "orderCallback");
        if ($use_proximity) {
            for ($i = 0; $i < $result_count; $i++) {
                $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
            }
        } else {
            for ($i = 0; $i < $result_count; $i++) {
                $pages[$i][self::PROXIMITY] = 1;
                $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
            }
        }
        $sort_time = changeInMicrotime($sort_start);
    }
    if ($num_retrieved < $to_retrieve || !$has_iterator) {
        $results['TOTAL_ROWS'] = $num_retrieved;
    } else {
        $results['TOTAL_ROWS'] = $query_iterator->num_docs;
        //this is only an approximation
    }
    if ($raw == 1 && $save_timestamp_name == "") {
        // no summary lookups wanted, so just return the requested slice
        $pages = array_slice($pages, $start_slice);
        $pages = array_slice($pages, $limit - $start_slice, $num);
        $results['PAGES'] = & $pages;
        if ($old_to_retrieve != $to_retrieve) {
            $results['HARD_QUERY'] = $old_to_retrieve;
        }
        return $results;
    }
    if (QUERY_STATISTICS) {
        $this->query_info['QUERY'] .=
            "{$in2}<b>Lookup Offsets Time</b>: " .
            changeInMicrotime($lookup_time) . "<br />";
        $machine_times = AnalyticsManager::get("MACHINE_TIMES");
        if ($machine_times) {
            $this->query_info['QUERY'] .=
                "{$in3}<i>Machine Sub-Times</i>:<br />" .
                $machine_times . "<br />";
        }
        $net_times = AnalyticsManager::get("NET_TIMES");
        $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
        if ($net_times && $max_machine_times) {
            $this->query_info['QUERY'] .=
                "{$in3}<i>Network Overhead Sub-Time</i>: " .
                ($net_times - $max_machine_times) . "<br />";
        }
        if ($sort_time) {
            $this->query_info['QUERY'] .=
                "{$in3}<i>Merge-Rank Sub-Time</i>: " .
                $sort_time . "<br />";
        }
        $summaries_time = microtime();
    }
    $get_pages = array_slice($pages, $limit, $num);
    $to_get_count = count($get_pages);
    $groups_with_docs = false;
    if (preg_match("/\\bsite:doc\\b/", $original_query)) {
        $groups_with_docs = true;
    }
    $out_pages = array();
    $cur_limit = $limit;
    /* keep fetching further slices until enough summaries have been
       obtained or the retrieved pages run out */
    while (count($out_pages) < $to_get_count && $get_pages) {
        $out_pages = array_merge($out_pages,
            $this->getSummariesFromOffsets($get_pages, $queue_servers,
            $raw, $groups_with_docs));
        if ($save_timestamp_name != "") {
            break;
        }
        $cur_limit += $num;
        $get_pages = array_slice($pages, $cur_limit, $num);
    }
    $out_pages = array_slice($out_pages, 0, $num);
    if (QUERY_STATISTICS) {
        $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
        if ($summary_times_string) {
            $round_summary_times = unserialize($summary_times_string);
            $summary_delta_time = changeInMicrotime($summaries_time);
            $summary_time_info = "{$summary_delta_time}<br /> {$in4}";
            $sum_max_time = 0;
            foreach ($round_summary_times as $summary_times) {
                $i = 0;
                $max_time = 0;
                foreach ($summary_times as $summary_time) {
                    $summary_time_info .= "ID_{$i}: " . $summary_time .
                        "{$indent}";
                    $max_time = ($summary_time > $max_time) ?
                        $summary_time : $max_time;
                    $i++;
                }
                $sum_max_time += $max_time;
            }
            $net_overhead = $summary_delta_time - $sum_max_time;
            $summary_time_info .=
                "<br />{$in3}<i>Network Overhead Sub-Time</i>: " .
                $net_overhead;
        } else {
            $summary_time_info = changeInMicrotime($summaries_time);
        }
        $this->query_info['QUERY'] .=
            "{$in2}<b>Get Summaries Time</b>: " .
            $summary_time_info . "<br />";
    }
    $results['PAGES'] = & $out_pages;
    $results['TIME'] = time();
    $lang = guessLocaleFromString($original_query);
    $tokenizer = PhraseParser::getTokenizer($lang);
    //only use tokenizer if no meta word or disjuncts in query
    if (!preg_match('/(\\||\\:)/u', $original_query) && $tokenizer &&
        method_exists($tokenizer, "scoredThesaurusMatches") &&
        method_exists($tokenizer, "tagPartsOfSpeechPhrase") &&
        isset($tokenizer->use_thesaurus)) {
        $results = $this->sortByThesaurusScore($results,
            $original_query, $lang);
    }
    if (USE_CACHE && $save_timestamp_name == "") {
        $CACHE->set($summary_hash, $results);
    }
    return $results;
}
/**
 * Used to recompute both the index shards and the dictionary
 * of an index archive. The first step involves re-extracting the
 * words into an inverted index from the summaries' web archives:
 * for every generation the old shard file is deleted and a new one is
 * built from the summaries stored in that generation's partition.
 * Then a reindex is done.
 *
 * @param string $archive_path file path to a IndexArchiveBundle
 */
function rebuildIndexArchive($archive_path)
{
    $archive_type = $this->getArchiveKind($archive_path);
    if ($archive_type != "IndexArchiveBundle") {
        $this->badFormatMessageAndExit($archive_path);
    }
    $info = $archive_type::getArchiveInfo($archive_path);
    $tmp = unserialize($info["DESCRIPTION"]);
    $video_sources = $tmp[self::VIDEO_SOURCES];
    $generation_info = unserialize(
        file_get_contents("{$archive_path}/generation.txt"));
    // generations are numbered from 0 up to and including the ACTIVE one
    $num_generations = $generation_info['ACTIVE'] + 1;
    $archive = new WebArchiveBundle($archive_path . "/summaries");
    // running count of summaries processed over all generations
    $seen = 0;
    $generation = 0;
    while ($generation < $num_generations) {
        $partition = $archive->getPartition($generation, false);
        $shard_name = $archive_path .
            "/posting_doc_shards/index{$generation}";
        crawlLog("Processing partition {$generation}");
        if (file_exists($shard_name)) {
            crawlLog("..Unlinking old shard {$generation}");
            @unlink($shard_name);
        }
        $shard = new IndexShard($shard_name, $generation,
            NUM_DOCS_PER_GENERATION, true);
        $seen_partition = 0;
        // read the partition in batches of at most 8000 objects
        while ($seen_partition < $partition->count) {
            $num_to_get = min($partition->count - $seen_partition, 8000);
            /* offset used for the first object of the batch is the
               iterator position before the read; for later objects it is
               taken from the first component of the previously handled
               object (see the end of the foreach loop)
             */
            $offset = $partition->iterator_pos;
            $objects = $partition->nextObjects($num_to_get);
            $cnt = 0;
            foreach ($objects as $object) {
                $cnt++;
                $site = $object[1];
                if (isset($site[self::TYPE]) &&
                    $site[self::TYPE] == "link") {
                    /* for link summaries the doc keys, url and link
                       parts are read from the HTTP_CODE, TITLE and HASH
                       fields respectively
                     */
                    $is_link = true;
                    $doc_keys = $site[self::HTTP_CODE];
                    $site_url = $site[self::TITLE];
                    $host = UrlParser::getHost($site_url);
                    $link_parts = explode('|', $site[self::HASH]);
                    if (isset($link_parts[5])) {
                        $link_origin = $link_parts[5];
                    } else {
                        $link_origin = $site_url;
                    }
                    $meta_ids = PhraseParser::calculateLinkMetas($site_url,
                        $host, $site[self::DESCRIPTION], $link_origin);
                    $link_to = "LINK TO:";
                } else {
                    $is_link = false;
                    $site_url = str_replace('|', "%7C", $site[self::URL]);
                    $host = UrlParser::getHost($site_url);
                    $doc_keys = crawlHash($site_url, true) .
                        $site[self::HASH] . "d" .
                        substr(crawlHash($host . "/", true), 1);
                    $meta_ids = PhraseParser::calculateMetas($site,
                        $video_sources);
                    $link_to = "";
                }
                $so_far_cnt = $seen_partition + $cnt;
                $time_out_message = "..still processing {$so_far_cnt} " .
                    "of {$partition->count} in partition {$generation}." .
                    "\n..Last processed was: " .
                    ($seen + 1) . ". {$link_to}{$site_url}. ";
                crawlTimeoutLog($time_out_message);
                $seen++;
                $word_lists = array();
                /*
                    self::JUST_METAS check to avoid getting sitemaps in
                    results for popular words
                 */
                $lang = NULL;
                if (!isset($site[self::JUST_METAS])) {
                    $host_words = UrlParser::getWordsIfHostUrl($site_url);
                    $path_words = UrlParser::getWordsLastPathPartUrl(
                        $site_url);
                    if ($is_link) {
                        $phrase_string = $site[self::DESCRIPTION];
                    } else {
                        $phrase_string = $host_words . " " .
                            $site[self::TITLE] . " " . $path_words . " " .
                            $site[self::DESCRIPTION];
                    }
                    if (isset($site[self::LANG])) {
                        $lang = guessLocaleFromString(
                            mb_substr($site[self::DESCRIPTION], 0,
                            AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                    }
                    $word_lists = PhraseParser::extractPhrasesInLists(
                        $phrase_string, $lang);
                    $len = strlen($phrase_string);
                    if (PhraseParser::computeSafeSearchScore($word_lists,
                        $len) < 0.012) {
                        $meta_ids[] = "safe:true";
                    } else {
                        $meta_ids[] = "safe:false";
                    }
                }
                if (isset($site[self::USER_RANKS]) &&
                    count($site[self::USER_RANKS]) > 0) {
                    $score_keys = "";
                    foreach ($site[self::USER_RANKS] as $score) {
                        $score_keys .= packInt($score);
                    }
                    /* Pad the packed user ranks out to a multiple of 8
                       bytes. Previously the pad string was empty, so this
                       step appended nothing and the keys stayed unaligned.
                       NOTE(review): NUL is used as the pad byte; confirm
                       against the code that reads these keys back.
                     */
                    $remainder = strlen($score_keys) % 8;
                    if ($remainder != 0) {
                        $score_keys .= str_repeat("\x00", 8 - $remainder);
                    }
                    $doc_keys .= $score_keys;
                }
                $shard->addDocumentWords($doc_keys, $offset,
                    $word_lists, $meta_ids,
                    PhraseParser::$materialized_metas, true, false);
                $offset = $object[0];
            }
            $seen_partition += $num_to_get;
        }
        $shard->save(false, true);
        $generation++;
    }
    $this->reindexIndexArchive($archive_path);
}
/**
 * Used to extract the title, description and links from
 * a docx file consisting of xml data.
 *
 * Three members of the archive are consulted: docProps/core.xml for the
 * title, word/document.xml for the text and language, and
 * word/_rels/document.xml.rels for the links. A member that is absent, or
 * for which self::dom() gives back false, is handled by the same fallback:
 * no title is set, a fixed "Did not download" description with language
 * en-US is used, and the list of links is empty.
 *
 * @param string $page docx(zip) contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page
 *
 */
function process($page, $url)
{
    $summary = NULL;
    $zip = new PartialZipArchive($page);

    // Try to get the title from the document meta data
    $dom = false;
    $buf = $zip->getFromName("docProps/core.xml");
    if ($buf) {
        $dom = self::dom($buf);
    }
    if ($dom !== false) {
        $summary[self::TITLE] = self::title($dom);
    }

    /* The main text; the dom is checked against false here as well so
       that docText() is never handed a failed parse
     */
    $dom = false;
    $buf = $zip->getFromName("word/document.xml");
    if ($buf) {
        $dom = self::dom($buf);
    }
    if ($dom !== false) {
        $summary[self::DESCRIPTION] = self::docText($dom);
        $summary[self::LANG] = guessLocaleFromString(
            $summary[self::DESCRIPTION], 'en-US');
    } else {
        $summary[self::DESCRIPTION] = "Did not download " .
            "word/document.xml portion of docx file";
        $summary[self::LANG] = 'en-US';
    }

    // Relationships file, from which the links are extracted
    $dom = false;
    $buf = $zip->getFromName("word/_rels/document.xml.rels");
    if ($buf) {
        $dom = self::dom($buf);
    }
    if ($dom !== false) {
        $summary[self::LINKS] = self::links($dom, $url);
    } else {
        $summary[self::LINKS] = array();
    }
    return $summary;
}