/**
 * Load a StringArray from a file
 *
 * @param string $fname the name of the file to load the StringArray from
 * @return object the PersistentStructure loaded
 */
static function load($fname)
{
    $fh = fopen($fname, "rb");
    $array_size = unpackInt(fread($fh, 4));
    $array = fread($fh, $array_size);
    $object = unserialize(fread($fh, filesize($fname) - 4 - $array_size));
    $object->string_array =& $array;
    fclose($fh);
    return $object;
}
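/*
 * A minimal sketch (not necessarily the actual save code) of the save()
 * counterpart that load() above implies, assuming a packInt() helper that is
 * the inverse of unpackInt(), i.e., pack("N", $int). On-disk layout: 4-byte
 * big-endian length of the raw string array, the raw array bytes, then the
 * serialized object with the array detached.
 */
function save($fname)
{
    $fh = fopen($fname, "wb");
    $array = $this->string_array;
    $this->string_array = null; // don't serialize the bulky array twice
    fwrite($fh, packInt(strlen($array))); // 4-byte length header
    fwrite($fh, $array); // raw string array bytes
    fwrite($fh, serialize($this)); // remainder of the object
    fclose($fh);
    $this->string_array =& $array; // reattach after writing
}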
/**
 * Adds the summary and index data in $file to summary bundle and word index
 *
 * @param string $file containing web pages summaries and a mini-inverted
 *     index for their content
 * @param bool $blocking this method might be called by the indexer
 *     subcomponent when a merge tier phase is ongoing to allow for
 *     other processing to occur. If so, we don't want a regress
 *     where the indexer calls this code, which calls the indexer, etc.
 *     If the blocking flag is set then the indexer subcomponent won't
 *     be called
 */
function processIndexArchive($file, $blocking)
{
    static $blocked = false;
    if ($blocking && $blocked) {
        crawlLog("Indexer waiting for merge tiers to " .
            "complete before write partition. B");
        return;
    }
    if (!$blocking) {
        $blocked = false;
    }
    crawlLog("{$this->server_name} is starting to process index data," .
        " memory usage: " . memory_get_usage() . "...");
    crawlLog("Indexer: Processing index data in {$file}...");
    $start_time = microtime();
    $start_total_time = microtime();
    $pre_sites = webdecode(file_get_contents($file));
    $len_urls = unpackInt(substr($pre_sites, 0, 4));
    $seen_urls_string = substr($pre_sites, 4, $len_urls);
    $pre_sites = substr($pre_sites, 4 + $len_urls);
    $sites[self::SEEN_URLS] = array();
    $pos = 0;
    $num = 0;
    $bad = false;
    $max_batch_sites_and_links = SEEN_URLS_BEFORE_UPDATE_SCHEDULER *
        (max(MAX_LINKS_PER_PAGE, MAX_LINKS_PER_SITEMAP) + 1);
    while ($pos < $len_urls && $num <= $max_batch_sites_and_links) {
        crawlTimeoutLog("..Indexer still processing index data at " .
            "position %s out of %s", $pos, $len_urls);
        $len_site = unpackInt(substr($seen_urls_string, $pos, 4));
        if ($len_site > 2 * $this->page_range_request) {
            crawlLog("Indexer: Site string too long, {$len_site}," .
                " data file may be corrupted? Skip rest.");
            $bad = true;
            break;
        }
        $pos += 4;
        $site_string = substr($seen_urls_string, $pos, $len_site);
        $pos += strlen($site_string);
        $tmp = unserialize(gzuncompress($site_string));
        if (!$tmp || !is_array($tmp)) {
            crawlLog("Compressed array null," .
                " data file may be corrupted? Skip rest.");
            $bad = true;
            break;
        }
        $sites[self::SEEN_URLS][] = $tmp;
        $num++;
    }
    if ($num > $max_batch_sites_and_links || $bad) {
        crawlLog("Index data file len_urls was {$len_urls} num was {$num}, " .
            "may be corrupt so skipping this file.");
        crawlLog("Indexer Done Index Processing File: {$file}. Total time: " .
            changeInMicrotime($start_total_time));
        unlink($file);
        return;
    }
    crawlLog("A. Indexer Load SEEN_URLS. Memory usage:" .
        memory_get_usage() . " time: " . changeInMicrotime($start_time));
    $sites[self::INVERTED_INDEX] = IndexShard::load("fetcher_shard",
        $pre_sites);
    unset($pre_sites);
    crawlLog("B. Indexer Load Sent shard. Memory usage:" .
        memory_get_usage() . " time: " . changeInMicrotime($start_time));
    $start_time = microtime();
    // do deduplication of summaries
    if (isset($sites[self::SEEN_URLS]) &&
        count($sites[self::SEEN_URLS]) > 0) {
        $seen_sites = $sites[self::SEEN_URLS];
        $seen_sites = array_values($seen_sites);
        unset($sites[self::SEEN_URLS]);
        $num_seen = count($seen_sites);
        crawlLog("Indexer: SEEN_URLS array had {$num_seen} sites.");
    } else {
        $num_seen = 0;
    }
    $visited_urls_count = 0;
    $recent_urls_count = 0;
    $recent_urls = array();
    for ($i = 0; $i < $num_seen; $i++) {
        $seen_sites[$i][self::HASH_URL] =
            crawlHash($seen_sites[$i][self::URL], true);
        $link_url_parts = explode("|", $seen_sites[$i][self::URL]);
        if (strcmp("url", $link_url_parts[0]) == 0) {
            $reftype = strcmp("eref", $link_url_parts[4]) == 0 ? "e" : "i";
            $seen_sites[$i][self::HASH_URL] =
                crawlHash($link_url_parts[1], true) .
                crawlHash($seen_sites[$i][self::URL], true) .
                $reftype . substr(crawlHash(
                UrlParser::getHost($link_url_parts[5]) . "/", true), 1);
            $seen_sites[$i][self::IS_DOC] = false;
        } else {
            $seen_sites[$i][self::IS_DOC] = true;
            $visited_urls_count++;
            array_push($recent_urls, $seen_sites[$i][self::URL]);
            if ($recent_urls_count >= NUM_RECENT_URLS_TO_DISPLAY) {
                array_shift($recent_urls);
            }
            $recent_urls_count++;
        }
    }
    if (isset($sites[self::INVERTED_INDEX])) {
        $index_shard =& $sites[self::INVERTED_INDEX];
        $generation = $this->index_archive->initGenerationToAdd(
            $index_shard->num_docs, $this, $blocking);
        if ($generation == -1) {
            crawlLog("Indexer waiting for merge tiers to " .
                "complete before write partition. A");
            $blocked = true;
            return;
        }
        $summary_offsets = array();
        if (isset($seen_sites)) {
            $this->index_archive->addPages($generation,
                self::SUMMARY_OFFSET, $seen_sites, $visited_urls_count);
            foreach ($seen_sites as $site) {
                if ($site[self::IS_DOC]) { // so not a link
                    $site_url = str_replace('|', "%7C", $site[self::URL]);
                    $host = UrlParser::getHost($site_url);
                    $hash = crawlHash($site_url, true) .
                        $site[self::HASH] . "d" .
                        substr(crawlHash($host . "/", true), 1);
                } else {
                    $hash = $site[self::HASH_URL];
                }
                $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
            }
            unset($seen_sites);
        }
        crawlLog("C. Indexer init local shard, store " .
            "Summaries memory usage:" . memory_get_usage() .
            " time: " . changeInMicrotime($start_time));
        $start_time = microtime();
        // add summary offset info to inverted index data
        $index_shard->changeDocumentOffsets($summary_offsets);
        crawlLog("D. Indexer Update shard offsets. Memory usage:" .
            memory_get_usage() . " time: " . changeInMicrotime($start_time));
        $start_time = microtime();
        $this->index_archive->addIndexData($index_shard);
        $this->index_dirty = true;
    }
    crawlLog("E. Indexer Add index shard. Memory usage:" .
        memory_get_usage() . " time: " . changeInMicrotime($start_time));
    crawlLog("Indexer Done Index Processing File: {$file}. Total time: " .
        changeInMicrotime($start_total_time));
    if (isset($recent_urls)) {
        $sites[self::RECENT_URLS] =& $recent_urls;
        $this->writeCrawlStatus($sites);
    }
    if (file_exists($file)) {
        // Haven't tracked down why yet, but this file can get deleted
        // twice, giving a warning, so guard with file_exists
        unlink($file);
    }
}
/**
 * Get status, current crawl, crawl order, and new site information from
 * the queue_server.
 *
 * @return mixed array or bool. If we are doing a web crawl and we still
 *     have pages to crawl then true; if the scheduler page fails to
 *     download then false; otherwise, an array of info from the scheduler.
 */
function checkScheduler()
{
    $prefix = $this->fetcher_num . "-";
    $info = array();
    $to_crawl_count = count($this->to_crawl);
    $to_crawl_again_count = count($this->to_crawl_again);
    if ($this->recrawl_check_scheduler) {
        crawlLog("Archive Crawl checking ... Recrawl.");
    }
    if (($to_crawl_count > 0 || $to_crawl_again_count > 0) &&
        !$this->recrawl_check_scheduler) {
        crawlLog(" Current to crawl count:" . $to_crawl_count);
        crawlLog(" Current to crawl try again count:" .
            $to_crawl_again_count);
        crawlLog("So not checking scheduler.");
        return true;
    }
    $this->selectCurrentServerAndUpdateIfNeeded(false);
    $this->recrawl_check_scheduler = false;
    $queue_server = $this->queue_servers[$this->current_server];
    crawlLog("Checking {$queue_server} for a new schedule.");
    // hosts with error counts are cleared with each schedule
    $this->hosts_with_errors = array();
    $start_time = microtime();
    $time = time();
    $session = md5($time . AUTH_KEY);
    $request = $queue_server .
        "?c=fetch&a=schedule&time={$time}&session={$session}" .
        "&robot_instance=" . $prefix . ROBOT_INSTANCE .
        "&machine_uri=" . WEB_URI .
        "&crawl_time=" . $this->crawl_time .
        "&check_crawl_time=" . $this->check_crawl_time;
    crawlLog("Making schedule request: " . $request);
    $info_string = FetchUrl::getPage($request, NULL, true);
    if ($info_string === false) {
        crawlLog("The request failed!!!!");
        return false;
    }
    $info_string = trim($info_string);
    $tok = strtok($info_string, "\n");
    $info = unserialize(base64_decode($tok));
    $this->setCrawlParamsFromArray($info);
    if (isset($info[self::SITES])) {
        $tok = strtok("\n"); // skip meta info
        $this->to_crawl = array();
        while ($tok !== false) {
            $string = base64_decode($tok);
            $weight = unpackFloat(substr($string, 0, 4));
            $delay = unpackInt(substr($string, 4, 4));
            $url = substr($string, 8);
            $this->to_crawl[] = array($url, $weight, $delay);
            $tok = strtok("\n");
        }
        $dir = CRAWL_DIR . "/schedules";
        file_put_contents("{$dir}/{$prefix}" . self::fetch_batch_name .
            "{$this->crawl_time}.txt", serialize($this->to_crawl));
        $this->db->setWorldPermissionsRecursive("{$dir}/{$prefix}" .
            self::fetch_batch_name . "{$this->crawl_time}.txt");
        unset($info[self::SITES]);
        file_put_contents("{$dir}/{$prefix}" . self::fetch_crawl_info .
            "{$this->crawl_time}.txt", serialize($info));
    }
    crawlLog("Time to check Scheduler " . changeInMicrotime($start_time));
    return $info;
}
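/*
 * A minimal sketch of how one site line of the schedule that checkScheduler()
 * above decodes might be framed, assuming packFloat() and packInt() are the
 * inverses of the unpackFloat()/unpackInt() calls it makes. This is a
 * hypothetical helper for illustration: 4-byte weight, 4-byte crawl delay,
 * then the URL, all base64-encoded so the payload can be split on "\n".
 */
function packScheduleLine($url, $weight, $delay)
{
    return base64_encode(packFloat($weight) . packInt($delay) . $url) . "\n";
}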
/**
 * Used to uncompress an int from a fixed length string in the format of
 * the compression algorithm underlying the compressor. Since this
 * compressor doesn't compress, we just use unpack.
 *
 * @param string $my_compressed_int the fixed length string containing
 *     the packed int to extract
 * @return int the integer contained in that string
 */
function uncompressInt($my_compressed_int)
{
    return unpackInt($my_compressed_int);
}
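/*
 * For reference, a plausible implementation of the unpackInt() helper used
 * throughout these functions, assuming 4-byte big-endian ("N") packing; the
 * real helper lives in the utility library and may differ in details.
 */
function unpackInt($str)
{
    if (is_string($str) && strlen($str) >= 4) {
        $tmp = unpack("N", $str);
        return $tmp[1];
    }
    return false;
}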
/**
 * Checks if the given $url is allowed to be crawled based on stored
 * robots.txt info.
 *
 * @param string $url to check
 * @return bool whether it was allowed or not
 */
function checkRobotOkay($url)
{
    // local cache of recent robots.txt stuff
    static $robot_cache = array();
    $cache_size = 2000;
    list($host, $path) = UrlParser::getHostAndPath($url, true, true);
    $path = urldecode($path);
    $key = crawlHash($host, true);
    if (isset($robot_cache[$key])) {
        $robot_object = $robot_cache[$key];
    } else {
        $data = $this->robot_table->lookup($key);
        $offset = unpackInt($data);
        $robot_object = $this->robot_archive->getObjects($offset, 1);
        $robot_cache[$key] = $robot_object;
        if (count($robot_cache) > $cache_size) {
            array_shift($robot_cache);
        }
    }
    $robot_paths = isset($robot_object[0][1]) ?
        $robot_object[0][1] : array();
    // these should have been urldecoded in RobotProcessor
    $robots_okay = true;
    $robots_not_okay = false;
    if (isset($robot_paths[CrawlConstants::DISALLOWED_SITES])) {
        $robots_not_okay = UrlParser::isPathMemberRegexPaths($path,
            $robot_paths[CrawlConstants::DISALLOWED_SITES]);
        $robots_okay = !$robots_not_okay;
    }
    if (isset($robot_paths[CrawlConstants::ALLOWED_SITES])) {
        $robots_okay = UrlParser::isPathMemberRegexPaths($path,
            $robot_paths[CrawlConstants::ALLOWED_SITES]);
    }
    // the final disjunction lets a URL matching neither list through, even
    // though the ALLOWED_SITES check above may have reset $robots_okay
    return $robots_okay || !$robots_not_okay;
}
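/*
 * Usage sketch for checkRobotOkay() above ($crawler is a hypothetical object
 * exposing this method). Given stored robots.txt rules of
 *     Disallow: /private
 *     Allow: /private/public
 * an Allow match overrides a Disallow match, and a URL matching neither list
 * is allowed via the final disjunction:
 */
var_dump($crawler->checkRobotOkay("http://example.com/private/secret"));
    // bool(false)
var_dump($crawler->checkRobotOkay("http://example.com/private/public"));
    // bool(true)
var_dump($crawler->checkRobotOkay("http://example.com/index.html"));
    // bool(true)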