/**
 * {@inheritDoc}
 *
 * @param string $page the image represented as a character string
 * @param string $url the url where the image was downloaded from
 * @return array summary information including a thumbnail and a
 *     description (where the description is just the url)
 */
function process($page, $url)
{
    $summary = NULL; // ensure a well-defined return value if $page isn't a string
    if (is_string($page)) {
        $image = $this->imagecreatefrombmp($page);
        $thumb_string = self::createThumb($image);
        $summary[self::TITLE] = "";
        $summary[self::DESCRIPTION] = "Image of " .
            UrlParser::getDocumentFilename($url);
        $summary[self::LINKS] = array();
        $summary[self::PAGE] =
            "<html><body><div><img src='data:image/bmp;base64," .
            base64_encode($page) . "' alt='" .
            $summary[self::DESCRIPTION] . "' /></div></body></html>";
        $summary[self::THUMB] = 'data:image/jpeg;base64,' .
            base64_encode($thumb_string);
    }
    return $summary;
}
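/*
 * Usage sketch (illustrative, not part of the original file): summarize a
 * downloaded bmp image. The BmpProcessor class name is an assumption about
 * which image-processor subclass this method belongs to; the file name and
 * url are made up for the example.
 *
 *   $processor = new BmpProcessor();
 *   $summary = $processor->process(
 *       file_get_contents("test.bmp"), "http://www.example.com/test.bmp");
 *   // $summary[CrawlConstants::THUMB] now holds a base64 jpeg data url
 */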
/**
 * Prepare an array of up to NUM_MULTI_CURL_PAGES' worth of sites to be
 * downloaded in one go using the to_crawl array. Delete these sites
 * from the to_crawl array.
 *
 * @return array sites which are ready to be downloaded
 */
function getFetchSites()
{
    $web_archive = $this->web_archive;
    $start_time = microtime();
    $seeds = array();
    $delete_indices = array();
    $num_items = count($this->to_crawl);
    if ($num_items > 0) {
        $crawl_source =& $this->to_crawl;
        $to_crawl_flag = true;
    } else {
        crawlLog("...Trying to crawl sites which failed the first time");
        $num_items = count($this->to_crawl_again);
        $crawl_source =& $this->to_crawl_again;
        $to_crawl_flag = false;
    }
    reset($crawl_source);
    if ($num_items > NUM_MULTI_CURL_PAGES) {
        $num_items = NUM_MULTI_CURL_PAGES;
    }
    //DNS lookups take longer so try to get fewer in one go
    $num_ip_lookups = max($num_items / 3, 2);
    $i = 0;
    $ip_lookup_cnt = 0;
    $site_pair = each($crawl_source);
    while ($site_pair !== false && $i < $num_items &&
        $ip_lookup_cnt < $num_ip_lookups) {
        $delete_indices[] = $site_pair['key'];
        if ($site_pair['value'][0] != self::DUMMY) {
            $host = UrlParser::getHost($site_pair['value'][0]);
            if (!strpos($site_pair['value'][0], "###")) {
                $ip_lookup_cnt++;
            }
            // only download if host doesn't seem congested
            if (!isset($this->hosts_with_errors[$host]) ||
                $this->hosts_with_errors[$host] < DOWNLOAD_ERROR_THRESHOLD) {
                $url_to_check = $site_pair['value'][0];
                $extension = UrlParser::getDocumentType($url_to_check);
                $repository_indicator =
                    FetchGitRepositoryUrls::checkForRepository($extension);
                if ($repository_indicator == self::REPOSITORY_GIT) {
                    $git_internal_urls =
                        FetchGitRepositoryUrls::setGitRepositoryUrl(
                            $url_to_check, $i, $seeds, $repository_indicator,
                            $site_pair, $this->total_git_urls,
                            $this->all_git_urls);
                    $i = $git_internal_urls['position'];
                    $git_url_index = $git_internal_urls['index'];
                    $seeds = $git_internal_urls['seeds'];
                    $repository_indicator = $git_internal_urls['indicator'];
                    $this->total_git_urls = $git_internal_urls['count'];
                    $this->all_git_urls = $git_internal_urls['all'];
                } else {
                    $seeds[$i][self::URL] = $site_pair['value'][0];
                    $seeds[$i][self::WEIGHT] = $site_pair['value'][1];
                    $seeds[$i][self::CRAWL_DELAY] = $site_pair['value'][2];
                }
                /* Crawl delay is only used in scheduling on the queue_server.
                   On the fetcher, we only use crawl-delay to determine if we
                   will give a page a second try if it doesn't download the
                   first time
                 */
                if (UrlParser::getDocumentFilename($seeds[$i][self::URL]) .
                    "." . UrlParser::getDocumentType($seeds[$i][self::URL])
                    == "robots.txt") {
                    $seeds[$i][self::ROBOT_PATHS] = array();
                }
                $i++;
            }
        } else {
            break;
        }
        $site_pair = each($crawl_source);
    } //end while
    foreach ($delete_indices as $delete_index) {
        $git_set = false;
        if ($to_crawl_flag == true) {
            $extension = UrlParser::getDocumentType(
                $this->to_crawl[$delete_index][0]);
            $repository_type =
                FetchGitRepositoryUrls::checkForRepository($extension);
            if ($repository_type != self::REPOSITORY_GIT) {
                unset($this->to_crawl[$delete_index]);
            }
        } else {
            $extension = UrlParser::getDocumentType(
                $this->to_crawl_again[$delete_index][0]);
            $repository_type =
                FetchGitRepositoryUrls::checkForRepository($extension);
            unset($this->to_crawl_again[$delete_index]);
        }
        if ($repository_type == self::REPOSITORY_GIT) {
            if (!$git_set) {
                $next_url_start = $url_to_check . self::GIT_URL_CONTINUE .
                    $git_url_index;
                $git_set = true;
                $this->to_crawl[$delete_index][0] = $next_url_start;
            }
            if ($repository_indicator == self::INDICATOR_NONE) {
                unset($this->to_crawl[$delete_index]);
            }
        }
    }
    crawlLog("Fetch url list to download time " .
        changeInMicrotime($start_time));
    return $seeds;
}
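/*
 * Usage sketch (illustrative, not from the original file): in the fetcher's
 * main download loop, one batch per iteration would be pulled and handed to
 * the multi-curl downloader roughly as below. The surrounding Fetcher class
 * context and the exact FetchUrl::getPages call shape are assumptions.
 *
 *   $sites = $this->getFetchSites(); // at most NUM_MULTI_CURL_PAGES sites
 *   $site_pages = FetchUrl::getPages($sites, true,
 *       $this->page_range_request);
 */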
/**
 * Make multi_curl requests for an array of sites with urls or onion urls
 *
 * @param array $sites an array containing urls of pages to request
 * @param bool $timer flag, true means print timing statistics to log
 * @param int $page_range_request maximum number of bytes to download/page
 *     0 means download all
 * @param string $temp_dir folder to store temporary ip header info
 * @param string $key the component of $sites[$i] that has the value of
 *     a url to get; defaults to URL
 * @param string $value component of $sites[$i] in which to store the
 *     page that was gotten
 * @param bool $minimal if true do a faster request of pages by not
 *     doing things like extracting the HTTP headers sent, etc.
 * @param array $post_data data to be POST'd to each site
 * @param bool $follow whether to follow redirects or not
 * @param string $tor_proxy url of a proxy that knows how to download
 *     .onion urls
 * @param array $proxy_servers if not array(), then an array of proxy
 *     servers to use rather than to directly download web pages from
 *     the current machine
 *
 * @return array an updated array with the contents of those pages
 */
static function getPages($sites, $timer = false,
    $page_range_request = PAGE_RANGE_REQUEST, $temp_dir = NULL,
    $key = CrawlConstants::URL, $value = CrawlConstants::PAGE,
    $minimal = false, $post_data = NULL, $follow = false,
    $tor_proxy = "", $proxy_servers = array())
{
    $agent_handler = curl_multi_init();
    $active = NULL;
    $start_time = microtime();
    if (!$minimal && $temp_dir == NULL) {
        $temp_dir = CRAWL_DIR . "/temp";
        if (!file_exists($temp_dir)) {
            mkdir($temp_dir);
        }
    }
    //Set-up requests
    $num_sites = count($sites);
    for ($i = 0; $i < $num_sites; $i++) {
        $is_gopher = false;
        $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
        if (isset($sites[$i][$key])) {
            list($sites[$i][$key], $url, $headers) =
                self::prepareUrlHeaders($sites[$i][$key], $minimal,
                    $proxy_servers);
            if ($headers == "gopher") {
                $is_gopher = true;
                $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
                $headers = array();
            }
            $sites[$i][0] = curl_init();
            if (!$minimal) {
                $ip_holder[$i] = fopen("{$temp_dir}/tmp{$i}.txt", 'w+');
                curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
                curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
            }
            curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT);
            curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE,
                CURL_IPRESOLVE_WHATEVER);
            curl_setopt($sites[$i][0], CURLOPT_URL, $url);
            if (strcmp(substr($url, -10), "robots.txt") == 0) {
                /* wikipedia redirects its robots page, so force following
                   redirects for robots.txt pages
                 */
                $sites[$i]['ROBOT'] = true;
                $follow = true;
            }
            curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow);
            curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
            curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
            curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
            curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT, PAGE_TIMEOUT);
            curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, PAGE_TIMEOUT);
            if (stripos($url, '.onion') !== false && $tor_proxy != "") {
                curl_setopt($sites[$i][0], CURLOPT_PROXY, $tor_proxy);
                //CURLPROXY_SOCKS5_HOSTNAME = 7
                curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, 7);
                if ($timer) {
                    crawlLog("Using Tor proxy for {$url}..");
                }
            } else if ($proxy_servers != array() && !$is_gopher) {
                $select_proxy = rand(0, count($proxy_servers) - 1);
                $proxy_server = $proxy_servers[$select_proxy];
                $proxy_parts = explode(":", $proxy_server);
                $proxy_ip = $proxy_parts[0];
                if (!isset($proxy_parts[2]) ||
                    strtolower($proxy_parts[2]) == 'http') {
                    $proxy_type = CURLPROXY_HTTP;
                } else if (strtolower($proxy_parts[2]) == 'socks5') {
                    $proxy_type = CURLPROXY_SOCKS5;
                } else {
                    $proxy_type = $proxy_parts[2];
                }
                if (isset($proxy_parts[1])) {
                    $proxy_port = $proxy_parts[1];
                } else {
                    $proxy_port = "80";
                }
                curl_setopt($sites[$i][0], CURLOPT_PROXY,
                    "{$proxy_ip}:{$proxy_port}");
                curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, $proxy_type);
                if ($timer) {
                    crawlLog("Selecting proxy {$select_proxy} for {$url}");
                }
            }
            if (!$minimal) {
                curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
            }
            //make lighttpd happier
            if (!$is_gopher) {
                curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER, $headers);
            }
            // need to set for sites like att that use gzip
            curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
            if ($page_range_request > 0) {
                curl_setopt($sites[$i][0], CURLOPT_RANGE,
                    "0-" . $page_range_request);
            }
            if ($post_data != NULL) {
                curl_setopt($sites[$i][0], CURLOPT_POST, true);
                curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS,
                    $post_data[$i]);
            }
            curl_multi_add_handle($agent_handler, $sites[$i][0]);
        }
    }
    if ($timer) {
        crawlLog(" Init Get Pages " . changeInMicrotime($start_time));
    }
    $start_time = microtime();
    $start = time();
    //Wait for responses
    $running = NULL;
    $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
    do {
        $mrc = curl_multi_exec($agent_handler, $running);
        $ready = curl_multi_select($agent_handler, 0.005);
    } while (memory_get_usage() < $memory_limit &&
        time() - $start < PAGE_TIMEOUT && $running > 0);
    if (time() - $start > PAGE_TIMEOUT && $timer) {
        crawlLog(" TIMED OUT!!!");
    }
    if ($timer) {
        crawlLog(" Page Request time " . changeInMicrotime($start_time));
    }
    $start_time = microtime();
    //Process returned pages
    for ($i = 0; $i < $num_sites; $i++) {
        if ($timer) {
            crawlTimeoutLog("fetch_url initial processing of page %s of %s",
                $i, $num_sites);
        }
        if (!$minimal && isset($ip_holder[$i])) {
            rewind($ip_holder[$i]);
            $header = fread($ip_holder[$i], 8192);
            $ip_addresses = self::getCurlIp($header);
            fclose($ip_holder[$i]);
        }
        $is_gopher = false;
        if (isset($sites[$i][0]) && $sites[$i][0]) {
            // Get Data and Message Code
            $content = @curl_multi_getcontent($sites[$i][0]);
            $is_gopher = $sites[$i][CrawlConstants::IS_GOPHER_URL];
            /* If the Transfer-encoding was chunked then the Range header
               we sent was ignored, so we manually truncate the data here
             */
            if ($page_range_request > 0) {
                $content = substr($content, 0, $page_range_request);
            }
            if (isset($content) && !$minimal && !$is_gopher) {
                $site = self::parseHeaderPage($content, $value);
                $sites[$i] = array_merge($sites[$i], $site);
                if (isset($header)) {
                    $header = substr($header, 0,
                        strpos($header, "\r\n\r\n") + 4);
                } else {
                    $header = "";
                }
                $sites[$i][CrawlConstants::HEADER] =
                    $header . $sites[$i][CrawlConstants::HEADER];
                unset($header);
            } else if (isset($content) && !$minimal && $is_gopher) {
                $sites[$i][CrawlConstants::HEADER] = $header;
                $sites[$i][$value] = $content;
                unset($header);
            } else {
                $sites[$i][$value] = $content;
            }
            if (!$minimal) {
                $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0],
                    CURLINFO_SIZE_DOWNLOAD);
                $sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0],
                    CURLINFO_NAMELOOKUP_TIME);
                $sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0],
                    CURLINFO_TOTAL_TIME);
                $sites[$i][self::HTTP_CODE] =
                    curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE);
                if (!$sites[$i][self::HTTP_CODE] && !$is_gopher) {
                    $sites[$i][self::HTTP_CODE] = curl_error($sites[$i][0]);
                } else {
                    $sites[$i][self::HTTP_CODE] = 200;
                }
                if ($ip_addresses) {
                    $sites[$i][self::IP_ADDRESSES] = $ip_addresses;
                } else {
                    $sites[$i][self::IP_ADDRESSES] = array("0.0.0.0");
                }
                //Get Time, Mime type and Character encoding
                $sites[$i][self::TIMESTAMP] = time();
                if ($is_gopher) {
                    $path = UrlParser::getPath($sites[$i][self::URL]);
                    $filename = UrlParser::getDocumentFilename(
                        $sites[$i][self::URL]);
                    if (isset($path[1])) {
                        $gopher_type = $path[1];
                    } else {
                        $gopher_type = 1;
                    }
                    if ($gopher_type == 1) {
                        $sites[$i][self::TYPE] = "text/gopher";
                    } else if (in_array($gopher_type, array(0, 3, 6))) {
                        $sites[$i][self::TYPE] = "text/plain";
                        if ($gopher_type == 6) {
                            $sites[$i][$value] = convert_uudecode($content);
                        }
                    } else if ($gopher_type == 'h') {
                        $sites[$i][self::TYPE] = "text/html";
                    } else if ($gopher_type == 'g') {
                        $sites[$i][self::TYPE] = "image/gif";
                    }
                    $path_info = pathinfo($filename);
                    if (!isset($sites[$i][self::TYPE]) &&
                        isset($path_info['extension'])) {
                        $sites[$i][self::TYPE] =
                            UrlParser::guessMimeTypeFromFileName($filename);
                    } else if (!isset($sites[$i][self::TYPE])) {
                        $sites[$i][self::TYPE] = "unknown";
                    }
                } else {
                    $type_parts = explode(";",
                        curl_getinfo($sites[$i][0], CURLINFO_CONTENT_TYPE));
                    $sites[$i][self::TYPE] =
                        strtolower(trim($type_parts[0]));
                }
            }
            //curl_multi_remove_handle($agent_handler, $sites[$i][0]);
            curl_close($sites[$i][0]);
            if (isset($sites[$i]['ROBOT']) && $sites[$i]['ROBOT']) {
                if (isset($sites[$i][self::TYPE]) &&
                    $sites[$i][self::TYPE] != "text/plain" &&
                    isset($sites[$i][CrawlConstants::LOCATION]) &&
                    count($site[CrawlConstants::LOCATION]) > 0) {
                    $sites[$i][self::TYPE] = "text/plain";
                    $sites[$i][self::HTTP_CODE] = "200";
                    $tmp = wordwrap($sites[$i][$value], 80);
                    $tmp_parts = explode("\n", $tmp);
                    $tmp = "# Suspect server misconfiguration\n";
                    $tmp .= "# Assume shouldn't crawl this site.\n";
                    $tmp .= "# Pretending got following robots.txt.\n";
                    $tmp .= "User-agent: *\n";
                    $tmp .= "Disallow: /\n";
                    $tmp .= "# Original error code: " .
                        $sites[$i][self::HTTP_CODE] . "\n";
                    $tmp .= "# Original content:\n";
                    foreach ($tmp_parts as $part) {
                        $tmp .= "#" . $part . "\n";
                    }
                    $sites[$i][$value] = $tmp;
                    $sites[$i][self::HTTP_CODE] = "200";
                    unset($site[CrawlConstants::LOCATION]);
                }
            }
        } //end big if
    } //end for
    if ($timer) {
        crawlLog(" Get Page Content time " . changeInMicrotime($start_time));
    }
    curl_multi_close($agent_handler);
    return $sites;
}
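/*
 * Usage sketch (illustrative): download two pages in one multi-curl batch
 * using the defaults. FetchUrl as the class name and the availability of
 * the CrawlConstants constants are assumptions.
 *
 *   $sites = array(
 *       array(CrawlConstants::URL => "http://www.example.com/"),
 *       array(CrawlConstants::URL => "http://www.example.org/robots.txt"),
 *   );
 *   $sites = FetchUrl::getPages($sites, true);
 *   foreach ($sites as $site) {
 *       echo $site[CrawlConstants::URL] . " " .
 *           $site[CrawlConstants::HTTP_CODE] . "\n";
 *   }
 */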
/**
 * Used to recompute the dictionary of an index archive -- either from
 * scratch using the index shard data or just using the current dictionary
 * but merging the tiers into one tier
 *
 * @param string $path file path to dictionary of an IndexArchiveBundle
 * @param int $max_tier tier up to which the dictionary tiers should be
 *     merged (typically a value greater than the max_tier of the
 *     dictionary)
 */
function reindexIndexArchive($path, $max_tier = -1)
{
    if ($this->getArchiveKind($path) != "IndexArchiveBundle") {
        echo "\n{$path} ...\n" .
            " is not an IndexArchiveBundle so cannot be re-indexed\n\n";
        exit;
    }
    $shards = glob($path . "/posting_doc_shards/index*");
    if (is_array($shards)) {
        /* construct the database manager up front so it is also available
           when a $max_tier value was supplied */
        $dbms_manager = DBMS . "Manager";
        $db = new $dbms_manager();
        if ($max_tier == -1) {
            $db->unlinkRecursive($path . "/dictionary", false);
            IndexDictionary::makePrefixLetters($path . "/dictionary");
        }
        $dictionary = new IndexDictionary($path . "/dictionary");
        if ($max_tier == -1) {
            $max_generation = 0;
            foreach ($shards as $shard_name) {
                $file_name = UrlParser::getDocumentFilename($shard_name);
                $generation = (int) substr($file_name, strlen("index"));
                $max_generation = max($max_generation, $generation);
            }
            for ($i = 0; $i < $max_generation + 1; $i++) {
                $shard_name = $path . "/posting_doc_shards/index{$i}";
                echo "\nShard {$i}\n";
                $shard = new IndexShard($shard_name, $i,
                    NUM_DOCS_PER_GENERATION, true);
                $dictionary->addShardDictionary($shard);
            }
            $max_tier = $dictionary->max_tier;
        }
        echo "\nFinal Merge Tiers\n";
        $dictionary->mergeAllTiers(NULL, $max_tier);
        $db->setWorldPermissionsRecursive($path . "/dictionary");
        echo "\nReindex complete!!\n";
    } else {
        echo "\n{$path} ...\n" .
            " does not contain posting shards so cannot be re-indexed\n\n";
    }
}
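/*
 * Usage sketch (illustrative): rebuild the dictionary of a bundle from its
 * posting shards. The ArcTool class name and the bundle path are assumptions
 * made only for the example.
 *
 *   $arc_tool = new ArcTool();
 *   $arc_tool->reindexIndexArchive(CRAWL_DIR . "/cache/IndexData1234567890");
 */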
/**
 * Gets the resource $_REQUEST['n'] from APP_DIR/$_REQUEST['f'] or
 * CRAWL_DIR/$_REQUEST['f'] after cleaning
 */
function get()
{
    if (!isset($_REQUEST['n']) || !isset($_REQUEST['f'])) {
        return;
    }
    $name = $this->clean($_REQUEST['n'], "string");
    if (in_array($_REQUEST['f'], array("css", "scripts", "resources"))) {
        /* notice in this case we don't check if the request comes from a
           legitimate source, but we do try to restrict it to being a file
           (not a folder) in the above array
         */
        $base_dir = $this->getBaseFolder();
        if (!$base_dir) {
            header('HTTP/1.1 401 Unauthorized');
            echo "<html><head><title>401 Unauthorized</title></head>" .
                "<body><p>401 Unauthorized</p></body></html>";
            return;
        }
        $type = UrlParser::getDocumentType($name);
        $name = UrlParser::getDocumentFilename($name);
        $name = $type != "" ? "{$name}.{$type}" : $name;
        if (isset($_REQUEST['t'])) {
            $name .= ".jpg";
        }
    } else if (in_array($_REQUEST['f'], array("cache"))) {
        /* perform check since these requests should come from a
           known machine
         */
        if (!$this->checkRequest()) {
            return;
        }
        $folder = $_REQUEST['f'];
        $base_dir = CRAWL_DIR . "/{$folder}";
    } else {
        return;
    }
    if (isset($_REQUEST['o']) && isset($_REQUEST['l'])) {
        $offset = $this->clean($_REQUEST['o'], "int");
        $limit = $this->clean($_REQUEST['l'], "int");
    }
    $path = "{$base_dir}/{$name}";
    if (file_exists($path)) {
        $mime_type = mimeType($path);
        $size = filesize($path);
        $start = 0;
        $end = $size - 1;
        header("Content-type: {$mime_type}");
        header("Accept-Ranges: bytes");
        if (isset($_SERVER['HTTP_RANGE'])) {
            $this->serveRangeRequest($path, $size, $start, $end);
            return;
        }
        header("Content-Length: " . $size);
        header("Content-Range: bytes {$start}-{$end}/{$size}");
        if (isset($offset) && isset($limit)) {
            echo file_get_contents($path, false, NULL, $offset, $limit);
        } else {
            readfile($path);
        }
    } else {
        header("Location:./error.php");
    }
}
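/*
 * Illustrative request (an assumption about the surrounding framework's
 * routing, shown only to make the flow concrete): a file stored under the
 * resources folder would be served via a query string along the lines of
 *
 *   ?c=resource&a=get&f=resources&n=some_image.png
 *
 * Adding t to the query string makes this method append ".jpg" and serve the
 * thumbnail version, while o and l request an offset/limit byte slice.
 */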