/**
 * Gets a list of urls from the current fetch batch provided by the queue
 * server, downloads these pages, and finally reschedules, if possible,
 * pages that did not successfully get downloaded.
 *
 * @return array an associative array of web pages and meta data
 *     fetched from the internet
 */
function downloadPagesWebCrawl()
{
    $start_time = microtime();
    $can_schedule_again = false;
    if (count($this->to_crawl) > 0) {
        $can_schedule_again = true;
    }
    $sites = $this->getFetchSites();
    crawlLog("Done getting list of " . count($sites) .
        " sites to download...");
    if (!$sites) {
        crawlLog("No seeds to fetch...");
        sleep(max(0, ceil(MINIMUM_FETCH_LOOP_TIME -
            changeInMicrotime($start_time))));
        return array();
    }
    $prefix = $this->fetcher_num . "-";
    $tmp_dir = CRAWL_DIR . "/{$prefix}temp";
    $filtered_sites = array();
    $site_pages = array();
    foreach ($sites as $site) {
        /* urls of the form real_url###!title###!description denote
           hard-coded pages: synthesize the page rather than fetch it */
        $hard_coded_parts = explode("###!", $site[self::URL]);
        if (count($hard_coded_parts) > 1) {
            if (!isset($hard_coded_parts[2])) {
                $hard_coded_parts[2] = "";
            }
            $site[self::URL] = $hard_coded_parts[0];
            $title = urldecode($hard_coded_parts[1]);
            $description = urldecode($hard_coded_parts[2]);
            $site[self::PAGE] = "<html><head><title>{$title}" .
                "</title></head><body><h1>{$title}</h1>" .
                "<p>{$description}</p></body></html>";
            $site[self::HTTP_CODE] = 200;
            $site[self::TYPE] = "text/html";
            $site[self::ENCODING] = "UTF-8";
            $site[self::IP_ADDRESSES] = array("0.0.0.0");
            $site[self::TIMESTAMP] = time();
            $site_pages[] = $site;
        } else {
            $filtered_sites[] = $site;
        }
    }
    $site_pages = array_merge($site_pages,
        FetchUrl::getPages($filtered_sites, true,
            $this->page_range_request, $tmp_dir, self::URL, self::PAGE,
            false, NULL, false, $this->tor_proxy, $this->proxy_servers));
    crawlLog("..getPages call complete..");
    $num_pages = count($site_pages);
    for ($j = 0; $j < $num_pages; $j++) {
        if (isset($site_pages[$j][self::REPOSITORY_TYPE])) {
            $git_repository_url = $site_pages[$j][self::URL];
            $git_compressed_content =
                FetchGitRepositoryUrls::getGitdata($git_repository_url);
            $git_uncompressed_content =
                gzuncompress($git_compressed_content);
            $length = strlen($git_uncompressed_content);
            // strip the git object header, which ends at the first NUL
            $git_hash_end = strpos($git_uncompressed_content,
                self::HEX_NULL_CHARACTER);
            $git_uncompressed_content = substr($git_uncompressed_content,
                $git_hash_end + 1, $length);
            $site_pages[$j][self::PAGE] = $git_uncompressed_content;
            $mime_type = UrlParser::guessMimeTypeFromFileName(
                $site_pages[$j][self::FILE_NAME]);
            $site_pages[$j][self::TYPE] = $mime_type;
        }
    }
    list($downloaded_pages, $schedule_again_pages) =
        $this->reschedulePages($site_pages);
    if ($can_schedule_again) {
        //only schedule to crawl again on fail sites without crawl-delay
        crawlLog(" Scheduling again..");
        foreach ($schedule_again_pages as $schedule_again_page) {
            if (isset($schedule_again_page[self::CRAWL_DELAY]) &&
                $schedule_again_page[self::CRAWL_DELAY] == 0) {
                $this->to_crawl_again[] =
                    array($schedule_again_page[self::URL],
                        $schedule_again_page[self::WEIGHT],
                        $schedule_again_page[self::CRAWL_DELAY]);
            }
            crawlLog("....reschedule count:" .
                count($this->to_crawl_again));
        }
        crawlLog("....done.");
    }
    crawlLog("Downloading complete");
    return $downloaded_pages;
}
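/*
 * Illustrative sketch (not part of the class): how a hard-coded url of
 * the form url###!title###!description from the loop above could be
 * built and decoded. The sample url, title, and description here are
 * made up for illustration.
 *
 *     $url = "http://www.example.com/faq###!" . urlencode("FAQ") .
 *         "###!" . urlencode("Frequently asked questions");
 *     $parts = explode("###!", $url);
 *     // $parts[0] == "http://www.example.com/faq"
 *     // urldecode($parts[1]) == "FAQ"
 *     // urldecode($parts[2]) == "Frequently asked questions"
 */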
/**
 * Makes multi_curl requests for an array of sites with urls or onion urls
 *
 * @param array $sites an array containing urls of pages to request
 * @param bool $timer flag, true means print timing statistics to log
 * @param int $page_range_request maximum number of bytes to download/page,
 *     0 means download all
 * @param string $temp_dir folder to store temporary ip header info
 * @param string $key the component of $sites[$i] that has the value of
 *     a url to get, defaults to URL
 * @param string $value component of $sites[$i] in which to store the
 *     page that was gotten
 * @param bool $minimal if true do a faster request of pages by not
 *     doing things like extracting the HTTP headers sent, etc.
 * @param array $post_data data to be POST'd to each site
 * @param bool $follow whether to follow redirects or not
 * @param string $tor_proxy url of a proxy that knows how to download
 *     .onion urls
 * @param array $proxy_servers if not array(), then an array of proxy
 *     servers to use rather than directly downloading web pages from
 *     the current machine
 *
 * @return array an updated array with the contents of those pages
 */
static function getPages($sites, $timer = false,
    $page_range_request = PAGE_RANGE_REQUEST, $temp_dir = NULL,
    $key = CrawlConstants::URL, $value = CrawlConstants::PAGE,
    $minimal = false, $post_data = NULL, $follow = false,
    $tor_proxy = "", $proxy_servers = array())
{
    $agent_handler = curl_multi_init();
    $active = NULL;
    $start_time = microtime();
    if (!$minimal && $temp_dir == NULL) {
        $temp_dir = CRAWL_DIR . "/temp";
        if (!file_exists($temp_dir)) {
            mkdir($temp_dir);
        }
    }
    //Set-up requests
    $num_sites = count($sites);
    for ($i = 0; $i < $num_sites; $i++) {
        $is_gopher = false;
        $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
        if (isset($sites[$i][$key])) {
            list($sites[$i][$key], $url, $headers) =
                self::prepareUrlHeaders($sites[$i][$key], $minimal,
                    $proxy_servers);
            if ($headers == "gopher") {
                $is_gopher = true;
                $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
                $headers = array();
            }
            $sites[$i][0] = curl_init();
            if (!$minimal) {
                // capture verbose curl output so ip info can be read back
                $ip_holder[$i] = fopen("{$temp_dir}/tmp{$i}.txt", 'w+');
                curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
                curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
            }
            curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT);
            curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE,
                CURL_IPRESOLVE_WHATEVER);
            curl_setopt($sites[$i][0], CURLOPT_URL, $url);
            if (strcmp(substr($url, -10), "robots.txt") == 0) {
                $sites[$i]['ROBOT'] = true;
                /* wikipedia redirects its robots page;
                   want to force following redirects for robots pages */
                $follow = true;
            }
            curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow);
            curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
            curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
            curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
            curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT,
                PAGE_TIMEOUT);
            curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, PAGE_TIMEOUT);
            if (stripos($url, '.onion') !== false && $tor_proxy != "") {
                curl_setopt($sites[$i][0], CURLOPT_PROXY, $tor_proxy);
                //CURLPROXY_SOCKS5_HOSTNAME = 7
                curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, 7);
                if ($timer) {
                    crawlLog("Using Tor proxy for {$url}..");
                }
            } elseif ($proxy_servers != array() && !$is_gopher) {
                $select_proxy = rand(0, count($proxy_servers) - 1);
                $proxy_server = $proxy_servers[$select_proxy];
                $proxy_parts = explode(":", $proxy_server);
                $proxy_ip = $proxy_parts[0];
                if (!isset($proxy_parts[2]) ||
                    strtolower($proxy_parts[2]) == 'http') {
                    $proxy_type = CURLPROXY_HTTP;
                } elseif (strtolower($proxy_parts[2]) == 'socks5') {
                    $proxy_type = CURLPROXY_SOCKS5;
                } else {
                    $proxy_type = $proxy_parts[2];
                }
                if (isset($proxy_parts[1])) {
                    $proxy_port = $proxy_parts[1];
                } else {
                    $proxy_port = "80";
                }
                curl_setopt($sites[$i][0], CURLOPT_PROXY,
                    "{$proxy_ip}:{$proxy_port}");
                curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE,
                    $proxy_type);
                if ($timer) {
                    crawlLog("Selecting proxy {$select_proxy} for {$url}");
                }
            }
            if (!$minimal) {
                curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
            }
            //make lighttpd happier
            if (!$is_gopher) {
                curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER, $headers);
            }
            // need to set for sites like att that use gzip
            curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
            if ($page_range_request > 0) {
                curl_setopt($sites[$i][0], CURLOPT_RANGE,
                    "0-" . $page_range_request);
            }
            if ($post_data != NULL) {
                curl_setopt($sites[$i][0], CURLOPT_POST, true);
                curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS,
                    $post_data[$i]);
            }
            curl_multi_add_handle($agent_handler, $sites[$i][0]);
        }
    }
    if ($timer) {
        crawlLog(" Init Get Pages " . changeInMicrotime($start_time));
    }
    $start_time = microtime();
    $start = time();
    //Wait for responses
    $running = NULL;
    $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
    do {
        $mrc = curl_multi_exec($agent_handler, $running);
        curl_multi_select($agent_handler, 0.005);
    } while (memory_get_usage() < $memory_limit &&
        time() - $start < PAGE_TIMEOUT && $running > 0);
    if (time() - $start > PAGE_TIMEOUT && $timer) {
        crawlLog("  TIMED OUT!!!");
    }
    if ($timer) {
        crawlLog(" Page Request time " . changeInMicrotime($start_time));
    }
    $start_time = microtime();
    //Process returned pages
    for ($i = 0; $i < $num_sites; $i++) {
        if ($timer) {
            crawlTimeoutLog("fetch_url initial processing of page %s of %s",
                $i, $num_sites);
        }
        if (!$minimal && isset($ip_holder[$i])) {
            rewind($ip_holder[$i]);
            $header = fread($ip_holder[$i], 8192);
            $ip_addresses = self::getCurlIp($header);
            fclose($ip_holder[$i]);
        }
        $is_gopher = false;
        if (isset($sites[$i][0]) && $sites[$i][0]) {
            // Get Data and Message Code
            $content = @curl_multi_getcontent($sites[$i][0]);
            $is_gopher = $sites[$i][CrawlConstants::IS_GOPHER_URL];
            /* if the Transfer-Encoding was chunked then the Range header
               we sent was ignored, so we manually truncate the data here
             */
            if ($page_range_request > 0) {
                $content = substr($content, 0, $page_range_request);
            }
            if (isset($content) && !$minimal && !$is_gopher) {
                $site = self::parseHeaderPage($content, $value);
                $sites[$i] = array_merge($sites[$i], $site);
                if (isset($header)) {
                    $header = substr($header, 0,
                        strpos($header, "\r\n\r\n") + 4);
                } else {
                    $header = "";
                }
                $sites[$i][CrawlConstants::HEADER] =
                    $header . $sites[$i][CrawlConstants::HEADER];
                unset($header);
            } elseif (isset($content) && !$minimal && $is_gopher) {
                $sites[$i][CrawlConstants::HEADER] = $header;
                $sites[$i][$value] = $content;
                unset($header);
            } else {
                $sites[$i][$value] = $content;
            }
            if (!$minimal) {
                $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0],
                    CURLINFO_SIZE_DOWNLOAD);
                $sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0],
                    CURLINFO_NAMELOOKUP_TIME);
                $sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0],
                    CURLINFO_TOTAL_TIME);
                $sites[$i][self::HTTP_CODE] =
                    curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE);
                if (!$sites[$i][self::HTTP_CODE] && !$is_gopher) {
                    $sites[$i][self::HTTP_CODE] =
                        curl_error($sites[$i][0]);
                } elseif ($is_gopher) {
                    /* gopher has no status codes; treat a completed
                       request as a 200 (a bare else here would clobber
                       every real HTTP code) */
                    $sites[$i][self::HTTP_CODE] = 200;
                }
                if ($ip_addresses) {
                    $sites[$i][self::IP_ADDRESSES] = $ip_addresses;
                } else {
                    $sites[$i][self::IP_ADDRESSES] = array("0.0.0.0");
                }
                //Get Time, Mime type and Character encoding
                $sites[$i][self::TIMESTAMP] = time();
                if ($is_gopher) {
                    $path = UrlParser::getPath($sites[$i][self::URL]);
                    $filename = UrlParser::getDocumentFilename(
                        $sites[$i][self::URL]);
                    if (isset($path[1])) {
                        $gopher_type = $path[1];
                    } else {
                        $gopher_type = 1;
                    }
                    if ($gopher_type == 1) {
                        $sites[$i][self::TYPE] = "text/gopher";
                    } elseif (in_array($gopher_type, array(0, 3, 6))) {
                        $sites[$i][self::TYPE] = "text/plain";
                        if ($gopher_type == 6) {
                            $sites[$i][$value] =
                                convert_uudecode($content);
                        }
                    } elseif ($gopher_type == 'h') {
                        $sites[$i][self::TYPE] = "text/html";
                    } elseif ($gopher_type == 'g') {
                        $sites[$i][self::TYPE] = "image/gif";
                    }
                    $path_info = pathinfo($filename);
                    if (!isset($sites[$i][self::TYPE]) &&
                        isset($path_info['extension'])) {
                        $sites[$i][self::TYPE] =
                            UrlParser::guessMimeTypeFromFileName($filename);
                    } elseif (!isset($sites[$i][self::TYPE])) {
                        $sites[$i][self::TYPE] = "unknown";
                    }
                } else {
                    $type_parts = explode(";",
                        curl_getinfo($sites[$i][0],
                        CURLINFO_CONTENT_TYPE));
                    $sites[$i][self::TYPE] =
                        strtolower(trim($type_parts[0]));
                }
            }
            //curl_multi_remove_handle($agent_handler, $sites[$i][0]);
            curl_close($sites[$i][0]);
            if (isset($sites[$i]['ROBOT']) && $sites[$i]['ROBOT']) {
                if (isset($sites[$i][self::TYPE]) &&
                    $sites[$i][self::TYPE] != "text/plain" &&
                    isset($sites[$i][CrawlConstants::LOCATION]) &&
                    count($sites[$i][CrawlConstants::LOCATION]) > 0) {
                    /* a robots.txt that redirected and came back as
                       non-plain text is suspect; pretend it disallowed
                       everything */
                    $original_code = $sites[$i][self::HTTP_CODE];
                    $sites[$i][self::TYPE] = "text/plain";
                    $tmp = wordwrap($sites[$i][$value], 80);
                    $tmp_parts = explode("\n", $tmp);
                    $tmp = "# Suspect server misconfiguration\n";
                    $tmp .= "# Assume shouldn't crawl this site.\n";
                    $tmp .= "# Pretending got following robots.txt.\n";
                    $tmp .= "User-agent: *\n";
                    $tmp .= "Disallow: /\n";
                    $tmp .= "# Original error code: " .
                        $original_code . "\n";
                    $tmp .= "# Original content:\n";
                    foreach ($tmp_parts as $part) {
                        $tmp .= "#" . $part . "\n";
                    }
                    $sites[$i][$value] = $tmp;
                    $sites[$i][self::HTTP_CODE] = "200";
                    unset($sites[$i][CrawlConstants::LOCATION]);
                }
            }
        } //end big if
    } //end for
    if ($timer) {
        crawlLog("  Get Page Content time " .
            changeInMicrotime($start_time));
    }
    curl_multi_close($agent_handler);
    return $sites;
}
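/*
 * Illustrative sketch (not part of the class): a minimal call to
 * FetchUrl::getPages. It assumes the crawler's configuration constants
 * (PAGE_TIMEOUT, PAGE_RANGE_REQUEST, USER_AGENT, CRAWL_DIR) are already
 * loaded and uses a made-up url. Per the doc comment above, with default
 * arguments the downloaded page lands under CrawlConstants::PAGE and the
 * status under CrawlConstants::HTTP_CODE of each site array.
 *
 *     $sites = array(
 *         array(CrawlConstants::URL => "http://www.example.com/"),
 *     );
 *     $sites = FetchUrl::getPages($sites, true);
 *     echo $sites[0][CrawlConstants::HTTP_CODE] . "\n";
 *     echo substr($sites[0][CrawlConstants::PAGE], 0, 100) . "\n";
 */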