Example no. 1
 /**
  * {@inheritDoc}
  *
  * @param string $page  the image represented as a character string
  * @param string $url  the url where the image was downloaded from
  * @return array summary information including a thumbnail and a
  *     description (where the description is derived from the url's
  *     document filename)
  */
 function process($page, $url)
 {
     // initialize so null (rather than an undefined variable) is returned
     // when $page is not a string
     $summary = null;
     if (is_string($page)) {
         $image = $this->imagecreatefrombmp($page);
         $thumb_string = self::createThumb($image);
         $summary[self::TITLE] = "";
         $summary[self::DESCRIPTION] = "Image of " . UrlParser::getDocumentFilename($url);
         $summary[self::LINKS] = array();
         $summary[self::PAGE] = "<html><body><div><img " .
             "src='data:image/bmp;base64," . base64_encode($page) .
             "' alt='" . $summary[self::DESCRIPTION] . "' /></div></body></html>";
         $summary[self::THUMB] = 'data:image/jpeg;base64,' .
             base64_encode($thumb_string);
     }
     return $summary;
 }
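A minimal usage sketch for the processor above (hypothetical: the BmpProcessor
class name, file path, and url are illustrative assumptions, not taken from the
example):

 // hypothetical caller; class name, path, and url are assumptions
 $page = file_get_contents("/tmp/sample.bmp");  // raw BMP bytes as a string
 $url = "http://www.example.com/images/sample.bmp";
 $processor = new BmpProcessor();
 $summary = $processor->process($page, $url);
 echo $summary[BmpProcessor::DESCRIPTION];  // e.g., "Image of sample"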
Example no. 2
 /**
  * Prepare an array of up to NUM_MULTI_CURL_PAGES' worth of sites to be
  * downloaded in one go using the to_crawl array. Delete these sites
  * from the to_crawl array.
  *
  * @return array sites which are ready to be downloaded
  */
 function getFetchSites()
 {
     $web_archive = $this->web_archive;
     $start_time = microtime();
     $seeds = array();
     $delete_indices = array();
     $num_items = count($this->to_crawl);
     if ($num_items > 0) {
         $crawl_source =& $this->to_crawl;
         $to_crawl_flag = true;
     } else {
         crawlLog("...Trying to crawl sites which failed the first time");
         $num_items = count($this->to_crawl_again);
         $crawl_source =& $this->to_crawl_again;
         $to_crawl_flag = false;
     }
     reset($crawl_source);
     if ($num_items > NUM_MULTI_CURL_PAGES) {
         $num_items = NUM_MULTI_CURL_PAGES;
     }
     //DNS lookups take longer so try to get fewer in one go
     $num_ip_lookups = max($num_items / 3, 2);
     $i = 0;
     $ip_lookup_cnt = 0;
     $site_pair = each($crawl_source);
     while ($site_pair !== false && $i < $num_items && $ip_lookup_cnt < $num_ip_lookups) {
         $delete_indices[] = $site_pair['key'];
         if ($site_pair['value'][0] != self::DUMMY) {
             $host = UrlParser::getHost($site_pair['value'][0]);
             if (!strpos($site_pair['value'][0], "###")) {
                 $ip_lookup_cnt++;
             }
             // only download if host doesn't seem congested
             if (!isset($this->hosts_with_errors[$host]) ||
                 $this->hosts_with_errors[$host] < DOWNLOAD_ERROR_THRESHOLD) {
                 $url_to_check = $site_pair['value'][0];
                 $extension = UrlParser::getDocumentType($url_to_check);
                 $repository_indicator = FetchGitRepositoryUrls::checkForRepository($extension);
                 if ($repository_indicator == self::REPOSITORY_GIT) {
                     $git_internal_urls = FetchGitRepositoryUrls::setGitRepositoryUrl(
                         $url_to_check, $i, $seeds, $repository_indicator,
                         $site_pair, $this->total_git_urls, $this->all_git_urls);
                     $i = $git_internal_urls['position'];
                     $git_url_index = $git_internal_urls['index'];
                     $seeds = $git_internal_urls['seeds'];
                     $repository_indicator = $git_internal_urls['indicator'];
                     $this->total_git_urls = $git_internal_urls['count'];
                     $this->all_git_urls = $git_internal_urls['all'];
                 } else {
                     $seeds[$i][self::URL] = $site_pair['value'][0];
                     $seeds[$i][self::WEIGHT] = $site_pair['value'][1];
                     $seeds[$i][self::CRAWL_DELAY] = $site_pair['value'][2];
                 }
                 /*
                   Crawl delay is only used in scheduling on the queue_server.
                   On the fetcher, we only use crawl-delay to determine
                   whether to give a page a second try if it doesn't
                   download the first time.
                 */
                 if (UrlParser::getDocumentFilename($seeds[$i][self::URL]) .
                     "." . UrlParser::getDocumentType($seeds[$i][self::URL]) ==
                     "robots.txt") {
                     $seeds[$i][self::ROBOT_PATHS] = array();
                 }
                 $i++;
             }
         } else {
             break;
         }
         $site_pair = each($crawl_source);
     }
     //end while
     foreach ($delete_indices as $delete_index) {
         $git_set = false;
         if ($to_crawl_flag == true) {
             $extension = UrlParser::getDocumentType($this->to_crawl[$delete_index][0]);
             $repository_type = FetchGitRepositoryUrls::checkForRepository($extension);
             if ($repository_type != self::REPOSITORY_GIT) {
                 unset($this->to_crawl[$delete_index]);
             }
         } else {
             $extension = UrlParser::getDocumentType($this->to_crawl_again[$delete_index][0]);
             $repository_type = FetchGitRepositoryUrls::checkForRepository($extension);
             unset($this->to_crawl_again[$delete_index]);
         }
         if ($repository_type == self::REPOSITORY_GIT) {
             if (!$git_set) {
                 $next_url_start = $url_to_check . self::GIT_URL_CONTINUE . $git_url_index;
                 $git_set = true;
                 $this->to_crawl[$delete_index][0] = $next_url_start;
             }
             if ($repository_indicator == self::INDICATOR_NONE) {
                 unset($this->to_crawl[$delete_index]);
             }
         }
     }
     crawlLog("Fetch url list to download time " . changeInMicrotime($start_time));
     return $seeds;
 }
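A hedged sketch of how this method might be driven (assumptions: a Fetcher-like
object exposing to_crawl, whose entries are (url, weight, crawl-delay) triples
as the loop above reads them):

 // hypothetical driver; $fetcher and the to_crawl layout are assumptions
 $fetcher->to_crawl = array(
     array("http://www.example.com/", 1.0, 0),
     array("http://www.example.com/robots.txt", 1.0, 0),
 );
 $batch = $fetcher->getFetchSites();
 crawlLog("Prepared " . count($batch) . " sites for this multi-curl round");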
Example no. 3
 /**
  * Make multi_curl requests for an array of sites with urls or onion urls
  *
  * @param array $sites  an array containing urls of pages to request
  * @param bool $timer  flag, true means print timing statistics to log
  * @param int $page_range_request maximum number of bytes to download/page
  *     0 means download all
  * @param string $temp_dir folder to store temporary ip header info
  * @param string $key  the component of $sites[$i] that has the value of
  *     a url to get; defaults to URL
  * @param string $value component of $sites[$i] in which to store the
  *     page that was gotten
  * @param bool $minimal if true, do a faster request of pages by not
  *     doing things like extracting the HTTP headers sent, etc.
  * @param array $post_data data to be POST'd to each site
  * @param bool $follow whether to follow redirects or not
  * @param string $tor_proxy url of a proxy that knows how to download
  *     .onion urls
  * @param array $proxy_servers if not array(), then an array of proxy
  *     servers to use rather than downloading web pages directly from
  *     the current machine
  *
  * @return array an updated array with the contents of those pages
  */
 static function getPages($sites, $timer = false,
     $page_range_request = PAGE_RANGE_REQUEST, $temp_dir = NULL,
     $key = CrawlConstants::URL, $value = CrawlConstants::PAGE,
     $minimal = false, $post_data = NULL, $follow = false, $tor_proxy = "",
     $proxy_servers = array())
 {
     $agent_handler = curl_multi_init();
     $active = NULL;
     $start_time = microtime();
     if (!$minimal && $temp_dir == NULL) {
         $temp_dir = CRAWL_DIR . "/temp";
         if (!file_exists($temp_dir)) {
             mkdir($temp_dir);
         }
     }
     //Set-up requests
     $num_sites = count($sites);
     for ($i = 0; $i < $num_sites; $i++) {
         $is_gopher = false;
         $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
         if (isset($sites[$i][$key])) {
             list($sites[$i][$key], $url, $headers) = self::prepareUrlHeaders(
                 $sites[$i][$key], $minimal, $proxy_servers);
             if ($headers == "gopher") {
                 $is_gopher = true;
                 $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
                 $headers = array();
             }
             $sites[$i][0] = curl_init();
             if (!$minimal) {
                 $ip_holder[$i] = fopen("{$temp_dir}/tmp{$i}.txt", 'w+');
                 curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
                 curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
             }
             curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT);
             curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE, CURL_IPRESOLVE_WHATEVER);
             curl_setopt($sites[$i][0], CURLOPT_URL, $url);
             if (strcmp(substr($url, -10), "robots.txt") == 0) {
                 $sites[$i]['ROBOT'] = true;
                 $follow = true;
                 /* Wikipedia redirects its robots page, so force following
                    redirects for robots.txt requests
                  */
             }
             curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow);
             curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
             curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
             curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
             curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT, PAGE_TIMEOUT);
             curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, PAGE_TIMEOUT);
             if (stripos($url, '.onion') !== false && $tor_proxy != "") {
                 curl_setopt($sites[$i][0], CURLOPT_PROXY, $tor_proxy);
                 //CURLPROXY_SOCKS5_HOSTNAME = 7
                 curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, 7);
                 if ($timer) {
                     crawlLog("Using Tor proxy for {$url}..");
                 }
             } else {
                 if ($proxy_servers != array() && !$is_gopher) {
                     $select_proxy = rand(0, count($proxy_servers) - 1);
                     $proxy_server = $proxy_servers[$select_proxy];
                     $proxy_parts = explode(":", $proxy_server);
                     $proxy_ip = $proxy_parts[0];
                     if (!isset($proxy_parts[2]) ||
                         strtolower($proxy_parts[2]) == 'http') {
                         $proxy_type = CURLPROXY_HTTP;
                     } elseif (strtolower($proxy_parts[2]) == 'socks5') {
                         $proxy_type = CURLPROXY_SOCKS5;
                     } else {
                         $proxy_type = $proxy_parts[2];
                     }
                     if (isset($proxy_parts[1])) {
                         $proxy_port = $proxy_parts[1];
                     } else {
                         $proxy_port = "80";
                     }
                     curl_setopt($sites[$i][0], CURLOPT_PROXY, "{$proxy_ip}:{$proxy_port}");
                     curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, $proxy_type);
                     if ($timer) {
                         crawlLog("Selecting proxy {$select_proxy} for {$url}");
                     }
                 }
             }
             if (!$minimal) {
                 curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
             }
             //make lighttpd happier
             if (!$is_gopher) {
                 curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER, $headers);
             }
             curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
             // ^ need to set for sites like att that use gzip
             if ($page_range_request > 0) {
                 curl_setopt($sites[$i][0], CURLOPT_RANGE, "0-" . $page_range_request);
             }
             if ($post_data != NULL) {
                 curl_setopt($sites[$i][0], CURLOPT_POST, true);
                 curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS, $post_data[$i]);
             }
             curl_multi_add_handle($agent_handler, $sites[$i][0]);
         }
     }
     if ($timer) {
         crawlLog("  Init Get Pages " . changeInMicrotime($start_time));
     }
     $start_time = microtime();
     $start = time();
     //Wait for responses
     $running = NULL;
     $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
     do {
         $mrc = curl_multi_exec($agent_handler, $running);
         $ready = curl_multi_select($agent_handler, 0.005);
     } while (memory_get_usage() < $memory_limit && time() - $start < PAGE_TIMEOUT && $running > 0);
     if (time() - $start > PAGE_TIMEOUT && $timer) {
         crawlLog("  TIMED OUT!!!");
     }
     if ($timer) {
         crawlLog("  Page Request time " . changeInMicrotime($start_time));
     }
     $start_time = microtime();
     //Process returned pages
     for ($i = 0; $i < $num_sites; $i++) {
         if ($timer) {
             crawlTimeoutLog("fetch_url initial processing of page %s of %s", $i, $num_sites);
         }
         if (!$minimal && isset($ip_holder[$i])) {
             rewind($ip_holder[$i]);
             $header = fread($ip_holder[$i], 8192);
             $ip_addresses = self::getCurlIp($header);
             fclose($ip_holder[$i]);
         }
         $is_gopher = false;
         if (isset($sites[$i][0]) && $sites[$i][0]) {
             // Get Data and Message Code
             $content = @curl_multi_getcontent($sites[$i][0]);
             $is_gopher = $sites[$i][CrawlConstants::IS_GOPHER_URL];
             /*
                If the Transfer-encoding was chunked then the Range header
                we sent was ignored. So we manually truncate the data
                here
             */
             if ($page_range_request > 0) {
                 $content = substr($content, 0, $page_range_request);
             }
             if (isset($content) && !$minimal && !$is_gopher) {
                 $site = self::parseHeaderPage($content, $value);
                 $sites[$i] = array_merge($sites[$i], $site);
                 if (isset($header)) {
                     $header = substr($header, 0, strpos($header, "\r\n\r\n") + 4);
                 } else {
                     $header = "";
                 }
                 $sites[$i][CrawlConstants::HEADER] = $header . $sites[$i][CrawlConstants::HEADER];
                 unset($header);
             } elseif (isset($content) && !$minimal && $is_gopher) {
                 $sites[$i][CrawlConstants::HEADER] = $header;
                 $sites[$i][$value] = $content;
                 unset($header);
             } else {
                 $sites[$i][$value] = $content;
             }
             if (!$minimal) {
                 $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0], CURLINFO_SIZE_DOWNLOAD);
                 $sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0], CURLINFO_NAMELOOKUP_TIME);
                 $sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0], CURLINFO_TOTAL_TIME);
                 $sites[$i][self::HTTP_CODE] = curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE);
                 if (!$sites[$i][self::HTTP_CODE] && !$is_gopher) {
                     $sites[$i][self::HTTP_CODE] = curl_error($sites[$i][0]);
                 } elseif ($is_gopher) {
                     // gopher has no HTTP status line, so mark a completed
                     // fetch as 200 rather than clobbering real HTTP codes
                     $sites[$i][self::HTTP_CODE] = 200;
                 }
                 if ($ip_addresses) {
                     $sites[$i][self::IP_ADDRESSES] = $ip_addresses;
                 } else {
                     $sites[$i][self::IP_ADDRESSES] = array("0.0.0.0");
                 }
                 //Get Time, Mime type and Character encoding
                 $sites[$i][self::TIMESTAMP] = time();
                 if ($is_gopher) {
                     $path = UrlParser::getPath($sites[$i][self::URL]);
                     $filename = UrlParser::getDocumentFilename($sites[$i][self::URL]);
                     if (isset($path[1])) {
                         $gopher_type = $path[1];
                     } else {
                         $gopher_type = 1;
                     }
                     if ($gopher_type == 1) {
                         $sites[$i][self::TYPE] = "text/gopher";
                     } elseif (in_array($gopher_type, array(0, 3, 6))) {
                         $sites[$i][self::TYPE] = "text/plain";
                         if ($gopher_type == 6) {
                             $sites[$i][$value] = convert_uudecode($content);
                         }
                     } elseif ($gopher_type == 'h') {
                         $sites[$i][self::TYPE] = "text/html";
                     } elseif ($gopher_type == 'g') {
                         $sites[$i][self::TYPE] = "image/gif";
                     }
                     $path_info = pathinfo($filename);
                     if (!isset($sites[$i][self::TYPE]) &&
                         isset($path_info['extension'])) {
                         $sites[$i][self::TYPE] =
                             UrlParser::guessMimeTypeFromFileName($filename);
                     } elseif (!isset($sites[$i][self::TYPE])) {
                         $sites[$i][self::TYPE] = "unknown";
                     }
                 } else {
                     $type_parts = explode(";",
                         curl_getinfo($sites[$i][0], CURLINFO_CONTENT_TYPE));
                     $sites[$i][self::TYPE] = strtolower(trim($type_parts[0]));
                 }
             }
             //curl_multi_remove_handle($agent_handler, $sites[$i][0]);
             curl_close($sites[$i][0]);
             if (isset($sites[$i]['ROBOT']) && $sites[$i]['ROBOT']) {
                 if (isset($sites[$i][self::TYPE]) &&
                     $sites[$i][self::TYPE] != "text/plain" &&
                     isset($sites[$i][CrawlConstants::LOCATION]) &&
                     count($site[CrawlConstants::LOCATION]) > 0) {
                     $sites[$i][self::TYPE] = "text/plain";
                     $sites[$i][self::HTTP_CODE] = "200";
                     $tmp = wordwrap($sites[$i][$value], 80);
                     $tmp_parts = explode("\n", $tmp);
                     $tmp = "# Suspect server misconfiguration\n";
                     $tmp .= "# Assume shouldn't crawl this site.\n";
                     $tmp .= "# Pretending got following robots.txt.\n";
                     $tmp .= "User-agent: *\n";
                     $tmp .= "Disallow: /\n";
                     $tmp .= "# Original error code: " . $sites[$i][self::HTTP_CODE] . "\n";
                     $tmp .= "# Original content:\n";
                     foreach ($tmp_parts as $part) {
                         $tmp = "#" . $part . "\n";
                     }
                     $sites[$i][$value] = $tmp;
                     $sites[$i][self::HTTP_CODE] = "200";
                     unset($site[CrawlConstants::LOCATION]);
                 }
             }
         }
         //end big if
     }
     //end for
     if ($timer) {
         crawlLog("  Get Page Content time " . changeInMicrotime($start_time));
     }
     curl_multi_close($agent_handler);
     return $sites;
 }
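One possible calling pattern, assuming getPages() is declared on a
FetchUrl-style class and that the URL/HTTP_CODE/SIZE constants used above come
from CrawlConstants (the URLs are placeholders):

 // hypothetical caller; the class name FetchUrl and the URLs are assumptions
 $sites = array(
     array(CrawlConstants::URL => "http://www.example.com/"),
     array(CrawlConstants::URL => "http://www.example.com/robots.txt"),
 );
 $sites = FetchUrl::getPages($sites, true);
 foreach ($sites as $site) {
     crawlLog($site[CrawlConstants::URL] . " => HTTP " .
         $site[CrawlConstants::HTTP_CODE] . ", " .
         $site[CrawlConstants::SIZE] . " bytes");
 }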
Example no. 4
 /**
  * Used to recompute the dictionary of an index archive -- either from
  * scratch using the index shard data or just using the current dictionary
  * but merging the tiers into one tier
  *
  * @param string $path file path to dictionary of an IndexArchiveBundle
  * @param int $max_tier tier up to which the dictionary tiers should be
  *     merged (typically a value greater than the max_tier of the
  *     dictionary)
  */
 function reindexIndexArchive($path, $max_tier = -1)
 {
     if ($this->getArchiveKind($path) != "IndexArchiveBundle") {
         echo "\n{$path} ...\n" . "  is not an IndexArchiveBundle so cannot be re-indexed\n\n";
         exit;
     }
     $shards = glob($path . "/posting_doc_shards/index*");
     if (is_array($shards)) {
         // create the database manager before the if so $db is also defined
         // when only merging tiers ($max_tier != -1) and used further below
         $dbms_manager = DBMS . "Manager";
         $db = new $dbms_manager();
         if ($max_tier == -1) {
             $db->unlinkRecursive($path . "/dictionary", false);
             IndexDictionary::makePrefixLetters($path . "/dictionary");
         }
         $dictionary = new IndexDictionary($path . "/dictionary");
         if ($max_tier == -1) {
             $max_generation = 0;
             foreach ($shards as $shard_name) {
                 $file_name = UrlParser::getDocumentFilename($shard_name);
                 $generation = (int) substr($file_name, strlen("index"));
                 $max_generation = max($max_generation, $generation);
             }
             for ($i = 0; $i < $max_generation + 1; $i++) {
                 $shard_name = $path . "/posting_doc_shards/index{$i}";
                 echo "\nShard {$i}\n";
                 $shard = new IndexShard($shard_name, $i, NUM_DOCS_PER_GENERATION, true);
                 $dictionary->addShardDictionary($shard);
             }
             $max_tier = $dictionary->max_tier;
         }
         echo "\nFinal Merge Tiers\n";
         $dictionary->mergeAllTiers(NULL, $max_tier);
         $db->setWorldPermissionsRecursive($path . "/dictionary");
         echo "\nReindex complete!!\n";
     } else {
         echo "\n{$path} ...\n" . "  does not contain posting shards so cannot be re-indexed\n\n";
     }
 }
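A brief call-site sketch (assumptions: the method lives on an ArcTool-like
command line object and the bundle path is a placeholder):

 // hypothetical invocation; class name and bundle path are assumptions
 $arc_tool = new ArcTool();
 // rebuild the dictionary from the posting_doc_shards from scratch
 $arc_tool->reindexIndexArchive(CRAWL_DIR . "/cache/IndexData1234567890");
 // or only merge the existing dictionary tiers up to a given tier
 $arc_tool->reindexIndexArchive(CRAWL_DIR . "/cache/IndexData1234567890", 3);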
Example no. 5
 /**
  * Gets the resource $_REQUEST['n'] from APP_DIR/$_REQUEST['f'] or
  * CRAWL_DIR/$_REQUEST['f']  after cleaning
  */
 function get()
 {
     if (!isset($_REQUEST['n']) || !isset($_REQUEST['f'])) {
         return;
     }
     $name = $this->clean($_REQUEST['n'], "string");
     if (in_array($_REQUEST['f'], array("css", "scripts", "resources"))) {
         /* notice in this case we don't check whether the request comes from
               a legitimate source, but we do try to restrict it to being
               a file (not a folder) in the above array
            */
         $base_dir = $this->getBaseFolder();
         if (!$base_dir) {
             header('HTTP/1.1 401 Unauthorized');
             echo "<html><head><title>401 Unauthorized</title></head>" . "<body><p>401 Unauthorized</p></body></html>";
             return;
         }
         $type = UrlParser::getDocumentType($name);
         $name = UrlParser::getDocumentFilename($name);
         $name = $type != "" ? "{$name}.{$type}" : $name;
         if (isset($_REQUEST['t'])) {
             $name .= ".jpg";
         }
     } elseif (in_array($_REQUEST['f'], array("cache"))) {
         /* perform a check since these requests should come from a known
            machine
          */
         if (!$this->checkRequest()) {
             return;
         }
         $folder = $_REQUEST['f'];
         $base_dir = CRAWL_DIR . "/{$folder}";
     } else {
         return;
     }
     if (isset($_REQUEST['o']) && isset($_REQUEST['l'])) {
         $offset = $this->clean($_REQUEST['o'], "int");
         $limit = $this->clean($_REQUEST['l'], "int");
     }
     $path = "{$base_dir}/{$name}";
     if (file_exists($path)) {
         $mime_type = mimeType($path);
         $size = filesize($path);
         $start = 0;
         $end = $size - 1;
         header("Content-type: {$mime_type}");
         header("Accept-Ranges: bytes");
         if (isset($_SERVER['HTTP_RANGE'])) {
             $this->serveRangeRequest($path, $size, $start, $end);
             return;
         }
         header("Content-Length: " . $size);
         header("Content-Range: bytes {$start}-{$end}/{$size}");
         if (isset($offset) && isset($limit)) {
             echo file_get_contents($path, false, NULL, $offset, $limit);
         } else {
             readfile($path);
         }
     } else {
         header("Location:./error.php");
     }
 }
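A sketch of the kind of request this method serves (assumptions: the
ResourceController class name is illustrative, and in production the values
would arrive via an HTTP request rather than being set directly):

 // hypothetical test harness; class and file names are assumptions
 $_REQUEST['f'] = "resources";  // one of the allowed folders
 $_REQUEST['n'] = "logo.png";   // resource name, cleaned before use
 $controller = new ResourceController();
 $controller->get();            // emits headers and streams the file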