/**
  * Initiates a new PHPCrawlerCookieDescriptor-object.
  *
  * @param string $source_url URL the cookie was sent from.
  * @param string $name       Cookie-name
  * @param string $value      Cookie-value
  * @param string $expires    Expire-string, e.g. "Sat, 08-Aug-2020 23:59:08 GMT"
  * @param string $path       Cookie-path
  * @param string $domain     Cookie-domain
  * @internal
  */
 public function __construct($source_url, $name, $value, $expires = null, $path = null, $domain = null)
 {
     // For cookie-specs, see e.g. http://curl.haxx.se/rfc/cookie_spec.html
     $this->name = $name;
     $this->value = $value;
     $this->expires = $expires;
     $this->path = $path;
     $this->domain = $domain;
     $source_url_parts = PHPCrawlerUtils::splitURL($source_url);
     // Source-domain
     $this->source_domain = $source_url_parts["domain"];
     // Source-URL
     $this->source_url = $source_url;
     // Send-time
     $this->cookie_send_time = PHPCrawlerBenchmark::getmicrotime();
      // Expire-date to timestamp
     if ($this->expires != null) {
         $this->expire_timestamp = @strtotime($this->expires);
     }
     // If domain doesn't start with "." -> add it (see RFC)
     if ($this->domain != null && substr($this->domain, 0, 1) != ".") {
         $this->domain = "." . $this->domain;
     }
      // Complete missing values
      // If domain not set -> domain is the host of the source-url WITHOUT leading "."! (see RFC)
     if ($this->domain == null) {
         $this->domain = $source_url_parts["host"];
     }
     // If path not set
     if ($this->path == null) {
         $this->path = $source_url_parts["path"];
     }
 }
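A minimal usage sketch of the constructor above (assuming the phpcrawl classes are loaded; all values are invented for illustration):

// Hypothetical cookie received from http://www.example.com/shop/index.php
$Cookie = new PHPCrawlerCookieDescriptor(
    "http://www.example.com/shop/index.php", // source URL the cookie was sent from
    "sessionid",                             // cookie-name
    "abc123",                                // cookie-value
    "Sat, 08-Aug-2020 23:59:08 GMT",         // expire-string
    null,                                    // path -> falls back to the source-URL's path
    "example.com"                            // domain -> gets a leading "." added
);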
 /**
  * Returns all cookies from the cache that are addressed to the given URL
  *
  * @param string $target_url The target-URL
  * @return array  Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     PHPCrawlerBenchmark::start("getting_cookies_from_cache");
     $url_parts = PHPCrawlerUtils::splitURL($target_url);
     $return_cookies = array();
     $Result = $this->PDO->query("SELECT * FROM cookies WHERE source_domain = '" . $url_parts["domain"] . "';");
     $rows = $Result->fetchAll(PDO::FETCH_ASSOC);
     $Result->closeCursor();
     $cnt = count($rows);
     for ($x = 0; $x < $cnt; $x++) {
         // Does the cookie-domain match?
         // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
         // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
         if ($rows[$x]["domain"] == $url_parts["host"] || preg_match("#" . preg_quote($rows[$x]["domain"]) . "\$#", $url_parts["host"])) {
             // Does the path match?
             if (preg_match("#^" . preg_quote($rows[$x]["path"]) . "#", $url_parts["path"])) {
                 $Cookie = new PHPCrawlerCookieDescriptor($rows[$x]["source_url"], $rows[$x]["name"], $rows[$x]["value"], $rows[$x]["expires"], $rows[$x]["path"], $rows[$x]["domain"]);
                 $return_cookies[$Cookie->name] = $Cookie;
                 // Use cookie-name as index to avoid double-cookies
             }
         }
     }
     // Convert to numeric array
     $return_cookies = array_values($return_cookies);
     PHPCrawlerBenchmark::stop("getting_cookies_from_cache");
     return $return_cookies;
 }
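The domain tail-matching and path-prefix checks used by this method (and the variants below) can be illustrated in isolation; the host, domain and path values here are made-up examples:

// Tail-matching: a cookie-domain of ".acme.com" matches the host "anvil.acme.com"
$cookie_domain = ".acme.com";
$target_host = "anvil.acme.com";
$domain_matches = (bool)preg_match("#" . preg_quote($cookie_domain) . "\$#", $target_host);

// Path-matching: a cookie-path of "/shop/" matches the target-path "/shop/cart/"
$cookie_path = "/shop/";
$target_path = "/shop/cart/";
$path_matches = (bool)preg_match("#^" . preg_quote($cookie_path) . "#", $target_path);

var_dump($domain_matches, $path_matches); // bool(true), bool(true)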
 /**
  * Returns all cookies from the cache that are addressed to the given URL
  *
  * @param string $target_url The target-URL
  * @return array  Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     $url_parts = PHPCrawlerUtils::splitURL($target_url);
     $target_domain = $url_parts["domain"];
     // e.g. acme.com
     $return_cookies = array();
      // Iterate over all cookies of this domain (if any)
      $domain_cookies = isset($this->cookies[$target_domain]) ? $this->cookies[$target_domain] : array();
      foreach ($domain_cookies as $Cookie) {
         // Does the cookie-domain match?
         // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
         // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
         if ($Cookie->domain == $url_parts["host"] || preg_match("#" . preg_quote($Cookie->domain) . "\$#", $url_parts["host"])) {
             // Does the path match?
             if (preg_match("#^" . preg_quote($Cookie->path) . "#", $url_parts["path"])) {
                 $return_cookies[$Cookie->name] = $Cookie;
                 // Use cookie-name as index to avoid double-cookies
             }
         }
     }
     // Convert to numeric array
     $return_cookies = array_values($return_cookies);
     return $return_cookies;
 }
 /**
  * Returns all cookies from the cache that are addressed to the given URL
  *
  * @param string $target_url The target-URL
  * @return array  Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     PHPCrawlerBenchmark::start("getting_cookies_from_cache");
     $url_parts = PHPCrawlerUtils::splitURL($target_url);
     $return_cookies = array();
     $result = db_query("SELECT * FROM {" . $this->table . "} WHERE source_domain = '" . $url_parts["domain"] . "' AND crawler_id = '" . $this->crawler_id . "';");
     $rows = $result->fetchAllAssoc('id');
     //     $rows = $this->conn->query("SELECT * FROM " . $this->table . " WHERE source_domain = '".$url_parts["domain"]."' AND crawler_id = '" . $this->crawler_id . "';")->fetchAllAssoc('id');
     // drupal_set_message('<pre>PHPCrawlerD7CookieCache::getCookiesForUrl ' . print_r($rows, 1) . '</pre>');
     $cnt = count($rows);
     for ($x = 0; $x < $cnt; $x++) {
         // Does the cookie-domain match?
         // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
         // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
         if ($rows[$x]->domain == $url_parts["host"] || preg_match("#" . preg_quote($rows[$x]->domain) . "\$#", $url_parts["host"])) {
             // Does the path match?
             if (preg_match("#^" . preg_quote($rows[$x]->path) . "#", $url_parts->path)) {
                 $Cookie = new PHPCrawlerCookieDescriptor($rows[$x]->source_url, $rows[$x]->name, $rows[$x]->value, $rows[$x]->expires, $rows[$x]->path, $rows[$x]->domain);
                 $return_cookies[$Cookie->name] = $Cookie;
                 // Use cookie-name as index to avoid double-cookies
             }
         }
     }
     // Convert to numeric array
     $return_cookies = array_values($return_cookies);
     PHPCrawlerBenchmark::stop("getting_cookies_from_cache");
     return $return_cookies;
 }
 /**
  * Sets/writes the current crawler-status
  *
  * @param PHPCrawlerStatus $crawler_status The status to set
  */
 public function setCrawlerStatus(PHPCrawlerStatus $crawler_status)
 {
     $this->crawlerStatus = $crawler_status;
     // Write crawler-status back to file
     if ($this->write_status_to_file == true) {
         PHPCrawlerUtils::serializeToFile($this->working_directory . "crawlerstatus.tmp", $crawler_status);
     }
 }
 /**
  * Initiates a new PHPCrawlerResponseHeader.
  *
  * @param string $header_string A complete response-header as it was sent by the server
  * @param string $source_url    The URL of the website the header was received from.
  * @internal
  */
 public function __construct($header_string, $source_url)
 {
     $this->header_raw = $header_string;
     $this->source_url = $source_url;
     $this->http_status_code = PHPCrawlerUtils::getHTTPStatusCode($header_string);
     $this->content_type = strtolower(PHPCrawlerUtils::getHeaderValue($header_string, "content-type"));
     $this->content_length = strtolower(PHPCrawlerUtils::getHeaderValue($header_string, "content-length"));
     $this->cookies = PHPCrawlerUtils::getCookiesFromHeader($header_string, $source_url);
 }
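A sketch of how this constructor might be used with a raw header string (the header content is invented; the property names follow the assignments above):

// Hypothetical raw response-header as received from a server
$header_string = "HTTP/1.1 200 OK\r\n" .
                 "Content-Type: text/html; charset=UTF-8\r\n" .
                 "Content-Length: 1024\r\n" .
                 "Set-Cookie: sessionid=abc123; path=/\r\n\r\n";

$Header = new PHPCrawlerResponseHeader($header_string, "http://www.example.com/index.html");
echo $Header->http_status_code; // e.g. 200
echo $Header->content_type;     // e.g. "text/html; charset=utf-8"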
 /**
  * Returns/reads the current crawler-status
  *
  * @return PHPCrawlerStatus The current crawler-status as a PHPCrawlerStatus-object
  */
 public function getCrawlerStatus()
 {
     // Get crawler-status from file if crawler is multiprocessed
     if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE) {
         $this->crawlerStatus = PHPCrawlerUtils::deserializeFromFile($this->working_directory . "crawlerstatus.tmp");
         if ($this->crawlerStatus == null) {
             $this->crawlerStatus = new PHPCrawlerStatus();
         }
     }
     return $this->crawlerStatus;
 }
 /**
  * Returns the PHPCrawlerUrlPartsDescriptor-object for the given URL.
  *
  * @return PHPCrawlerUrlPartsDescriptor
  */
 public static function fromURL($url)
 {
     $parts = PHPCrawlerUtils::splitURL($url);
     $tmp = new PHPCrawlerUrlPartsDescriptor();
     $tmp->protocol = $parts["protocol"];
     $tmp->host = $parts["host"];
     $tmp->path = $parts["path"];
     $tmp->file = $parts["file"];
     $tmp->domain = $parts["domain"];
     $tmp->port = $parts["port"];
     $tmp->auth_username = $parts["auth_username"];
     $tmp->auth_password = $parts["auth_password"];
     return $tmp;
 }
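A usage sketch (the URL is invented; the exact form of the parts depends on PHPCrawlerUtils::splitURL()):

$parts = PHPCrawlerUrlPartsDescriptor::fromURL("http://user:pass@www.example.com:8080/docs/index.html");
echo $parts->host;  // "www.example.com"
echo $parts->path;  // e.g. "/docs/"
echo $parts->file;  // e.g. "index.html"
echo $parts->port;  // 8080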
 /**
  * Adds a basic-authentication (username and password) to the list of authentications that will be send
  * with requests.
  *
  * @param string $url_regex Regular expression defining the URL(s) the authentication should be send to.
  * @param string $username  The username
  * @param string $password  The password
  *
  * @return bool
  */
 public function addBasicAuthentication($url_regex, $username, $password)
 {
     // Check regex
     $regex_okay = PHPCrawlerUtils::checkRegexPattern($url_regex);
     if ($regex_okay == true) {
         // Add authentication to basic_authentications-array
         $tmp = array();
         $tmp["url_regex"] = $url_regex;
         $tmp["username"] = $username;
         $tmp["password"] = $password;
         $this->basic_authentications[] = $tmp;
         return true;
     } else {
         return false;
     }
 }
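A usage sketch, assuming $crawler is an instance of the class providing this method; the URL-pattern and credentials are examples only:

// Send the credentials with all requests to URLs below "/members/"
$crawler->addBasicAuthentication("#http://www\.example\.com/members/.*#", "myuser", "mypassword");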
 /**
  * Adds a rule to the list of rules that decide which URLs found on a page should be ignored by the crawler. 
  */
 public function addURLFilterRule($regex)
 {
     $check = PHPCrawlerUtils::checkRegexPattern($regex);
     // Check pattern
     if ($check == true) {
         $this->url_filter_rules[] = trim($regex);
     }
     return $check;
 }
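A usage sketch, assuming $crawler is an instance of the class providing this method; the pattern is an example only:

// Ignore links to common image-files
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)\$# i");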
 /**
  * Checks whether the hostname of the given URL is already cached
  *
  * @param PHPCrawlerURLDescriptor $URL The URL
  * @return bool
  */
 public function urlHostInCache(PHPCrawlerURLDescriptor $URL)
 {
     $url_parts = PHPCrawlerUtils::splitURL($URL->url_rebuild);
     return $this->hostInCache($url_parts["host"]);
 }
 protected function addLinkToCache($link_raw, $link_code, $link_text = "")
 {
     //PHPCrawlerBenchmark::start("preparing_link_for_cache");
      // If the link was already found and processed -> skip it
     if (isset($this->found_links_map[$link_raw])) {
         return;
     }
     // Rebuild URL from link
     $url_rebuild = PHPCrawlerUtils::buildURLFromLink($link_raw, $this->baseUrlParts);
      // If the link couldn't be rebuilt
     if ($url_rebuild == null) {
         return;
     }
      // Create a PHPCrawlerURLDescriptor-object with URL-data
     $UrlDescriptor = new PHPCrawlerURLDescriptor($url_rebuild, $link_raw, $link_code, $link_text, $this->SourceUrl->url_rebuild);
     // Add the PHPCrawlerURLDescriptor-object to LinkCache
     $this->LinkCache->addURL($UrlDescriptor);
     // Add the PHPCrawlerURLDescriptor-object to found-links-array
     $map_key = $link_raw;
     $this->found_links_map[$map_key] = true;
     //PHPCrawlerBenchmark::stop("preparing_link_for_cache");
 }
 function receivePage($url_to_crawl, $referer_url)
 {
     // Check if tmp-file was set by the user, otherwise set a default one
     if ($this->tmp_file == "") {
         $this->tmp_file = uniqid(time()) . ".tmp";
     }
     // Define some vars
     $source_read = "";
     $bytes_received = 0;
     $stream_to_memory = false;
     $stream_to_file = false;
     // Split the url to crawl into its elements (host, path, port and stuff)
     $url_parts = PHPCrawlerUtils::splitURL($url_to_crawl);
     $protocol = $url_parts["protocol"];
     $host = $url_parts["host"];
     $path = $url_parts["path"];
     $query = $url_parts["query"];
     $file = $url_parts["file"];
     $port = $url_parts["port"];
     // If the host was already visited so far
     // -> get the ip from our host-ip-array, otherwise
     // get the IP and add the entry to the array.
     if (isset($this->host_ip_table[$host])) {
         $ip = $this->host_ip_table[$host];
     } else {
         $ip = $this->host_ip_table[$host] = gethostbyname($host);
          // Host switched and wasn't "visited" before.
         // So read the robots.txt-file for this new host (if wanted)
         if ($this->use_robots_txt_files == true) {
             $this->robotsTxtHandler->processRobotsTxt($protocol, $host, $port, $this->user_agent_string);
         }
     }
     // Is this URL allowed to be requested by the robots.txt-file of this host?
     $url_disallowed = false;
     if ($this->use_robots_txt_files == true) {
         $host_url = $protocol . $host . ":" . $port;
         $url_disallowed = $this->robotsTxtHandler->checkIfUrlDisallowed($url_to_crawl, $host_url);
     }
     // Check the protocol (http or https) and build the
     // host-string for fsockopen
     if ($protocol == "https://") {
         $host_str = "ssl://" . $ip;
     } else {
         $host_str = $ip;
     }
     // normal connect
     // Check if an authentication should be send
     $authentication = PHPCrawlerUtils::getAuthenticationForURL($this->basic_authentications, $url_to_crawl);
     // Error-codes
      // 0 - couldn't connect to server / page within timeout-time
     // 1 - stopped reading from socket, read-timeout reached BEFORE EOF()
     // Open socket-connection
     if ($url_disallowed == false) {
         $s = @fsockopen($host_str, $port, $e, $t, $this->socket_mean_timeout);
     } else {
         return false;
         // Return false if the URL was completely ignored
     }
     if ($s == false) {
         $error_string = $t;
         $error_code = $e;
         if ($t == "" && $e == "") {
             $error_code = 0;
             $error_string = "Couldn't connect to server";
         }
     } else {
         $header_found = false;
         // will get true if the header of the page was extracted
         // Build header to send
         $headerlines_to_send[] = "GET " . $path . $file . $query . " HTTP/1.0\r\n";
         $headerlines_to_send[] = "HOST: " . $host . "\r\n";
         // Referer
         if ($referer_url != "") {
             $headerlines_to_send[] = "Referer: {$referer_url}\r\n";
         }
         // Cookies
         if ($this->handle_cookies == true) {
             $cookie_string = PHPCrawlerUtils::buildHeaderCookieString($this->cookies, $host);
         }
         if (isset($cookie_string)) {
             $headerlines_to_send[] = "Cookie: " . $cookie_string . "\r\n";
         }
         // Authentication
         if (count($authentication) > 0) {
             $auth_string = base64_encode($authentication["username"] . ":" . $authentication["password"]);
             $headerlines_to_send[] = "Authorization: Basic " . $auth_string . "\r\n";
         }
         // Rest of header
         $headerlines_to_send[] = "User-Agent: " . str_replace("\n", "", $this->user_agent_string) . "\r\n";
         $headerlines_to_send[] = "Connection: close\r\n";
         $headerlines_to_send[] = "\r\n";
         // Now send the header
         for ($x = 0; $x < count($headerlines_to_send); $x++) {
             // Send header-line
             fputs($s, $headerlines_to_send[$x]);
             // Put together lines to $header_send
             if (isset($header_send)) {
                 $header_send .= $headerlines_to_send[$x];
             } else {
                 $header_send = $headerlines_to_send[$x];
             }
         }
          unset($headerlines_to_send);
         $status = socket_get_status($s);
         // Now read from socket
          // UNTIL timeout reached OR eof() OR content-type shouldn't be followed
         // OR traffic-limit reached or ...
         while (!isset($stop)) {
             socket_set_timeout($s, $this->socket_read_timeout);
             // Read from socket
             $line_read = @fgets($s, 1024);
             // The @ is to avoid the strange "SSL fatal protocol error"-warning that
             // appears in some environments without any reasons
             $source_read .= $line_read;
             // do this anyway
             // If we want the content in tmp-file -> write line to TMP-file
             if ($header_found == true && $stream_to_file == true && $line_read) {
                 unset($check);
                 $check = @fwrite($fp, $line_read);
                 if ($check == false) {
                     $error_code = "2000";
                     $error_string = "Couldn't write to TMP-file " . $this->tmp_file;
                 }
             }
             // Count bytes of the content (not the header)
             if ($header_found == true) {
                 $bytes_received = $bytes_received + strlen($line_read);
             }
             // Check for traffic limit and stop receiving if reached
             if ($this->traffic_limit_complete_page == false && $this->traffic_limit_all > 0) {
                 if (strlen($source_read) + $this->traffic_all > $this->traffic_limit_all) {
                     $stop = true;
                     $received_completly = false;
                     $page_data["traffic_limit_reached"] = true;
                 }
             }
             // Check for pagesize-limit
             if ($header_found == true && $bytes_received > $this->pagesize_limit && $this->pagesize_limit > 0) {
                 $stop = true;
                 $received_completly = false;
             }
             // "Cut" Header in seperate var $header and handle it
             if ($header_found == false && substr($source_read, -4, 4) == "\r\n\r\n") {
                 $header = substr($source_read, 0, strlen($source_read) - 2);
                 $actual_content_type = PHPCrawlerUtils::getHeaderTag("content-type", $header);
                 $source_read = "";
                 $header_found = true;
                 // Get the http-status-code
                 $http_status_code = PHPCrawlerUtils::getHTTPStatusCode($header);
                 // Should this content-type be streamed into memory (true/false) ?
                 $stream_to_memory = PHPCrawlerUtils::decideStreamToMemory($header, $this->receive_to_memory_matches);
                 // Should this content-type be streamed into tmp-file (true/false) ?
                 $stream_to_file = PHPCrawlerUtils::decideStreamToTmpFile($header, $this->receive_to_file_matches);
                  // If so -> open a TMP-file for the stream
                 if ($stream_to_file == true) {
                     $fp = @fopen($this->tmp_file, "w");
                     if ($fp == false) {
                         $error_code = "2000";
                         $error_string = "Couldn't open TMP-file" . $this->tmp_file;
                     }
                 }
                 // Header found here -> check if source should be followed (content-type)
                 $follow = PHPCrawlerUtils::decideFollow($header, $this->follow_content_type);
                  // If not -> stop with this page
                 if ($follow == false) {
                     $stop = true;
                 } else {
                     $received_completly = true;
                     // just init, may switch later on !
                 }
                 // Check if a cookie was send with the header and store it
                 // (if wanted)
                 if ($this->handle_cookies == true) {
                     PHPCrawlerUtils::getCookieData($header, $this->cookies, $host);
                 }
             }
             // end cut and handle header
             // Get status of socket to check timeout and EOF
             $status = socket_get_status($s);
             // Now, if the source-buffer is filled or EOF is reached
             // -> look for links in the buffer, put the found links into
             // array $links_found_in_page and then empty the buffer BUT
             // COPY THE LAST FEW BYTES of the old buffer into the new one !
             // This has to be done because of links that take more than a single
             // line !
              // And yes, only makes sense if we don't want to have the whole content
             // in memory anyway AND if the content-type is text/html!
             if ($header_found == true && $stream_to_memory == false) {
                 if (strlen($source_read) >= 100000 || $status["eof"] == true) {
                     if (preg_match("/text\\/html/ i", $actual_content_type)) {
                         $links_found_in_buffer = PHPCrawlerUtils::findLinks($source_read, $links_found_in_page, $this->aggressive_link_extraction, $this->linktags_to_extract, $page_url_map);
                         $source_read = substr($source_read, -1500);
                     }
                 }
             }
             // Check timeout
             if ($status["timed_out"] == true) {
                 $error_code = 1000;
                 // ahem..which int to give ??
                 $error_string = "socketstream timed out";
                 $stop = true;
                 $received_completly = false;
             }
             // Check eof
             if ($status["eof"] == true) {
                 $stop = true;
             }
         }
         fclose($s);
         // close socket
         if (isset($fp) && $fp != false) {
             fclose($fp);
         }
         // close tmp file if used
     }
     // echo "Get page:".($this->getmicrotime() - $start);
     // Now, HERE, if the whole content/source was received into memory,
     // we are looking for the links in the complete source (faster)
     // it only makes sense if content-type is text/html !
     if ($stream_to_memory == true) {
         unset($links_found_in_page);
         if (preg_match("/text\\/html/ i", $actual_content_type)) {
             // $start = $this->getmicrotime();
             PHPCrawlerUtils::findLinks($source_read, $links_found_in_page, $this->aggressive_link_extraction, $this->linktags_to_extract, $page_url_map);
             // echo "Find links:".($this->getmicrotime() - $start);
         }
     }
     // Add the "refering_url" to the array-elements
     if (isset($links_found_in_page)) {
         for ($x = 0; $x < count($links_found_in_page); $x++) {
             $links_found_in_page[$x]["referer_url"] = $url_to_crawl;
         }
     }
     // Page crawled,
     // return header, source, followed (true/false) and all we got here
     unset($page_data);
     if (isset($error_code)) {
         $page_data["error_code"] = $error_code;
     } else {
         $page_data["error_code"] = false;
     }
     if (isset($error_string)) {
         $page_data["error_string"] = $error_string;
     } else {
         $page_data["error_string"] = false;
     }
     if (isset($follow)) {
         $page_data["received"] =& $follow;
     } else {
         $page_data["received"] = false;
     }
     if (isset($received_completly)) {
         $page_data["received_completly"] =& $received_completly;
     } else {
         $page_data["received_completly"] = false;
     }
     $page_data["received_completely"] =& $page_data["received_completly"];
     // Wrote "completely" it wrong in prev. version,
     if (isset($bytes_received)) {
         $page_data["bytes_received"] = $bytes_received;
     } else {
         $page_data["bytes_received"] = 0;
     }
     if (isset($header)) {
         $page_data["header"] =& $header;
     } else {
         $page_data["header"] = false;
     }
     if (isset($http_status_code)) {
         $page_data["http_status_code"] =& $http_status_code;
     } else {
         $page_data["http_status_code"] = false;
     }
     if (isset($actual_content_type)) {
         $page_data["content_type"] = $actual_content_type;
     } else {
         $page_data["content_type"] = false;
     }
     // TMP-file infos and that
     $page_data["content_tmp_file"] = $page_data["received_to_file"] = false;
     $page_data["source"] = $page_data["content"] = $page_data["received_to_memory"] = false;
     if (isset($page_data["received"])) {
         if ($stream_to_file == true) {
             $page_data["content_tmp_file"] = $this->tmp_file;
             $page_data["received_to_file"] = true;
         }
         if ($stream_to_memory == true) {
             $page_data["source"] =& $source_read;
             $page_data["content"] =& $source_read;
             $page_data["received_to_memory"] = true;
         }
     }
     // Additional infos for the override-function handlePageData()
     $page_data["protocol"] = $protocol;
     $page_data["port"] = $port;
     $page_data["host"] = $host;
     $page_data["path"] = $path;
     $page_data["file"] = $file;
     $page_data["query"] = $query;
     $page_data["header_send"] =& $header_send;
     $page_data["referer_url"] = $referer_url;
     // "Normailzed" URL and referer-URL (f.e. without port if port is 80 and protocol is http)
     $page_data["url"] = $url_to_crawl;
     // All links found in this page
     $page_data["links_found"] =& $links_found_in_page;
      // Increase SUM of traffic this instance received altogether
     $this->traffic_all = $this->traffic_all + strlen($page_data["header"]) + $page_data["bytes_received"];
     // Set flag if traffic-limit is reached
     if ($this->traffic_all > $this->traffic_limit_all && $this->traffic_limit_all != 0) {
         $page_data["traffic_limit_reached"] = true;
     }
     if (!isset($page_data["traffic_limit_reached"])) {
         $page_data["traffic_limit_reached"] = false;
     }
     return $page_data;
 }
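A sketch of how the returned array might be inspected by calling code; the URL is invented, and $Request is assumed to be an instance of the class containing receivePage():

$page_data = $Request->receivePage("http://www.example.com/index.html", "");

if ($page_data === false) {
    echo "URL was disallowed by the host's robots.txt";
} elseif ($page_data["error_code"] !== false) {
    echo "Error " . $page_data["error_code"] . ": " . $page_data["error_string"];
} elseif ($page_data["received_to_memory"] == true) {
    $link_count = isset($page_data["links_found"]) ? count($page_data["links_found"]) : 0;
    echo "Received " . $page_data["bytes_received"] . " bytes, found " . $link_count . " links";
}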
 /**
  * Adds a Link-Priority-Level
  *
  * @param string $regex
  * @param int    $level
  */
 public function addLinkPriority($regex, $level)
 {
     $c = count($this->url_priorities);
     $this->url_priorities[$c]["match"] = trim($regex);
     $this->url_priorities[$c]["level"] = trim($level);
      // Sort url-priorities-array so that high priority-levels come first.
     PHPCrawlerUtils::sort2dArray($this->url_priorities, "level", SORT_DESC);
 }
 function addBasicAuthentication($expression, $username, $password)
 {
     $this->initCrawler();
     $check = PHPCrawlerUtils::checkExpressionPattern($expression);
     // Check pattern
     if ($check == true) {
         $c = count($this->pageRequest->basic_authentications);
         $this->pageRequest->basic_authentications[$c]["match"] = $expression;
         $this->pageRequest->basic_authentications[$c]["username"] = $username;
         $this->pageRequest->basic_authentications[$c]["password"] = $password;
         return true;
     } else {
         return false;
     }
 }
 /** 
  * Returns the Robots.txt-URL related to the given URL
  *
  * @param PHPCrawlerURLDescriptor $Url  The URL as PHPCrawlerURLDescriptor-object
  * @return PHPCrawlerURLDescriptor URL of the robots.txt related to the passed URL.
  */
 public static function getRobotsTxtURL(PHPCrawlerURLDescriptor $Url)
 {
     $url_parts = PHPCrawlerUtils::splitURL($Url->url_rebuild);
     $robots_txt_url = $url_parts["protocol"] . $url_parts["host"] . ":" . $url_parts["port"] . "/robots.txt";
     return new PHPCrawlerURLDescriptor($robots_txt_url);
 }
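A usage sketch; the containing class is not shown in this snippet, so the class name PHPCrawlerRobotsTxtParser is an assumption here, and the URL is invented:

$Url = new PHPCrawlerURLDescriptor("http://www.example.com:80/some/page.html");
$RobotsTxtUrl = PHPCrawlerRobotsTxtParser::getRobotsTxtURL($Url);
echo $RobotsTxtUrl->url_rebuild; // e.g. "http://www.example.com:80/robots.txt"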
 /**
  * Adds a rule to the list of rules that decide in what kind of documents the crawler
   * should search for links (regarding their content-type)
  *
  * By default the crawler ONLY searches for links in documents of type "text/html".
  * Use this method to add one or more other content-types the crawler should check for links.
  *
  * Example:
  * <code>
  * $crawler->addLinkSearchContentType("#text/css# i");
  * $crawler->addLinkSearchContentType("#text/xml# i");
  * </code>
   * These rules let the crawler search for links in HTML-, CSS- and XML-documents.
  *
   * <b>Please note:</b> It is NOT recommended to let the crawler check for links in EVERY document-
  * type! This could slow down the crawling-process dramatically (e.g. if the crawler receives large
  * binary-files like images and tries to find links in them).
  *
  * @param string $regex Regular-expression defining the rule
  * @return bool         TRUE if the rule was successfully added
  */
 function addLinkSearchContentType($regex)
 {
     $this->initCrawler();
     $check = PHPCrawlerUtils::checkExpressionPattern($regex);
     // Check pattern
     if ($check == true) {
         $this->pageRequest->linksearch_content_types[] = trim($regex);
     }
     return $check;
 }
 function buidlNonFollowMatches($applying_lines, $base_url)
 {
     // First, get all "Disallow:"-pathes
     $disallow_pathes = array();
     for ($x = 0; $x < count($applying_lines); $x++) {
         if (preg_match("#^Disallow:# i", $applying_lines[$x])) {
             preg_match("#^Disallow:[ ]*(.*)#", $applying_lines[$x], $match);
             $disallow_pathes[] = trim($match[1]);
         }
     }
     // Works like this:
     // The base-url is http://www.foo.com.
      // The directive says: "Disallow: /bla/"
     // This means: The nonFollowMatch is "#^http://www\.foo\.com/bla/#"
     $normalized_base_url = PHPCrawlerUtils::normalizeURL($base_url);
     $non_follow_expressions = array();
     for ($x = 0; $x < count($disallow_pathes); $x++) {
         // If the disallow-path is empty -> simply ignore it
         if ($disallow_pathes[$x] == "") {
             continue;
         }
          $non_follow_path_complete = $normalized_base_url . substr($disallow_pathes[$x], 1);
          // "http://www.foo.com/bla/"
          $non_follow_exp = preg_quote($non_follow_path_complete, "#");
         // "http://www\.foo\.com/bla/"
         $non_follow_exp = "#^" . $non_follow_exp . "#";
         // "#^http://www\.foo\.com/bla/#"
         $non_follow_expressions[] = $non_follow_exp;
     }
     return $non_follow_expressions;
 }
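A worked example of what this method produces; the robots.txt lines and base-URL are invented, and the exact escaping depends on preg_quote() and normalizeURL():

// Hypothetical lines from a robots.txt that apply to our user-agent
$applying_lines = array(
    "User-agent: *",
    "Disallow: /bla/",
    "Disallow: /private/",
);

// $RobotsTxtHandler is assumed to be an instance of the class containing this method
$non_follow = $RobotsTxtHandler->buidlNonFollowMatches($applying_lines, "http://www.foo.com");

// $non_follow now contains expressions roughly like:
//   "#^http\://www\.foo\.com/bla/#"
//   "#^http\://www\.foo\.com/private/#"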
 function buildURL($link, $actual_url, $url_parts_actual = "")
 {
      // Important: Function has to return a FULL URL, including
     // the port !!
     if ($url_parts_actual == "") {
          $url_parts_actual = PHPCrawlerUtils::splitURL($actual_url);
     }
     // Entities-replacements
     $entities = array("'&(quot|#34);'i", "'&(amp|#38);'i", "'&(lt|#60);'i", "'&(gt|#62);'i", "'&(nbsp|#160);'i", "'&(iexcl|#161);'i", "'&(cent|#162);'i", "'&(pound|#163);'i", "'&(copy|#169);'i");
     $replace = array("\"", "&", "<", ">", " ", chr(161), chr(162), chr(163), chr(169));
     $link = str_replace("\n", "", $link);
     $link = str_replace("\r", "", $link);
     // Remove "#..." at end, but ONLY at the end,
     // not if # is at the beginning !
     $link = preg_replace("/^(.{1,})#.{0,}\$/", "\\1", $link);
     // Cases
      // Protocol-relative link like "//foo.htm" -> make it "http://foo.htm"
     if (substr($link, 0, 2) == "//") {
         $link = "http:" . $link;
          $link = PHPCrawlerUtils::rebuildURL(PHPCrawlerUtils::splitURL($link));
     } elseif (substr($link, 0, 1) == "/") {
         $link = $url_parts_actual["protocol"] . $url_parts_actual["host"] . ":" . $url_parts_actual["port"] . $link;
     } elseif (substr($link, 0, 2) == "./") {
         $link = $url_parts_actual["protocol"] . $url_parts_actual["host"] . ":" . $url_parts_actual["port"] . $url_parts_actual["path"] . substr($link, 2);
     } elseif (preg_match("/^[^\\/]{1,}(:\\/\\/)/", $link)) {
         if (substr($link, 0, 7) == "http://" || substr($link, 0, 8) == "https://") {
              $link = PHPCrawlerUtils::rebuildURL(PHPCrawlerUtils::splitURL($link));
         } else {
             $link = "";
         }
         // Kick out unsupported protocols
     } elseif (preg_match("/^[a-zA-Z]{0,}:[^\\/]{0,1}/", $link)) {
         $link = "";
     } elseif (substr($link, 0, 3) == "../") {
         $new_path = $url_parts_actual["path"];
         while (substr($link, 0, 3) == "../") {
             $new_path = preg_replace('/\\/[^\\/]{0,}\\/$/', "/", $new_path);
             $link = substr($link, 3);
         }
         $link = $url_parts_actual["protocol"] . $url_parts_actual["host"] . ":" . $url_parts_actual["port"] . $new_path . $link;
     } elseif (substr($link, 0, 1) == "#") {
         $link = "";
     } elseif ($link == "") {
         $link = $actual_url;
     } else {
         $link = $url_parts_actual["protocol"] . $url_parts_actual["host"] . ":" . $url_parts_actual["port"] . $url_parts_actual["path"] . $link;
     }
      // Now, at last, replace all HTML-entities in the link with plain text.
      // E.g.: the HTML-code of the link is <a href="index.php?x=1&amp;y=2">
      // -> the link has to be "index.php?x=1&y=2"
     $link = preg_replace($entities, $replace, $link);
     $link = rawurldecode($link);
     $link = str_replace(" ", "%20", $link);
     // "Normalize" URL
     $link = PHPCrawlerUtils::normalizeUrl($link);
     return $link;
 }
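A few illustrative calls; the URLs are invented, the results are approximate since the final form depends on PHPCrawlerUtils::normalizeUrl(), and $PageRequest is assumed to be the object providing buildURL():

$page_url = "http://www.example.com:80/docs/index.html";

// Relative link with "../" -> resolved one directory up
echo $PageRequest->buildURL("../images/logo.gif", $page_url);
// -> roughly "http://www.example.com:80/images/logo.gif"

// Protocol-relative link
echo $PageRequest->buildURL("//cdn.example.com/lib.js", $page_url);
// -> roughly "http://cdn.example.com/lib.js"

// Unsupported protocol -> dropped (empty string)
echo $PageRequest->buildURL("mailto:someone@example.com", $page_url);
// -> ""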
 /**
   * Adds a regular expression together with a priority-level to the list of rules that decide what links should be preferred.
  *
  * Links/URLs that match an expression with a high priority-level will be followed before links with a lower level.
  * All links that don't match with any of the given rules will get the level 0 (lowest level) automatically.
  *
  * The level can be any positive integer.
  *
  * <b>Example:</b>
  *
  * Telling the crawler to follow links that contain the string "forum" before links that contain ".gif" before all other found links.
  * <code>
  * $crawler->addLinkPriority("/forum/", 10);
   * $crawler->addLinkPriority("/\.gif/", 5);
  * </code>
  *
   * @param string $regex  Regular expression defining the rule
  * @param int    $level  The priority-level
  *
   * @return bool  TRUE if a valid preg-pattern is given as argument and was successfully added, otherwise it returns FALSE.
  * @section 10 Other settings
  */
 function addLinkPriority($regex, $level)
 {
     $check = PHPCrawlerUtils::checkRegexPattern($regex);
     // Check pattern
     if ($check == true && preg_match("/^[0-9]*\$/", $level)) {
         $c = count($this->link_priority_array);
         $this->link_priority_array[$c]["match"] = trim($regex);
         $this->link_priority_array[$c]["level"] = trim($level);
         return true;
     } else {
         return false;
     }
 }
 /**
  * Adds a rule to the list of rules that decide what kind of documents should get
   * checked for links (regarding their content-type)
  *
  * @param string $regex Regular-expression defining the rule
  * @return bool         TRUE if the rule was successfully added
  */
 public function addLinkSearchContentType($regex)
 {
     $check = PHPCrawlerUtils::checkRegexPattern($regex);
     // Check pattern
     if ($check == true) {
         $this->linksearch_content_types[] = trim($regex);
     }
     return $check;
 }
 function handleDocumentInfo($DocInfo)
 {
     echo "<table class=intbl>";
      // Loop over the output-array and print info if wanted
      foreach (array_keys($this->output_array) as $key) {
         if ($key == "requested_url") {
             $str = '<a href="' . $DocInfo->url . '" target=blank>' . $DocInfo->url . '</a>';
             echo "<tr><td width=130><nobr>Page requested:</nobr></td><td width=470>" . $str . "</td></tr>";
         }
         if ($key == "http_status_code") {
             if ($DocInfo->http_status_code) {
                 $str = $DocInfo->http_status_code;
             } else {
                 $str = "-";
             }
             echo "<tr><td>HTTP-Status:</td><td>" . $str . "</td></tr>";
         }
         if ($key == "content_type") {
             if ($DocInfo->content_type) {
                 $str = $DocInfo->content_type;
             } else {
                 $str = "-";
             }
             echo "<tr><td>Content-Type:</td><td>" . $str . "</td></tr>";
         }
         if ($key == "content_size") {
             $str = PHPCrawlerUtils::getHeaderValue($DocInfo->header, "content-length");
             if (trim($str) == "") {
                 $str = "??";
             }
             echo "<tr><td>Content-Size:</td><td >" . $str . " bytes</td></tr>";
         }
         if ($key == "content_received") {
             if ($DocInfo->received == true) {
                 $str = "Yes";
             } else {
                 $str = "No";
             }
             echo "<tr><td>Content received:</td><td >" . $str . "</td></tr>";
         }
         if ($key == "content_received_completely") {
             if ($DocInfo->received_completely == true) {
                 $str = "Yes";
             } else {
                 $str = "No";
             }
             echo "<tr><td><nobr>Received completely:</nobr></td><td >" . $str . "</td></tr>";
         }
         if ($key == "bytes_received") {
             echo "<tr><td>Bytes received:</td><td>" . $DocInfo->bytes_received . " bytes</td></tr>";
         }
         if ($key == "referer_url") {
             if ($DocInfo->referer_url == "") {
                 $str = "-";
             } else {
                  $str = $DocInfo->referer_url;
             }
             echo "<tr><td><nobr>Refering URL</nobr>:</td><td >" . $str . "</td></tr>";
         }
         if ($key == "refering_linkcode") {
             if ($DocInfo->refering_linkcode == "") {
                 $str = "-";
             } else {
                 $str = htmlentities($DocInfo->refering_linkcode);
                 $str = str_replace("\n", "<br>", $str);
             }
             echo "<tr><td valign=top><nobr>Refering linkcode:</nobr></td><td >" . $str . "</td></tr>";
         }
         if ($key == "refering_link_raw") {
             if ($DocInfo->refering_link_raw == "") {
                 $str = "-";
             } else {
                 $str = $DocInfo->refering_link_raw;
             }
             echo "<tr><td><nobr>Refering Link RAW:&nbsp;</nobr></td><td >" . $str . "</td></tr>";
         }
         if ($key == "refering_linktext") {
             if ($DocInfo->refering_linktext == "") {
                 $str = "-";
             } else {
                 $str = $DocInfo->refering_linktext;
                 $str = htmlentities($str);
                 $str = str_replace("\n", "<br>", $str);
             }
             echo "<tr><td valign=top><nobr>Refering linktext</nobr>:</td><td >" . $str . "</td></tr>";
         }
         if ($key == "header_send") {
             if ($DocInfo->header_send) {
                 $str = str_replace("\n", "<br>", trim($DocInfo->header_send));
             } else {
                 $str = "-";
             }
             echo "<tr><td valign=top>Send header:</td><td >" . $str . "</td></tr>";
         }
         if ($key == "header") {
             if ($DocInfo->header) {
                 $str = str_replace("\n", "<br>", trim($DocInfo->header));
             } else {
                 $str = "-";
             }
             echo "<tr><td valign=top>Received header:</td><td >" . $str . "</td></tr>";
         }
         if ($key == "nr_found_links") {
             $str = count($DocInfo->links_found);
             echo "<tr><td valign=top>Links found:</td><td >" . $str . "</td></tr>";
         }
         if ($key == "all_found_links") {
             echo "<tr><td valign=top>List of found links:</td>";
             echo "<td>";
             for ($x = 0; $x < count($DocInfo->links_found_url_descriptors); $x++) {
                 echo $DocInfo->links_found_url_descriptors[$x]->url_rebuild . "<br>";
             }
             if (count($DocInfo->links_found_url_descriptors) == 0) {
                 echo "-";
             }
             echo "</td>";
             echo "</tr>";
         }
         if ($key == "received_to_file") {
             if ($DocInfo->received_to_file) {
                 $str = "Yes";
             } else {
                 $str = "No";
             }
             echo "<tr><td valign=top>Received to TMP-file:</td><td >" . $str . "</td></tr>";
         }
         if ($key == "tmpfile_name_size") {
             if ($DocInfo->content_tmp_file) {
                 $str = $DocInfo->content_tmp_file . " (" . filesize($DocInfo->content_tmp_file) . " bytes)";
             } else {
                 $str = "-";
             }
             echo "<tr><td valign=top>Content TMP-file:</td><td >" . $str . "</td></tr>";
         }
         if ($key == "received_to_memory") {
             if ($DocInfo->received_to_memory) {
                 $str = "Yes";
             } else {
                 $str = "No";
             }
             echo "<tr><td valign=top>Received to memory:</td><td >" . $str . "</td></tr>";
         }
         if ($key == "memory_content_size") {
             echo "<tr><td valign=top>Memory-content-size:</td><td >" . strlen($DocInfo->source) . " bytes</td></tr>";
         }
     }
      // Output error if there's one
     if ($DocInfo->error_occured) {
         echo "<tr>\n            <td class=red>Error:</td>\n            <td class=red>" . $DocInfo->error_string . "</td>\n            </tr>";
     }
     echo "</table> <br>";
     $this->flushOutput();
 }
 /** 
  * Returns the default Robots.txt-URL related to the given URL
  *
  * @param string $url The URL
  * @return string Url of the related robots.txt file
  */
 public static function getRobotsTxtURL($url)
 {
     $url_parts = PHPCrawlerUtils::splitURL($url);
     $robots_txt_url = $url_parts["protocol"] . $url_parts["host"] . ":" . $url_parts["port"] . "/robots.txt";
     return $robots_txt_url;
 }