/**
  * Initializes a new PHPCrawlerCookieDescriptor-object.
  *
  * @param string $source_url URL the cookie was sent from.
  * @param string $name Cookie-name
  * @param string $value Cookie-value
  * @param string $expires Expire-string, e.g. "Sat, 08-Aug-2020 23:59:08 GMT"
  * @param string $path Cookie-path
  * @param string $domain Cookie-domain
  * @internal
  */
 public function __construct($source_url, $name, $value, $expires = null, $path = null, $domain = null)
 {
     // For cookie-specs, see e.g. http://curl.haxx.se/rfc/cookie_spec.html
     $this->name = $name;
     $this->value = $value;
     $this->expires = $expires;
     $this->path = $path;
     $this->domain = $domain;
     $source_url_parts = PHPCrawlerUtils::splitURL($source_url);
     // Source-domain
     $this->source_domain = $source_url_parts["domain"];
     // Source-URL
     $this->source_url = $source_url;
     // Send-time
     $this->cookie_send_time = PHPCrawlerBenchmark::getmicrotime();
      // Convert expire-date to timestamp
     if ($this->expires != null) {
         $this->expire_timestamp = @strtotime($this->expires);
     }
     // If domain doesn't start with "." -> add it (see RFC)
     if ($this->domain != null && substr($this->domain, 0, 1) != ".") {
         $this->domain = "." . $this->domain;
     }
      // Complete missing values
      // If domain not set -> domain is the host of the source-url WITHOUT leading "."! (see RFC)
     if ($this->domain == null) {
         $this->domain = $source_url_parts["host"];
     }
     // If path not set
     if ($this->path == null) {
         $this->path = $source_url_parts["path"];
     }
 }
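  // Usage sketch (not part of the class): how the constructor completes missing cookie
  // attributes. The URL and cookie values below are illustrative assumptions.
  $cookie = new PHPCrawlerCookieDescriptor(
      "http://www.example.com/shop/index.php", // URL the cookie was sent from
      "sessionid",                             // cookie-name
      "abc123",                                // cookie-value
      "Sat, 08-Aug-2020 23:59:08 GMT"          // expire-string, converted to a timestamp
  );
  // No path and no domain were given, so the constructor falls back to the source-URL:
  // $cookie->domain is "www.example.com" and, assuming PHPCrawlerUtils::splitURL() returns
  // the directory part of the URL as "path", $cookie->path is "/shop/".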
 /**
   * Returns all cookies from the cache that are addressed to the given URL
   *
   * @param string $target_url The target-URL
   * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     $url_parts = PHPCrawlerUtils::splitURL($target_url);
     $target_domain = $url_parts["domain"];
     // e.g. acme.com
     $return_cookies = array();
      // Iterate over all cookies cached for this domain (if there are any)
      if (isset($this->cookies[$target_domain])) {
          foreach ($this->cookies[$target_domain] as $Cookie) {
              // Does the cookie-domain match?
              // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
              // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
              // ".acme.com" should also match "anvil.acme.com", so strip the leading dot
              $Cookie->domain = preg_replace("#^\.#", "", $Cookie->domain);
              if ($Cookie->domain == $url_parts["host"] || preg_match("#" . preg_quote($Cookie->domain, "#") . "\$#", $url_parts["host"])) {
                  // Does the path match?
                  if (preg_match("#^" . preg_quote($Cookie->path, "#") . "#", $url_parts["path"])) {
                      // Use the cookie-name as index to avoid duplicate cookies
                      $return_cookies[$Cookie->name] = $Cookie;
                  }
              }
          }
      }
     // Convert to numeric array
     $return_cookies = array_values($return_cookies);
     return $return_cookies;
 }
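  // Illustrative call (the owning cookie-cache class, here $CookieCache, and the way it was
  // filled beforehand are assumptions; the URL is made up): a cached cookie with domain
  // ".acme.com" and path "/" tail-matches the host "anvil.acme.com" and is therefore returned.
  $matching = $CookieCache->getCookiesForUrl("http://anvil.acme.com/shipping/track.php");
  foreach ($matching as $Cookie) {
      echo $Cookie->name . "=" . $Cookie->value . "\n";
  }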
 /**
   * Returns the PHPCrawlerUrlPartsDescriptor-object for the given URL.
   *
   * @param string $url The URL
   * @return PHPCrawlerUrlPartsDescriptor
  */
 public static function fromURL($url)
 {
     $parts = PHPCrawlerUtils::splitURL($url);
     $tmp = new PHPCrawlerUrlPartsDescriptor();
     $tmp->protocol = $parts["protocol"];
     $tmp->host = $parts["host"];
     $tmp->path = $parts["path"];
     $tmp->file = $parts["file"];
     $tmp->domain = $parts["domain"];
     $tmp->port = $parts["port"];
     $tmp->auth_username = $parts["auth_username"];
     $tmp->auth_password = $parts["auth_password"];
     return $tmp;
 }
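  // Minimal sketch of fromURL() in use (the URL is illustrative; the expected values assume
  // the usual PHPCrawlerUtils::splitURL() conventions seen elsewhere in this library):
  $parts = PHPCrawlerUrlPartsDescriptor::fromURL("http://user:pass@www.example.com:8080/dir/page.php");
  // $parts->protocol      == "http://"
  // $parts->host          == "www.example.com"
  // $parts->port          == 8080
  // $parts->path          == "/dir/"
  // $parts->file          == "page.php"
  // $parts->auth_username == "user"
  // $parts->auth_password == "pass"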
 /**
  * Checks whether the hostname of the given URL is already cached
  *
  * @param PHPCrawlerURLDescriptor $URL The URL
  * @return bool
  */
 public function urlHostInCache(PHPCrawlerURLDescriptor $URL)
 {
     $url_parts = PHPCrawlerUtils::splitURL($URL->url_rebuild);
     return $this->hostInCache($url_parts["host"]);
 }
 /**
   * Sets the port to connect to for crawling the starting-URL set in setURL().
  *
  * The default port is 80.
  *
  * Note:
  * <code>
   * $crawler->setURL("http://www.foo.com");
   * $crawler->setPort(443);
   * </code>
   * has the same effect as
   *
   * <code>
   * $crawler->setURL("http://www.foo.com:443");
  * </code>
  *
  * @param int $port The port
  * @return bool
  * @section 1 Basic settings
  */
 public function setPort($port)
 {
     // Check port
     if (!preg_match("#^[0-9]{1,5}\$#", $port)) {
         return false;
     }
     // Add port to the starting-URL
     $url_parts = PHPCrawlerUtils::splitURL($this->starting_url);
     $url_parts["port"] = $port;
     $this->starting_url = PHPCrawlerUtils::buildURLFromParts($url_parts, true);
     return true;
 }
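  // Usage sketch ($crawler is assumed to be a PHPCrawler instance; the URL is illustrative):
  // setPort() simply rewrites the port inside the starting-URL that was set via setURL(),
  // so both variants below end up requesting port 8080.
  $crawler->setURL("http://www.example.com/");
  $crawler->setPort(8080);
  // ...is equivalent to:
  $crawler->setURL("http://www.example.com:8080/");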
 /**
  * Checks whether a given URL matches the rules applied to the URLFilter.
  *
   * @param PHPCrawlerURLDescriptor $url The URL as a PHPCrawlerURLDescriptor-object
  * @return bool TRUE if the URL matches the defined rules.
  */
 protected function urlMatchesRules(PHPCrawlerURLDescriptor $url)
 {
     // URL-parts of the URL to check against the filter-rules
     $url_parts = PHPCrawlerUtils::splitURL($url->url_rebuild);
     // Kick out all links that are NOT of protocol "http" or "https"
     if ($url_parts["protocol"] != "http://" && $url_parts["protocol"] != "https://") {
         return false;
     }
     // Kick out URLs exceeding the maximum crawling-depth
     if ($this->max_crawling_depth !== null && $url->url_link_depth > $this->max_crawling_depth) {
         return false;
     }
     // If meta-tag "robots"->"nofollow" is present and obey_nofollow_tags is TRUE -> always kick out URL
     if ($this->obey_nofollow_tags == true && isset($this->CurrentDocumentInfo->meta_attributes["robots"]) && preg_match("#nofollow# i", $this->CurrentDocumentInfo->meta_attributes["robots"])) {
         return false;
     }
     // If linkcode contains "rel='nofollow'" and obey_nofollow_tags is TRUE -> always kick out URL
     if ($this->obey_nofollow_tags == true) {
         if (preg_match("#^<[^>]*rel\\s*=\\s*(?|\"\\s*nofollow\\s*\"|'\\s*nofollow\\s*'|\\s*nofollow\\s*)[^>]*>#", $url->linkcode)) {
             return false;
         }
     }
     // Filter URLs to other domains if wanted
     if ($this->general_follow_mode >= 1) {
         if ($url_parts["domain"] != $this->starting_url_parts["domain"]) {
             return false;
         }
     }
     // Filter URLs to other hosts if wanted
     if ($this->general_follow_mode >= 2) {
         // Ignore "www." at the beginning of the host, because "www.foo.com" is the same host as "foo.com"
         if (preg_replace("#^www\\.#", "", $url_parts["host"]) != preg_replace("#^www\\.#", "", $this->starting_url_parts["host"])) {
             return false;
         }
     }
     // Filter URLs leading path-up if wanted
     if ($this->general_follow_mode == 3) {
         if ($url_parts["protocol"] != $this->starting_url_parts["protocol"] || preg_replace("#^www\\.#", "", $url_parts["host"]) != preg_replace("#^www\\.#", "", $this->starting_url_parts["host"]) || substr($url_parts["path"], 0, strlen($this->starting_url_parts["path"])) != $this->starting_url_parts["path"]) {
             return false;
         }
     }
     // Filter URLs by url_filter_rules
     for ($x = 0; $x < count($this->url_filter_rules); $x++) {
         if (preg_match($this->url_filter_rules[$x], $url->url_rebuild)) {
             return false;
         }
     }
     // Filter URLs by url_follow_rules
     if (count($this->url_follow_rules) > 0) {
         $match_found = false;
         for ($x = 0; $x < count($this->url_follow_rules); $x++) {
             if (preg_match($this->url_follow_rules[$x], $url->url_rebuild)) {
                 $match_found = true;
                 break;
             }
         }
         if ($match_found == false) {
             return false;
         }
     }
     return true;
 }
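  // Sketch of the public settings that feed into urlMatchesRules() (method names are taken
  // from the PHPCrawl API; the regexes and URLs are illustrative): stay on the starting host,
  // never follow image-links, and only follow URLs below /docs/.
  $crawler->setURL("http://www.example.com/docs/index.html");
  $crawler->setFollowMode(2);                                       // same host only ("www." is ignored)
  $crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)\$#i");          // kick out URLs matching this rule
  $crawler->addURLFollowRule("#^http://www\.example\.com/docs/#");  // only follow URLs matching this rule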
 /**
  * Returns the default Robots.txt-URL related to the given URL
  *
  * @param string $url The URL
   * @return string URL of the related robots.txt file
  */
 public static function getRobotsTxtURL($url)
 {
     $url_parts = PHPCrawlerUtils::splitURL($url);
     $robots_txt_url = $url_parts["protocol"] . $url_parts["host"] . ":" . $url_parts["port"] . "/robots.txt";
     return $robots_txt_url;
 }
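  // Minimal sketch (the owning class is not visible in this excerpt, so the static call below
  // uses a hypothetical RobotsTxtHandler; the URL is illustrative): the path of the given URL
  // is ignored, only protocol, host and port are used.
  $robots_url = RobotsTxtHandler::getRobotsTxtURL("http://www.example.com/some/deep/page.html");
  // Assuming splitURL() reports port 80 for a plain http-URL, $robots_url is
  // "http://www.example.com:80/robots.txt"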
 /**
   * Returns all cookies from the cache that are addressed to the given URL
   *
   * @param string $target_url The target-URL
   * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     PHPCrawlerBenchmark::start("getting_cookies_from_cache");
     $url_parts = PHPCrawlerUtils::splitURL($target_url);
     $return_cookies = array();
      $Result = $this->PDO->query("SELECT * FROM cookies WHERE source_domain = " . $this->PDO->quote($url_parts["domain"]) . ";");
     $rows = $Result->fetchAll(PDO::FETCH_ASSOC);
     $Result->closeCursor();
     $cnt = count($rows);
     for ($x = 0; $x < $cnt; $x++) {
         // Does the cookie-domain match?
         // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
         // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
         if ($rows[$x]["domain"] == $url_parts["host"] || preg_match("#" . preg_quote($rows[$x]["domain"]) . "\$#", $url_parts["host"])) {
             // Does the path match?
             if (preg_match("#^" . preg_quote($rows[$x]["path"]) . "#", $url_parts["path"])) {
                 $Cookie = new PHPCrawlerCookieDescriptor($rows[$x]["source_url"], $rows[$x]["name"], $rows[$x]["value"], $rows[$x]["expires"], $rows[$x]["path"], $rows[$x]["domain"]);
                 $return_cookies[$Cookie->name] = $Cookie;
                 // Use cookie-name as index to avoid double-cookies
             }
         }
     }
     // Convert to numeric array
     $return_cookies = array_values($return_cookies);
     PHPCrawlerBenchmark::stop("getting_cookies_from_cache");
     return $return_cookies;
 }
 /**
  * Sets the URL for the request.
  *
   * @param PHPCrawlerURLDescriptor $UrlDescriptor A PHPCrawlerURLDescriptor-object containing the URL to request
  */
 public function setUrl(PHPCrawlerURLDescriptor $UrlDescriptor)
 {
     $this->UrlDescriptor = $UrlDescriptor;
     // Split the URL into its parts
     $this->url_parts = PHPCrawlerUtils::splitURL($UrlDescriptor->url_rebuild);
 }
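  // Minimal sketch (the owning request class is an assumption, as is passing only the rebuilt
  // URL to the PHPCrawlerURLDescriptor constructor):
  $Request = new PHPCrawlerHTTPRequest();
  $Request->setUrl(new PHPCrawlerURLDescriptor("http://www.example.com/index.html"));
  // After this call, the request's url_parts member holds protocol, host, path etc. of the URL.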