/**
 * Returns all cookies from the cache that are addressed to the given URL.
 *
 * Cookies are looked up in the bucket of the URL's domain (the "domain" part returned by
 * PHPCrawlerUtils::splitURL()) and then filtered by cookie-domain and cookie-path.
 *
 * The cached cookie-objects are NOT modified by this method.
 *
 * @param string $target_url The target-URL
 * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
 */
public function getCookiesForUrl($target_url)
{
    $url_parts = PHPCrawlerUtils::splitURL($target_url);

    $target_domain = $url_parts["domain"]; // e.g. acme.com
    $target_host = (string)$url_parts["host"];
    $target_path = (string)$url_parts["path"];

    $return_cookies = array();

    // No cookies cached for this domain at all
    if (!isset($this->cookies[$target_domain]) || !is_array($this->cookies[$target_domain])) {
        return $return_cookies;
    }

    // Iterate over all cookies of this domain
    foreach ($this->cookies[$target_domain] as $Cookie) {
        // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
        // A domain attribute of "acme.com" matches host names "anvil.acme.com" as well as "shipping.crate.acme.com".
        // ".acme.com" should also match "acme.com" itself, so a single leading dot is ignored.
        // The dot is escaped (only a literal "." is removed) and the result is kept in a local variable,
        // so the cookie stored in the cache keeps its original domain.
        $cookie_domain = preg_replace("#^\\.#", "", (string)$Cookie->domain);

        // The host matches if it equals the cookie-domain or ends with "." + cookie-domain.
        // Requiring the dot keeps "a.acme.com" from matching the host "ba.acme.com".
        $domain_suffix = "." . $cookie_domain;
        $domain_matches = $target_host === $cookie_domain
            || substr($target_host, -strlen($domain_suffix)) === $domain_suffix;

        if (!$domain_matches) {
            continue;
        }

        // Does the path match? The cookie-path has to be a prefix of the URL-path.
        // strncmp() with length 0 returns 0, so an empty cookie-path matches every URL-path.
        $cookie_path = (string)$Cookie->path;
        if (strncmp($target_path, $cookie_path, strlen($cookie_path)) !== 0) {
            continue;
        }

        // Use cookie-name as index to avoid double-cookies
        $return_cookies[$Cookie->name] = $Cookie;
    }

    // Convert to numeric array
    return array_values($return_cookies);
}
/**
 * Builds a PHPCrawlerUrlPartsDescriptor-object from the given URL.
 *
 * The URL gets split by PHPCrawlerUtils::splitURL() and every part is copied
 * to the property of the same name.
 *
 * @param string $url The URL to describe
 * @return PHPCrawlerUrlPartsDescriptor
 */
public static function fromURL($url)
{
    $split_url = PHPCrawlerUtils::splitURL($url);

    $Descriptor = new PHPCrawlerUrlPartsDescriptor();

    // Names of the URL-parts that are taken over 1:1 as properties
    $part_names = array(
        "protocol",
        "host",
        "path",
        "file",
        "domain",
        "port",
        "auth_username",
        "auth_password"
    );

    foreach ($part_names as $part_name) {
        $Descriptor->$part_name = $split_url[$part_name];
    }

    return $Descriptor;
}
/**
 * Defines the URL this request will be sent to.
 *
 * Stores the given descriptor and additionally keeps the parts of its rebuilt URL
 * (as returned by PHPCrawlerUtils::splitURL()) in $this->url_parts.
 *
 * @param PHPCrawlerURLDescriptor $UrlDescriptor A PHPCrawlerURLDescriptor-object containing the URL to request
 */
public function setUrl(PHPCrawlerURLDescriptor $UrlDescriptor)
{
    $this->UrlDescriptor = $UrlDescriptor;

    // Keep the single URL-parts at hand as well
    $rebuilt_url = $UrlDescriptor->url_rebuild;
    $this->url_parts = PHPCrawlerUtils::splitURL($rebuilt_url);
}
/**
 * Returns all cookies from the cache that are addressed to the given URL.
 *
 * All cookies stored for the URL's domain (column "source_domain") are read from the
 * database and then filtered by cookie-domain and cookie-path.
 *
 * @param string $target_url The target-URL
 * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
 */
public function getCookiesForUrl($target_url)
{
    PHPCrawlerBenchmark::start("getting_cookies_from_cache");

    $url_parts = PHPCrawlerUtils::splitURL($target_url);
    $target_host = (string)$url_parts["host"];
    $target_path = (string)$url_parts["path"];

    // The domain is derived from a crawled URL, so it is bound as a parameter
    // instead of being concatenated into the SQL-statement.
    $Statement = $this->PDO->prepare("SELECT * FROM cookies WHERE source_domain = ?;");
    $Statement->execute(array((string)$url_parts["domain"]));
    $rows = $Statement->fetchAll(PDO::FETCH_ASSOC);
    $Statement->closeCursor();

    $return_cookies = array();

    foreach ($rows as $row) {
        // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
        // A domain attribute of "acme.com" matches host names "anvil.acme.com" as well as "shipping.crate.acme.com".
        // A single leading dot is ignored, so ".acme.com" also matches the host "acme.com" itself.
        $cookie_domain = preg_replace("#^\\.#", "", (string)$row["domain"]);

        // The host matches if it equals the cookie-domain or ends with "." + cookie-domain.
        // Requiring the dot keeps "a.acme.com" from matching the host "ba.acme.com".
        $domain_suffix = "." . $cookie_domain;
        $domain_matches = $target_host === $cookie_domain
            || substr($target_host, -strlen($domain_suffix)) === $domain_suffix;

        if (!$domain_matches) {
            continue;
        }

        // Does the path match? The cookie-path has to be a prefix of the URL-path.
        // strncmp() with length 0 returns 0, so an empty cookie-path matches every URL-path.
        $cookie_path = (string)$row["path"];
        if (strncmp($target_path, $cookie_path, strlen($cookie_path)) !== 0) {
            continue;
        }

        $Cookie = new PHPCrawlerCookieDescriptor(
            $row["source_url"],
            $row["name"],
            $row["value"],
            $row["expires"],
            $row["path"],
            $row["domain"]
        );

        // Use cookie-name as index to avoid double-cookies
        $return_cookies[$Cookie->name] = $Cookie;
    }

    // Convert to numeric array
    $return_cookies = array_values($return_cookies);

    PHPCrawlerBenchmark::stop("getting_cookies_from_cache");

    return $return_cookies;
}
/**
 * Determines the URL of the robots.txt-file that belongs to the given URL.
 *
 * The result is composed of protocol, host and port of the given URL
 * (as returned by PHPCrawlerUtils::splitURL()) followed by "/robots.txt".
 *
 * @param string $url The URL
 * @return string URL of the related robots.txt-file
 */
public static function getRobotsTxtURL($url)
{
    $parts = PHPCrawlerUtils::splitURL($url);

    return sprintf("%s%s:%s/robots.txt", $parts["protocol"], $parts["host"], $parts["port"]);
}
/**
 * Tells whether the host of the given URL is already present in the cache.
 *
 * The host is taken from the rebuilt URL of the descriptor and handed over to hostInCache().
 *
 * @param PHPCrawlerURLDescriptor $URL The URL
 * @return bool The result of hostInCache() for the URL's host
 */
public function urlHostInCache(PHPCrawlerURLDescriptor $URL)
{
    $split_url = PHPCrawlerUtils::splitURL($URL->url_rebuild);
    $hostname = $split_url["host"];

    return $this->hostInCache($hostname);
}
/**
 * Sets the port to connect to for crawling the starting-url set in setUrl().
 *
 * The default port is 80.
 *
 * Note:
 * <code>
 * $crawler->setURL("http://www.foo.com");
 * $crawler->setPort(443);
 * </code>
 * effects the same as
 *
 * <code>
 * $crawler->setURL("http://www.foo.com:443");
 * </code>
 *
 * @param int $port The port, a decimal number between 1 and 65535
 * @return bool FALSE if the given port is not a valid port-number (the starting-URL stays untouched then),
 *              otherwise TRUE
 * @section 1 Basic settings
 */
public function setPort($port)
{
    // Check port: digits only. The "D"-modifier keeps "$" from matching in front of a trailing newline,
    // so something like "80\n" gets rejected.
    if (!is_scalar($port) || !preg_match("#^[0-9]{1,5}\$#D", (string)$port)) {
        return false;
    }

    // Port-numbers are 16 bit wide, so five digits alone are not sufficient as a check (e.g. 70000)
    $port_number = (int)$port;
    if ($port_number < 1 || $port_number > 65535) {
        return false;
    }

    // Add port to the starting-URL
    $url_parts = PHPCrawlerUtils::splitURL($this->starting_url);
    $url_parts["port"] = $port;
    $this->starting_url = PHPCrawlerUtils::buildURLFromParts($url_parts, true);

    return true;
}
/**
 * Checks whether a given URL matches the rules applied to the URLFilter.
 *
 * The checks are done in this order, the first failing check rejects the URL:
 * protocol (http/https only), maximum crawling-depth, nofollow-tags (meta-tag and rel-attribute),
 * general follow-mode (domain, host, path), filter-rules and finally follow-rules.
 *
 * @param PHPCrawlerURLDescriptor $url The URL as a PHPCrawlerURLDescriptor-object
 * @return bool TRUE if the URL matches the defined rules.
 */
protected function urlMatchesRules(PHPCrawlerURLDescriptor $url)
{
    // URL-parts of the URL to check against the filter-rules
    $url_parts = PHPCrawlerUtils::splitURL($url->url_rebuild);

    // Kick out all links that are NOT of protocol "http" or "https"
    if ($url_parts["protocol"] != "http://" && $url_parts["protocol"] != "https://") {
        return false;
    }

    // Kick out URLs exceeding the maximum crawling-depth
    if ($this->max_crawling_depth !== null && $url->url_link_depth > $this->max_crawling_depth) {
        return false;
    }

    if ($this->obey_nofollow_tags == true) {
        // If meta-tag "robots"->"nofollow" is present -> always kick out URL
        if (isset($this->CurrentDocumentInfo->meta_attributes["robots"])
            && preg_match("#nofollow#i", $this->CurrentDocumentInfo->meta_attributes["robots"])) {
            return false;
        }

        // If linkcode contains "rel='nofollow'" -> always kick out URL.
        // The rel-attribute is a list of whitespace-separated keywords that are compared case-insensitively,
        // so values like "NOFOLLOW" or "nofollow noopener" have to be detected as well.
        // The lookarounds make sure that "nofollow" is a complete keyword inside of a quoted value.
        $nofollow_pattern = '#^<[^>]*rel\s*=\s*(?:'
            . '"[^">]*(?<![^\s"])nofollow(?![^\s"])[^">]*"'
            . '|\'[^\'>]*(?<![^\s\'])nofollow(?![^\s\'])[^\'>]*\''
            . '|\s*nofollow\s*'
            . ')[^>]*>#i';

        if (preg_match($nofollow_pattern, (string)$url->linkcode)) {
            return false;
        }
    }

    // Hosts are compared without a leading "www.", because "www.foo.com" is the same host as "foo.com"
    $url_host = preg_replace("#^www\\.#", "", $url_parts["host"]);
    $starting_host = preg_replace("#^www\\.#", "", $this->starting_url_parts["host"]);

    // Filter URLs to other domains if wanted
    if ($this->general_follow_mode >= 1 && $url_parts["domain"] != $this->starting_url_parts["domain"]) {
        return false;
    }

    // Filter URLs to other hosts if wanted
    if ($this->general_follow_mode >= 2 && $url_host != $starting_host) {
        return false;
    }

    // Filter URLs leading path-up if wanted
    if ($this->general_follow_mode == 3) {
        $starting_path = $this->starting_url_parts["path"];

        if ($url_parts["protocol"] != $this->starting_url_parts["protocol"]
            || $url_host != $starting_host
            || substr($url_parts["path"], 0, strlen($starting_path)) != $starting_path) {
            return false;
        }
    }

    // Filter URLs by url_filter_rules: a single matching rule rejects the URL
    foreach ($this->url_filter_rules as $filter_rule) {
        if (preg_match($filter_rule, $url->url_rebuild)) {
            return false;
        }
    }

    // Filter URLs by url_follow_rules: if rules are defined, at least one of them has to match
    if (count($this->url_follow_rules) > 0) {
        $match_found = false;

        foreach ($this->url_follow_rules as $follow_rule) {
            if (preg_match($follow_rule, $url->url_rebuild)) {
                $match_found = true;
                break;
            }
        }

        if ($match_found == false) {
            return false;
        }
    }

    return true;
}
/**
 * Creates a new PHPCrawlerCookieDescriptor-object.
 *
 * Values that are not given get completed from the source-URL: a missing domain becomes the
 * host of the source-URL, a missing path becomes the path of the source-URL.
 *
 * @param string $source_url URL the cookie was sent from.
 * @param string $name       Cookie-name
 * @param string $value      Cookie-value
 * @param string $expires    Expire-string, e.g. "Sat, 08-Aug-2020 23:59:08 GMT"
 * @param string $path       Cookie-path
 * @param string $domain     Cookie-domain
 * @internal
 */
public function __construct($source_url, $name, $value, $expires = null, $path = null, $domain = null)
{
    // For cookie-specs, see e.g. http://curl.haxx.se/rfc/cookie_spec.html
    $source_parts = PHPCrawlerUtils::splitURL($source_url);

    // Plain cookie-data
    $this->name = $name;
    $this->value = $value;

    // Where and when the cookie was sent
    $this->source_url = $source_url;
    $this->source_domain = $source_parts["domain"];
    $this->cookie_send_time = PHPCrawlerBenchmark::getmicrotime();

    // Expire-date, additionally as a timestamp if one was given
    $this->expires = $expires;
    if ($expires != null) {
        $this->expire_timestamp = @strtotime($expires);
    }

    // Domain (see RFC):
    // - not set -> host of the source-URL WITHOUT a leading "."
    // - set, but without a leading "." -> the "." gets added
    if ($domain == null) {
        $this->domain = $source_parts["host"];
    } elseif (substr($domain, 0, 1) != ".") {
        $this->domain = "." . $domain;
    } else {
        $this->domain = $domain;
    }

    // Path: falls back to the path of the source-URL
    $this->path = ($path == null) ? $source_parts["path"] : $path;
}