/**
 * Returns all cookies from the cache that are addressed to the given URL.
 *
 * Only cookies cached under the domain of the target-URL are looked at. A cookie gets returned if
 * its domain tail-matches the host of the URL and its path is a prefix of the path of the URL.
 * If several matching cookies share the same name, the one found last wins.
 *
 * @param string $target_url The target-URL
 * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
 */
public function getCookiesForUrl($target_url)
{
    $url_parts = PHPCrawlerUtils::splitURL($target_url);

    $target_domain = $url_parts["domain"]; // e.g. acme.com

    $return_cookies = array();

    // No cookies cached for this domain -> nothing to do
    if (!isset($this->cookies[$target_domain]) || !is_array($this->cookies[$target_domain])) {
        return $return_cookies;
    }

    // Iterate over all cookies of this domain
    foreach ($this->cookies[$target_domain] as $Cookie) {
        // Does the cookie-domain match?
        // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
        // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
        // Seems like ".acme.com" should also match "acme.com", so remove a leading dot for the comparison.
        // The dot is escaped (only a real "." gets removed) and the result is kept in a local variable,
        // so the cached cookie-object itself is never modified by a lookup.
        $cookie_domain = preg_replace('#^\.#', "", $Cookie->domain);

        $domain_matches = $cookie_domain == $url_parts["host"]
            || preg_match("#" . preg_quote($cookie_domain, "#") . "\$#", $url_parts["host"]);

        if (!$domain_matches) {
            continue;
        }

        // Does the path match?
        if (preg_match("#^" . preg_quote($Cookie->path, "#") . "#", $url_parts["path"])) {
            $return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies
        }
    }

    // Convert to numeric array
    return array_values($return_cookies);
}
/**
 * Initiates a new PHPCrawlerResponseHeader.
 *
 * Stores the raw header and its source-URL and extracts the HTTP-status-code, the cookies and the
 * lowercased values of the content-type-, content-length-, transfer-encoding- and
 * content-encoding-header by using the PHPCrawlerUtils-helpers.
 *
 * @param string $header_string A complete response-header as it was sent by the server
 * @param string $source_url    The URL of the website the header was received from.
 * @internal
 */
public function __construct($header_string, $source_url)
{
    $this->header_raw = $header_string;
    $this->source_url = $source_url;

    $this->http_status_code = PHPCrawlerUtils::getHTTPStatusCode($header_string);
    $this->cookies = PHPCrawlerUtils::getCookiesFromHeader($header_string, $source_url);

    // Header-values that get stored lowercased (property-name => header-name)
    $lowercased_headers = array(
        "content_type" => "content-type",
        "content_length" => "content-length",
        "transfer_encoding" => "transfer-encoding",
        "content_encoding" => "content-encoding"
    );

    foreach ($lowercased_headers as $property => $header_name) {
        $this->$property = strtolower(PHPCrawlerUtils::getHeaderValue($header_string, $header_name));
    }
}
/**
 * Builds a PHPCrawlerUrlPartsDescriptor-object for the given URL.
 *
 * The URL gets split by PHPCrawlerUtils::splitURL() and every part is copied to the
 * property of the same name.
 *
 * @param string $url The URL
 * @return PHPCrawlerUrlPartsDescriptor
 */
public static function fromURL($url)
{
    $url_parts = PHPCrawlerUtils::splitURL($url);

    $Descriptor = new PHPCrawlerUrlPartsDescriptor();

    // Parts that are taken over 1:1 from the splitted URL
    $part_names = array(
        "protocol",
        "host",
        "path",
        "file",
        "domain",
        "port",
        "auth_username",
        "auth_password"
    );

    foreach ($part_names as $part_name) {
        $Descriptor->$part_name = $url_parts[$part_name];
    }

    return $Descriptor;
}
/**
 * Adds a rule to the list of rules that decide what kind of documents should get
 * checked for links in (regarding their content-type).
 *
 * The rule only gets added (trimmed) if PHPCrawlerUtils::checkRegexPattern() accepts the pattern.
 *
 * @param string $regex Regular-expression defining the rule
 * @return bool TRUE if the rule was successfully added
 */
public function addLinkSearchContentType($regex)
{
    $pattern_ok = PHPCrawlerUtils::checkRegexPattern($regex);

    // Invalid pattern -> don't add, just report the result of the check
    if (!$pattern_ok) {
        return $pattern_ok;
    }

    $this->linksearch_content_types[] = trim($regex);

    return $pattern_ok;
}
/**
 * Returns all cookies from the cache that are addressed to the given URL.
 *
 * Reads all cookies stored for the domain of the target-URL from the database and returns those
 * whose domain tail-matches the host of the URL and whose path is a prefix of the path of the URL.
 * If several matching cookies share the same name, the one found last wins.
 *
 * @param string $target_url The target-URL
 * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
 */
public function getCookiesForUrl($target_url)
{
    PHPCrawlerBenchmark::start("getting_cookies_from_cache");

    $url_parts = PHPCrawlerUtils::splitURL($target_url);

    $return_cookies = array();

    // The domain originates from a crawled URL, so it gets bound as a parameter
    // instead of being concatenated into the SQL-statement.
    $rows = array();
    $Statement = $this->PDO->prepare("SELECT * FROM cookies WHERE source_domain = ?;");

    if ($Statement !== false && $Statement->execute(array($url_parts["domain"]))) {
        $rows = $Statement->fetchAll(PDO::FETCH_ASSOC);
        $Statement->closeCursor();
    }

    // A failed lookup is treated like an empty cache so that the benchmark always gets stopped
    if (!is_array($rows)) {
        $rows = array();
    }

    foreach ($rows as $row) {
        // Does the cookie-domain match?
        // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
        // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
        // A leading dot gets removed for the comparison so that ".acme.com" also matches the host "acme.com".
        $cookie_domain = preg_replace('#^\.#', "", $row["domain"]);

        $domain_matches = $cookie_domain == $url_parts["host"]
            || preg_match("#" . preg_quote($cookie_domain, "#") . "\$#", $url_parts["host"]);

        if (!$domain_matches) {
            continue;
        }

        // Does the path match?
        if (preg_match("#^" . preg_quote($row["path"], "#") . "#", $url_parts["path"])) {
            $Cookie = new PHPCrawlerCookieDescriptor(
                $row["source_url"],
                $row["name"],
                $row["value"],
                $row["expires"],
                $row["path"],
                $row["domain"]
            );

            $return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies
        }
    }

    // Convert to numeric array
    $return_cookies = array_values($return_cookies);

    PHPCrawlerBenchmark::stop("getting_cookies_from_cache");

    return $return_cookies;
}
/**
 * Returns the default robots.txt-URL related to the given URL.
 *
 * The result is put together from the protocol, the host and the port of the given URL
 * (as returned by PHPCrawlerUtils::splitURL()), followed by "/robots.txt".
 *
 * @param string $url The URL
 * @return string URL of the related robots.txt file
 */
public static function getRobotsTxtURL($url)
{
    $parts = PHPCrawlerUtils::splitURL($url);

    return sprintf("%s%s:%s/robots.txt", $parts["protocol"], $parts["host"], $parts["port"]);
}
/**
 * Checks whether the hostname of the given URL is already cached.
 *
 * The host gets extracted from the rebuilt URL of the descriptor and is passed on to hostInCache().
 *
 * @param PHPCrawlerURLDescriptor $URL The URL
 * @return bool
 */
public function urlHostInCache(PHPCrawlerURLDescriptor $URL)
{
    $parts = PHPCrawlerUtils::splitURL($URL->url_rebuild);
    $host = $parts["host"];

    return $this->hostInCache($host);
}
/**
 * Adds a basic-authentication (username and password) to the list of authentications that will be sent
 * with requests.
 *
 * The authentication only gets added if PHPCrawlerUtils::checkRegexPattern() accepts the given expression.
 *
 * @param string $url_regex Regular expression defining the URL(s) the authentication should be sent to.
 * @param string $username  The username
 * @param string $password  The password
 *
 * @return bool TRUE if the authentication was added, FALSE if the regular expression was rejected.
 */
public function addBasicAuthentication($url_regex, $username, $password)
{
    // Reject invalid expressions right away
    if (PHPCrawlerUtils::checkRegexPattern($url_regex) != true) {
        return false;
    }

    // Append the authentication to the basic_authentications-array
    $this->basic_authentications[] = array(
        "url_regex" => $url_regex,
        "username" => $username,
        "password" => $password
    );

    return true;
}
/**
 * Adds a link-priority-level.
 *
 * The trimmed expression and level get appended to the list of URL-priorities, afterwards the
 * list gets re-sorted by level in descending order.
 *
 * @param string $regex Expression the priority-level belongs to
 * @param int    $level The priority-level
 */
public function addLinkPriority($regex, $level)
{
    $next_index = count($this->url_priorities);

    $this->url_priorities[$next_index] = array(
        "match" => trim($regex),
        "level" => trim($level)
    );

    // Keep the list sorted so that high priority-levels come first
    PHPCrawlerUtils::sort2dArray($this->url_priorities, "level", SORT_DESC);
}
/**
 * Adds a link to the LinkFinder-internal link-cache.
 *
 * Links that were already processed (same raw link) and links that can't be rebuilt to a URL are skipped.
 * Otherwise a PHPCrawlerURLDescriptor gets created (with the link-depth of the source-URL + 1),
 * is added to the LinkCache and the raw link is remembered as processed.
 *
 * @param string $link_raw        The link like it was found
 * @param string $link_code       The html-code of the link like it was found (i.e. <a href="the_link.html">Link</a>)
 * @param string $link_text       The linktext like it was found
 * @param bool   $is_redirect_url Flag indicating whether the found URL is target of an HTTP-redirect
 */
protected function addLinkToCache($link_raw, $link_code, $link_text = "", $is_redirect_url = false)
{
    // Link was already found and processed -> skip it
    if (isset($this->found_links_map[$link_raw])) {
        return;
    }

    // Rebuild the URL from the link, give up if that isn't possible
    $rebuilt_url = PHPCrawlerUtils::buildURLFromLink($link_raw, $this->baseUrlParts);
    if ($rebuilt_url == null) {
        return;
    }

    // Describe the URL, it lies one level deeper than the URL it was found in
    $Descriptor = new PHPCrawlerURLDescriptor(
        $rebuilt_url,
        $link_raw,
        $link_code,
        $link_text,
        $this->SourceUrl->url_rebuild,
        $this->SourceUrl->url_link_depth + 1
    );

    if ($is_redirect_url) {
        $Descriptor->is_redirect_url = true;
    }

    $this->LinkCache->addURL($Descriptor);

    // Remember the raw link as processed
    $this->found_links_map[$link_raw] = true;
}
/**
 * Adds a regular expression together with a priority-level to the list of rules that decide what links should be preferred.
 *
 * Links/URLs that match an expression with a high priority-level will be followed before links with a lower level.
 * All links that don't match with any of the given rules will get the level 0 (lowest level) automatically.
 *
 * The level can be any positive integer.
 *
 * <b>Example:</b>
 *
 * Telling the crawler to follow links that contain the string "forum" before links that contain ".gif" before all other found links.
 * <code>
 * $crawler->addLinkPriority("/forum/", 10);
 * $crawler->addLinkPriority("/\.gif/", 5);
 * </code>
 *
 * @param string $regex Regular expression defining the rule
 * @param int    $level The priority-level
 *
 * @return bool TRUE if a valid preg-pattern and a numeric level are given as arguments and the rule was
 *              successfully added, otherwise it returns FALSE.
 * @section 10 Other settings
 */
public function addLinkPriority($regex, $level)
{
    // Validate exactly the value that gets stored (the trimmed level)
    $level = trim((string) $level);

    $check = PHPCrawlerUtils::checkRegexPattern($regex); // Check pattern

    // "+" instead of "*" so that an empty level gets rejected,
    // "D" so that "$" only matches at the very end of the level
    if ($check == true && preg_match('/^[0-9]+$/D', $level)) {
        $c = count($this->link_priority_array);

        $this->link_priority_array[$c] = array(
            "match" => trim($regex),
            "level" => $level
        );

        return true;
    }

    return false;
}
/**
 * Sets/writes the current crawler-status.
 *
 * The status is always kept in memory. If write_status_to_file is enabled it additionally gets
 * serialized to the file "crawlerstatus.tmp" inside the working-directory.
 *
 * @param PHPCrawlerStatus $crawler_status The status to set
 */
public function setCrawlerStatus(PHPCrawlerStatus $crawler_status)
{
    $this->crawlerStatus = $crawler_status;

    // Nothing more to do if the status shouldn't get written to file
    if (!$this->write_status_to_file) {
        return;
    }

    $status_file = $this->working_directory . "crawlerstatus.tmp";
    PHPCrawlerUtils::serializeToFile($status_file, $crawler_status);
}
/**
 * Adds a rule to the list of rules that decide which URLs found on a page should be ignored by the crawler.
 *
 * The rule only gets added (trimmed) if PHPCrawlerUtils::checkRegexPattern() accepts the pattern.
 *
 * @param string $regex Regular expression defining the rule
 * @return bool The result of the pattern-check, TRUE if the rule was added
 */
public function addURLFilterRule($regex)
{
    $pattern_ok = PHPCrawlerUtils::checkRegexPattern($regex);

    // Invalid pattern -> don't add, just report the result of the check
    if (!$pattern_ok) {
        return $pattern_ok;
    }

    $this->url_filter_rules[] = trim($regex);

    return $pattern_ok;
}
/**
 * Initiates a new PHPCrawlerCookieDescriptor-object.
 *
 * Missing values get completed from the source-URL: a cookie without domain gets the host of the
 * source-URL (without a leading "."), a cookie without path gets the path of the source-URL.
 * A given domain that doesn't start with "." gets a "." prepended.
 * For cookie-specs, see e.g. http://curl.haxx.se/rfc/cookie_spec.html
 *
 * @param string $source_url URL the cookie was sent from.
 * @param string $name       Cookie-name
 * @param string $value      Cookie-value
 * @param string $expires    Expire-string, e.g. "Sat, 08-Aug-2020 23:59:08 GMT"
 * @param string $path       Cookie-path
 * @param string $domain     Cookie-domain
 * @internal
 */
public function __construct($source_url, $name, $value, $expires = null, $path = null, $domain = null)
{
    $source_parts = PHPCrawlerUtils::splitURL($source_url);

    $this->name = $name;
    $this->value = $value;

    // Where and when the cookie was sent
    $this->source_url = $source_url;
    $this->source_domain = $source_parts["domain"];
    $this->cookie_send_time = PHPCrawlerBenchmark::getmicrotime();

    // Expire-string and (if given) the expire-date as timestamp
    $this->expires = $expires;
    if ($expires != null) {
        $this->expire_timestamp = @strtotime($expires);
    }

    // Domain: fall back to the host of the source-URL, otherwise make sure it starts with "."
    if ($domain == null) {
        $this->domain = $source_parts["host"];
    } elseif (substr($domain, 0, 1) != ".") {
        $this->domain = "." . $domain;
    } else {
        $this->domain = $domain;
    }

    // Path: fall back to the path of the source-URL
    $this->path = ($path == null) ? $source_parts["path"] : $path;
}