/**
  * Returns all cookies from the cache that are addressed to the given URL
  *
  * @param string $target_url The target-URL
  * @return array  Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     // Cookies are bucketed by the domain of the URL (e.g. acme.com);
     // only the bucket belonging to the target-URL has to be inspected.
     $url_parts = PHPCrawlerUtils::splitURL($target_url);
     $target_domain = $url_parts["domain"];
     $return_cookies = array();

     // Nothing cached for this domain -> no cookies to return
     if (!isset($this->cookies[$target_domain]) || !is_array($this->cookies[$target_domain])) {
         return $return_cookies;
     }

     foreach ($this->cookies[$target_domain] as $Cookie) {
         // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
         // A domain attribute of "acme.com" matches host names "anvil.acme.com" as well as
         // "shipping.crate.acme.com". ".acme.com" should match them too, so ONE leading dot
         // (and only a dot) is removed before comparing.
         // The stripped value is kept in a local variable: writing it back to the cached
         // cookie-object would shorten the stored domain again on every call.
         $cookie_domain = preg_replace("#^\\.#", "", (string) $Cookie->domain);

         // Does the cookie-domain match? ("#" is passed to preg_quote because it is the delimiter)
         $domain_matches = $cookie_domain == $url_parts["host"]
             || preg_match("#" . preg_quote($cookie_domain, "#") . "\$#", $url_parts["host"]);
         if (!$domain_matches) {
             continue;
         }

         // Does the path match? (cookie-path has to be a prefix of the URL-path)
         if (preg_match("#^" . preg_quote($Cookie->path, "#") . "#", $url_parts["path"])) {
             // Use cookie-name as index to avoid double-cookies
             $return_cookies[$Cookie->name] = $Cookie;
         }
     }

     // Convert to numeric array
     return array_values($return_cookies);
 }
 /**
  * Initializes a new PHPCrawlerResponseHeader.
  *
  * @param string $header_string A complete response-header as it was sent by the server
  * @param string $source_url    The URL of the website the header was received from.
  * @internal
  */
 public function __construct($header_string, $source_url)
 {
     // Keep the raw header and the URL it came from
     $this->source_url = $source_url;
     $this->header_raw = $header_string;

     // Values that are taken over exactly as the utility-class returns them
     $this->http_status_code = PHPCrawlerUtils::getHTTPStatusCode($header_string);
     $this->cookies = PHPCrawlerUtils::getCookiesFromHeader($header_string, $source_url);

     // Header-fields that get stored lower-cased (property-name => header-name)
     $lowercased_fields = array(
         "content_type" => "content-type",
         "content_length" => "content-length",
         "transfer_encoding" => "transfer-encoding",
         "content_encoding" => "content-encoding",
     );
     foreach ($lowercased_fields as $property => $header_name) {
         $this->{$property} = strtolower(PHPCrawlerUtils::getHeaderValue($header_string, $header_name));
     }
 }
 /**
  * Returns the PHPCrawlerUrlPartsDescriptor-object for the given URL.
  *
  * @return PHPCrawlerUrlPartsDescriptor
  */
 public static function fromURL($url)
 {
     $url_parts = PHPCrawlerUtils::splitURL($url);
     $Descriptor = new PHPCrawlerUrlPartsDescriptor();

     // Every listed URL-part is copied 1:1 into the property of the same name
     $fields = array("protocol", "host", "path", "file", "domain", "port", "auth_username", "auth_password");
     foreach ($fields as $field) {
         $Descriptor->{$field} = $url_parts[$field];
     }

     return $Descriptor;
 }
 /**
  * Adds a rule to the list of rules that decide what kind of documents should get
  * checked for links in (regarding their content-type)
  *
  * @param string $regex Regular-expression defining the rule
  * @return bool         TRUE if the rule was successfully added
  */
 public function addLinkSearchContentType($regex)
 {
     // The result of the pattern-check is handed back to the caller in any case
     $pattern_ok = PHPCrawlerUtils::checkRegexPattern($regex);
     if (!$pattern_ok) {
         return $pattern_ok;
     }

     // Valid pattern -> remember it (without surrounding whitespace)
     $this->linksearch_content_types[] = trim($regex);
     return $pattern_ok;
 }
 /**
  * Returns all cookies from the cache that are addressed to the given URL
  *
  * @param string $target_url The target-URL
  * @return array  Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     PHPCrawlerBenchmark::start("getting_cookies_from_cache");

     $url_parts = PHPCrawlerUtils::splitURL($target_url);
     $return_cookies = array();

     // The domain stems from a crawled URL, so it is bound as a parameter
     // instead of being concatenated into the SQL-string.
     $Statement = $this->PDO->prepare("SELECT * FROM cookies WHERE source_domain = ?;");
     $Statement->execute(array($url_parts["domain"]));
     $rows = $Statement->fetchAll(PDO::FETCH_ASSOC);
     $Statement->closeCursor();

     foreach ($rows as $row) {
         // Does the cookie-domain match?
         // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
         // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
         // ("#" is passed to preg_quote because it is the delimiter of the pattern)
         $domain_matches = $row["domain"] == $url_parts["host"]
             || preg_match("#" . preg_quote($row["domain"], "#") . "\$#", $url_parts["host"]);
         if (!$domain_matches) {
             continue;
         }

         // Does the path match? (cookie-path has to be a prefix of the URL-path)
         if (preg_match("#^" . preg_quote($row["path"], "#") . "#", $url_parts["path"])) {
             $Cookie = new PHPCrawlerCookieDescriptor($row["source_url"], $row["name"], $row["value"], $row["expires"], $row["path"], $row["domain"]);
             // Use cookie-name as index to avoid double-cookies
             $return_cookies[$Cookie->name] = $Cookie;
         }
     }

     // Convert to numeric array
     $return_cookies = array_values($return_cookies);

     PHPCrawlerBenchmark::stop("getting_cookies_from_cache");
     return $return_cookies;
 }
 /**
  * Returns the default Robots.txt-URL related to the given URL
  *
  * @param string $url The URL
  * @return string Url of the related robots.txt file
  */
 public static function getRobotsTxtURL($url)
 {
     // Protocol, host and port of the given URL followed by the fixed robots.txt-path
     $parts = PHPCrawlerUtils::splitURL($url);
     return "{$parts['protocol']}{$parts['host']}:{$parts['port']}/robots.txt";
 }
 /**
  * Checks whether the hostname of the given URL is already cached
  *
  * @param PHPCrawlerURLDescriptor $URL The URL
  * @return bool
  */
 public function urlHostInCache(PHPCrawlerURLDescriptor $URL)
 {
     // Only the host-part of the rebuilt URL is relevant for the lookup
     $parts = PHPCrawlerUtils::splitURL($URL->url_rebuild);
     $host = $parts["host"];

     return $this->hostInCache($host);
 }
 /**
  * Adds a basic-authentication (username and password) to the list of authentications that will be sent
  * with requests.
  *
  * @param string $url_regex Regular expression defining the URL(s) the authentication should be sent to.
  * @param string $username  The username
  * @param string $password  The password
  *
  * @return bool
  */
 public function addBasicAuthentication($url_regex, $username, $password)
 {
     // Reject the authentication if the URL-pattern isn't a valid regex
     if (!PHPCrawlerUtils::checkRegexPattern($url_regex)) {
         return false;
     }

     // Append the authentication to the basic_authentications-array
     $this->basic_authentications[] = array(
         "url_regex" => $url_regex,
         "username" => $username,
         "password" => $password,
     );

     return true;
 }
 /**
  * Adds a Link-Priority-Level
  *
  * @param string $regex
  * @param int    $level
  */
 public function addLinkPriority($regex, $level)
 {
     // The new rule goes to the position right behind the existing ones
     $next_index = count($this->url_priorities);
     $this->url_priorities[$next_index] = array(
         "match" => trim($regex),
         "level" => trim($level),
     );

     // Keep the priority-array ordered so that high priority-levels come first
     PHPCrawlerUtils::sort2dArray($this->url_priorities, "level", SORT_DESC);
 }
 /**
  * Adds a link to the LinkFinder-internal link-cache
  *
  * @param string $link_raw        The link like it was found
  * @param string $link_code       The html-code of the link like it was found (i.e. <a href="the_link.html">Link</a>)
  * @param string $link_text       The linktext like it was found
  * @param bool   $is_redirect_url Flag indicating whether the found URL is target of an HTTP-redirect
  */
 protected function addLinkToCache($link_raw, $link_code, $link_text = "", $is_redirect_url = false)
 {
     // A raw link that was already found and processed is skipped
     if (isset($this->found_links_map[$link_raw])) {
         return;
     }

     // Rebuild the full URL from the link; give up if that isn't possible
     $rebuilt_url = PHPCrawlerUtils::buildURLFromLink($link_raw, $this->baseUrlParts);
     if ($rebuilt_url == null) {
         return;
     }

     // Describe the URL; it lies one link-level deeper than the page it was found on
     $Descriptor = new PHPCrawlerURLDescriptor(
         $rebuilt_url,
         $link_raw,
         $link_code,
         $link_text,
         $this->SourceUrl->url_rebuild,
         $this->SourceUrl->url_link_depth + 1
     );

     // Flag targets of HTTP-redirects
     if ($is_redirect_url) {
         $Descriptor->is_redirect_url = true;
     }

     // Hand the descriptor over to the LinkCache and remember the raw link as processed
     $this->LinkCache->addURL($Descriptor);
     $this->found_links_map[$link_raw] = true;
 }
Ejemplo n.º 11
0
 /**
  * Adds a regular expression together with a priority-level to the list of rules that decide what links should be preferred.
  *
  * Links/URLs that match an expression with a high priority-level will be followed before links with a lower level.
  * All links that don't match with any of the given rules will get the level 0 (lowest level) automatically.
  *
  * The level can be any positive integer.
  *
  * <b>Example:</b>
  *
  * Telling the crawler to follow links that contain the string "forum" before links that contain ".gif" before all other found links.
  * <code>
  * $crawler->addLinkPriority("/forum/", 10);
  * $crawler->addLinkPriority("/\.gif/", 5);
  * </code>
  *
  * @param string $regex  Regular expression defining the rule
  * @param int    $level  The priority-level
  *
  * @return bool  TRUE if a valid preg-pattern is given as argument and was successfully added, otherwise it returns FALSE.
  * @section 10 Other settings
  */
 function addLinkPriority($regex, $level)
 {
     // Check pattern
     $pattern_ok = PHPCrawlerUtils::checkRegexPattern($regex);

     // Check level: it has to consist of at least one digit and nothing else.
     // "+" rejects an empty level (which "*" would let through), and the D-modifier
     // keeps "$" from accepting a trailing newline.
     $level_ok = preg_match('/^[0-9]+$/D', (string) $level) === 1;

     if ($pattern_ok == true && $level_ok) {
         // Append the rule behind the already existing ones
         $c = count($this->link_priority_array);
         $this->link_priority_array[$c]["match"] = trim($regex);
         $this->link_priority_array[$c]["level"] = trim($level);
         return true;
     }

     // Invalid pattern or invalid level -> nothing was added
     return false;
 }
 /**
  * Sets/writes the current crawler-status
  *
  * @param PHPCrawlerStatus $crawler_status The status to set
  */
 public function setCrawlerStatus(PHPCrawlerStatus $crawler_status)
 {
     $this->crawlerStatus = $crawler_status;

     // Nothing more to do if the status isn't mirrored to a file
     if (!$this->write_status_to_file) {
         return;
     }

     // Write crawler-status back to the status-file inside the working-directory
     $status_file = $this->working_directory . "crawlerstatus.tmp";
     PHPCrawlerUtils::serializeToFile($status_file, $crawler_status);
 }
 /**
  * Adds a rule to the list of rules that decide which URLs found on a page should be ignored by the crawler.
  */
 public function addURLFilterRule($regex)
 {
     // The result of the pattern-check is handed back to the caller in any case
     $pattern_ok = PHPCrawlerUtils::checkRegexPattern($regex);
     if (!$pattern_ok) {
         return $pattern_ok;
     }

     // Valid pattern -> remember it (without surrounding whitespace)
     $this->url_filter_rules[] = trim($regex);
     return $pattern_ok;
 }
 /**
  * Initializes a new PHPCrawlerCookieDescriptor-object.
  *
  * @param string $source_url URL the cookie was sent from.
  * @param string $name       Cookie-name
  * @param string $value      Cookie-value
  * @param string $expires    Expire-string, e.g. "Sat, 08-Aug-2020 23:59:08 GMT"
  * @param string $path       Cookie-path
  * @param string $domain     Cookie-domain
  * @internal
  */
 public function __construct($source_url, $name, $value, $expires = null, $path = null, $domain = null)
 {
     // For cookie-specs, see e.g. http://curl.haxx.se/rfc/cookie_spec.html
     $url_parts = PHPCrawlerUtils::splitURL($source_url);

     // Plain values
     $this->name = $name;
     $this->value = $value;

     // Where and when the cookie was sent
     $this->source_url = $source_url;
     $this->source_domain = $url_parts["domain"];
     $this->cookie_send_time = PHPCrawlerBenchmark::getmicrotime();

     // Expire-string and, if one was given, the corresponding timestamp
     $this->expires = $expires;
     if ($expires != null) {
         $this->expire_timestamp = @strtotime($expires);
     }

     // Domain:
     // - not set      -> host of the source-URL WITHOUT leading "." (see RFC)
     // - no leading . -> gets one prepended (see RFC)
     // - otherwise    -> taken over as given
     if ($domain == null) {
         $this->domain = $url_parts["host"];
     } elseif (substr($domain, 0, 1) != ".") {
         $this->domain = "." . $domain;
     } else {
         $this->domain = $domain;
     }

     // Path: falls back to the path of the source-URL if not set
     $this->path = ($path == null) ? $url_parts["path"] : $path;
 }