/**
  * Returns all cookies from the cache that are addressed to the given URL.
  *
  * @param string $target_url The target-URL
  * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     $url_parts = PHPCrawlerUtils::splitURL($target_url);
     $target_domain = $url_parts["domain"];
     // e.g. acme.com
     $return_cookies = array();

     // Guard against an unknown domain (the old code relied on @-suppression here).
     // The deprecated each()-loop (removed in PHP 8.0) was replaced with foreach.
     if (isset($this->cookies[$target_domain])) {
         foreach ($this->cookies[$target_domain] as $hash => $Cookie) {
             // Does the cookie-domain match?
             // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
             // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
             // Seems like ".acme.com" should also match "anvil.acme.com", so just remove the leading dot.
             // Bugfix: the dot has to be escaped in the pattern, "#^.#" removed ANY first character.
             $Cookie->domain = preg_replace("#^\.#", "", $Cookie->domain);
             if ($Cookie->domain == $url_parts["host"] || preg_match("#" . preg_quote($Cookie->domain) . "\$#", $url_parts["host"])) {
                 // Does the path match?
                 if (preg_match("#^" . preg_quote($Cookie->path) . "#", $url_parts["path"])) {
                     // Use cookie-name as index to avoid double-cookies
                     $return_cookies[$Cookie->name] = $Cookie;
                 }
             }
         }
     }

     // Convert to numeric array
     return array_values($return_cookies);
 }
 /**
  * Initiates a new PHPCrawlerCookieDescriptor-object.
  *
  * @param string $source_url URL the cookie was send from.
  * @param string $name Cookie-name
  * @param string $value Cookie-value
  * @param string $expires Expire-string, e.g. "Sat, 08-Aug-2020 23:59:08 GMT"
  * @param string $path Cookie-path
  * @param string $domain Cookie-domain
  * @internal
  */
 public function __construct($source_url, $name, $value, $expires = null, $path = null, $domain = null)
 {
     // For cookie-specs, see e.g. http://curl.haxx.se/rfc/cookie_spec.html
     $this->name = $name;
     $this->value = $value;
     $this->expires = $expires;
     $this->path = $path;
     $this->domain = $domain;

     $parts = PHPCrawlerUtils::splitURL($source_url);

     // Remember where the cookie came from
     $this->source_domain = $parts["domain"];
     $this->source_url = $source_url;

     // Time the cookie was received
     $this->cookie_send_time = PHPCrawlerBenchmark::getmicrotime();

     // Convert the expire-string into a unix-timestamp (if an expire-date was given)
     if ($this->expires != null) {
         $this->expire_timestamp = @strtotime($this->expires);
     }

     // Per RFC the cookie-domain has to start with a dot -> prepend it if missing
     if ($this->domain != null && substr($this->domain, 0, 1) != ".") {
         $this->domain = "." . $this->domain;
     }

     // Complete missing values:
     // If domain is not set -> domain is the host of the source-url WITHOUT leading "." (see RFC)
     if ($this->domain == null) {
         $this->domain = $parts["host"];
     }

     // If path is not set -> fall back to the path of the source-URL
     if ($this->path == null) {
         $this->path = $parts["path"];
     }
 }
 /**
  * Initiates a new PHPCrawlerResponseHeader.
  *
  * @param string $header_string A complete response-header as it was send by the server
  * @param string $source_url The URL of the website the header was received from.
  * @internal
  */
 public function __construct($header_string, $source_url)
 {
     $this->header_raw = $header_string;
     $this->source_url = $source_url;

     $this->http_status_code = PHPCrawlerUtils::getHTTPStatusCode($header_string);
     $this->cookies = PHPCrawlerUtils::getCookiesFromHeader($header_string, $source_url);

     // Extract the header-fields of interest, normalized to lowercase
     $fields = array(
         "content_type" => "content-type",
         "content_length" => "content-length",
         "transfer_encoding" => "transfer-encoding",
         "content_encoding" => "content-encoding",
     );
     foreach ($fields as $property => $header_name) {
         $this->$property = strtolower(PHPCrawlerUtils::getHeaderValue($header_string, $header_name));
     }
 }
 /**
  * Returns the PHPCrawlerUrlPartsDescriptor-object for the given URL.
  *
  * @param string $url The URL to split into its parts
  * @return PHPCrawlerUrlPartsDescriptor
  */
 public static function fromURL($url)
 {
     $parts = PHPCrawlerUtils::splitURL($url);
     $Descriptor = new PHPCrawlerUrlPartsDescriptor();

     // Copy every URL-part returned by splitURL() onto the descriptor
     $keys = array("protocol", "host", "path", "file", "domain", "port", "auth_username", "auth_password");
     foreach ($keys as $key) {
         $Descriptor->$key = $parts[$key];
     }

     return $Descriptor;
 }
 /**
  * Adds a basic-authentication (username and password) to the list of authentications that will be send
  * with requests.
  *
  * @param string $url_regex Regular expression defining the URL(s) the authentication should be send to.
  * @param string $username The username
  * @param string $password The password
  *
  * @return bool TRUE if the given regex is valid and the authentication was added, otherwise FALSE.
  */
 public function addBasicAuthentication($url_regex, $username, $password)
 {
     // Reject invalid regex-patterns right away
     if (!PHPCrawlerUtils::checkRegexPattern($url_regex)) {
         return false;
     }

     // Add authentication to the basic_authentications-array
     $this->basic_authentications[] = array(
         "url_regex" => $url_regex,
         "username" => $username,
         "password" => $password,
     );

     return true;
 }
 /**
  * Searches for links in the given HTML-chunk and adds found links to the internal link-cache.
  *
  * Runs three passes over the chunk: well-formed <a>-tags (to capture the linktext),
  * any link-attribute inside other tags, and - in aggressive mode - loosely formed
  * attribute/url()-constructs. The order of the passes matters.
  *
  * @param string &$html_source The HTML-chunk to search in (passed by reference, gets "prepared" in place)
  */
 public function findLinksInHTMLChunk(&$html_source)
 {
     PHPCrawlerBenchmark::start("searching_for_links_in_page");
     // Check for meta-base-URL and meta-tags in top of HTML-source
     // (only done once per document; subsequent chunks skip this)
     if ($this->top_lines_processed == false) {
         $meta_base_url = PHPCrawlerUtils::getBaseUrlFromMetaTag($html_source);
         if ($meta_base_url != null) {
             // A <base href="..."> overrides the document-URL for resolving relative links
             $base_url = PHPCrawlerUtils::buildURLFromLink($meta_base_url, $this->baseUrlParts);
             $this->baseUrlParts = PHPCrawlerUrlPartsDescriptor::fromURL($base_url);
         }
         // Get all meta-tags
         $this->meta_attributes = PHPCrawlerUtils::getMetaTagAttributes($html_source);
         // Set flag that top-lines of source were processed
         $this->top_lines_processed = true;
     }
     // Prepare HTML-chunk
     $this->prepareHTMLChunk($html_source);
     // Build the RegEx-part for html-tags to search links in,
     // i.e. an alternation like "href|src|..." from the configured extract-tags
     $tag_regex_part = "";
     $cnt = count($this->extract_tags);
     for ($x = 0; $x < $cnt; $x++) {
         $tag_regex_part .= "|" . $this->extract_tags[$x];
     }
     // Drop the leading "|"
     $tag_regex_part = substr($tag_regex_part, 1);
     // 1. <a href="...">LINKTEXT</a> (well formed link with </a> at the end and quotes around the link)
     // Get the link AND the linktext from these tags
     // This has to be done FIRST !!
     preg_match_all("#<\\s*a\\s[^<>]*(?<=\\s)(?:" . $tag_regex_part . ")\\s*=\\s*" . "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*>" . "((?:(?!<\\s*\\/a\\s*>).){0,500})" . "<\\s*\\/a\\s*># is", $html_source, $matches);
     $cnt = count($matches[0]);
     for ($x = 0; $x < $cnt; $x++) {
         $link_raw = trim($matches[1][$x]);
         $linktext = $matches[2][$x];
         $linkcode = trim($matches[0][$x]);
         if (!empty($link_raw)) {
             $this->addLinkToCache($link_raw, $linkcode, $linktext);
         }
     }
     // Second regex (everything that could be a link inside of <>-tags)
     // No linktext is available for these matches
     preg_match_all("#<[^<>]*\\s(?:" . $tag_regex_part . ")\\s*=\\s*" . "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*># is", $html_source, $matches);
     $cnt = count($matches[0]);
     for ($x = 0; $x < $cnt; $x++) {
         $link_raw = trim($matches[1][$x]);
         $linktext = "";
         $linkcode = trim($matches[0][$x]);
         if (!empty($link_raw)) {
             $this->addLinkToCache($link_raw, $linkcode, $linktext);
         }
     }
     // Now, if agressive_mode is set to true, we look for some
     // other things (attribute-like constructs outside of proper tags)
     $pregs = array();
     if ($this->aggressive_search == true) {
         // Links like "...:url("animage.gif")..."
         $pregs[] = "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*\\(\\s*([\"|']{0,1})([^\"'\\) ]{1,500})['\"\\)]/ is";
         // Everything like "...href="bla.html"..." with qoutes
         $pregs[] = "/[\\s\\.:;\"'](?:" . $tag_regex_part . ")\\s*=\\s*([\"|'])(.{0,500}?)\\1/ is";
         // Everything like "...href=bla.html..." without qoutes
         $pregs[] = "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*(=)\\s*([^\\s\">']{1,500})/ is";
         for ($x = 0; $x < count($pregs); $x++) {
             unset($matches);
             preg_match_all($pregs[$x], $html_source, $matches);
             $cnt = count($matches[0]);
             for ($y = 0; $y < $cnt; $y++) {
                 // Capture-group 2 holds the raw URL in all three patterns above
                 $link_raw = trim($matches[2][$y]);
                 $linkcode = trim($matches[0][$y]);
                 $linktext = "";
                 $this->addLinkToCache($link_raw, $linkcode, $linktext);
             }
         }
     }
     // Reset the per-chunk duplicate-map after processing
     $this->found_links_map = array();
     PHPCrawlerBenchmark::stop("searching_for_links_in_page");
 }
 /**
  * Adds a Link-Priority-Level
  *
  * @param string $regex Regular expression the URL has to match
  * @param int $level The priority-level for matching URLs
  */
 public function addLinkPriority($regex, $level)
 {
     // Append the new rule at the next numeric index
     $index = count($this->url_priorities);
     $this->url_priorities[$index] = array(
         "match" => trim($regex),
         "level" => trim($level),
     );

     // Keep the priority-list sorted so that high priority-levels come first
     PHPCrawlerUtils::sort2dArray($this->url_priorities, "level", SORT_DESC);
 }
// Example #8
 /**
  * Checks whether the hostname of the given URL is already cached
  *
  * @param PHPCrawlerURLDescriptor $URL The URL
  * @return bool
  */
 public function urlHostInCache(PHPCrawlerURLDescriptor $URL)
 {
     // Extract the host-part from the rebuilt URL and delegate to hostInCache()
     $parts = PHPCrawlerUtils::splitURL($URL->url_rebuild);
     return $this->hostInCache($parts["host"]);
 }
// Example #9
 /**
  * Adds a regular expression together with a priority-level to the list of rules that decide what links should be preferred.
  *
  * Links/URLs that match an expression with a high priority-level will be followed before links with a lower level.
  * All links that don't match with any of the given rules will get the level 0 (lowest level) automatically.
  *
  * The level can be any positive integer.
  *
  * <b>Example:</b>
  *
  * Telling the crawler to follow links that contain the string "forum" before links that contain ".gif" before all other found links.
  * <code>
  * $crawler->addLinkPriority("/forum/", 10);
  * $crawler->addLinkPriority("/\.gif/", 5);
  * </code>
  *
  * @param string $regex Regular expression defining the rule
  * @param int $level The priority-level
  *
  * @return bool  TRUE if a valid preg-pattern is given as argument and was successfully added, otherwise it returns FALSE.
  * @section 10 Other settings
  */
 function addLinkPriority($regex, $level)
 {
     $check = PHPCrawlerUtils::checkRegexPattern($regex);
     // Check pattern and level.
     // Bugfix: "[0-9]*" also matched an empty level-string; "+" requires at least one digit.
     if ($check == true && preg_match("/^[0-9]+\$/", $level)) {
         $c = count($this->link_priority_array);
         $this->link_priority_array[$c]["match"] = trim($regex);
         $this->link_priority_array[$c]["level"] = trim($level);
         return true;
     } else {
         return false;
     }
 }
// Example #10
 /**
  * Adds a rule to the list of rules that decide which URLs found on a page should be ignored by the crawler.
  *
  * @param string $regex Regular expression defining the filter-rule
  * @return bool TRUE if the given regex is valid and the rule was added, otherwise FALSE.
  */
 public function addURLFilterRule($regex)
 {
     // Validate the pattern before storing it
     $is_valid = PHPCrawlerUtils::checkRegexPattern($regex);

     if ($is_valid) {
         $this->url_filter_rules[] = trim($regex);
     }

     return $is_valid;
 }
 /**
  * Sets/writes the current crawler-status
  *
  * @param PHPCrawlerStatus $crawler_status The status to set
  */
 public function setCrawlerStatus(PHPCrawlerStatus $crawler_status)
 {
     $this->crawlerStatus = $crawler_status;

     // Nothing else to do if status-persistence is disabled
     if ($this->write_status_to_file != true) {
         return;
     }

     // Write crawler-status back to the status-file in the working-directory
     PHPCrawlerUtils::serializeToFile($this->working_directory . "crawlerstatus.tmp", $crawler_status);
 }
 /**
  * Returns an array containing regular-expressions corresponding
  * to the given robots.txt-style "Disallow"-lines
  *
  * @param array &$applying_lines Numeric array containing "disallow"-lines.
  * @param string $base_url Base-URL the robots.txt-file was found in.
  *
  * @return array  Numeric array containing regular-expressions created for each "disallow"-line.
  */
 protected function buildRegExpressions($applying_lines, $base_url)
 {
     // First, collect all "Disallow:"-paths (each with a guaranteed leading slash)
     $disallow_pathes = array();
     foreach ($applying_lines as $line) {
         preg_match("#^Disallow:\\s*(.*)# i", $line, $match);
         if (empty($match[1])) {
             continue;
         }
         $path = trim($match[1]);
         // Add leading slash if missing
         if (substr($path, 0, 1) != "/") {
             $path = "/" . $path;
         }
         $disallow_pathes[] = $path;
     }

     // Works like this:
     // The base-url is http://www.foo.com.
     // The directive says: "Disallow: /bla/"
     // This means: The nonFollowMatch is "#^http://www\.foo\.com/bla/#"
     $normalized_base_url = PHPCrawlerUtils::normalizeURL($base_url);

     $non_follow_expressions = array();
     foreach ($disallow_pathes as $path) {
         // e.g. "http://www.foo.com/bla/" -> quoted -> anchored regex
         $quoted = preg_quote($normalized_base_url . $path, "#");
         $non_follow_expressions[] = "#^" . $quoted . "#";
     }

     return $non_follow_expressions;
 }
 /**
  * Returns all cookies from the cache that are addressed to the given URL
  *
  * @param string $target_url The target-URL
  * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     PHPCrawlerBenchmark::start("getting_cookies_from_cache");
     $url_parts = PHPCrawlerUtils::splitURL($target_url);
     $return_cookies = array();

     // Security-fix: use a prepared statement instead of interpolating the
     // URL-derived domain directly into the SQL-string (SQL-injection through
     // crafted host-names in crawled links).
     $Statement = $this->PDO->prepare("SELECT * FROM cookies WHERE source_domain = ?;");
     $Statement->execute(array($url_parts["domain"]));
     $rows = $Statement->fetchAll(PDO::FETCH_ASSOC);
     $Statement->closeCursor();

     $cnt = count($rows);
     for ($x = 0; $x < $cnt; $x++) {
         // Does the cookie-domain match?
         // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
         // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
         if ($rows[$x]["domain"] == $url_parts["host"] || preg_match("#" . preg_quote($rows[$x]["domain"]) . "\$#", $url_parts["host"])) {
             // Does the path match?
             if (preg_match("#^" . preg_quote($rows[$x]["path"]) . "#", $url_parts["path"])) {
                 $Cookie = new PHPCrawlerCookieDescriptor($rows[$x]["source_url"], $rows[$x]["name"], $rows[$x]["value"], $rows[$x]["expires"], $rows[$x]["path"], $rows[$x]["domain"]);
                 // Use cookie-name as index to avoid double-cookies
                 $return_cookies[$Cookie->name] = $Cookie;
             }
         }
     }

     // Convert to numeric array
     $return_cookies = array_values($return_cookies);
     PHPCrawlerBenchmark::stop("getting_cookies_from_cache");
     return $return_cookies;
 }
 /**
  * Adds a rule to the list of rules that decide what kind of documents should get
  * checked for links in (regarding their content-type)
  *
  * @param string $regex Regular-expression defining the rule
  * @return bool         TRUE if the rule was successfully added
  */
 public function addLinkSearchContentType($regex)
 {
     // Only store syntactically valid patterns
     $is_valid = PHPCrawlerUtils::checkRegexPattern($regex);

     if ($is_valid) {
         $this->linksearch_content_types[] = trim($regex);
     }

     return $is_valid;
 }