/**
  * Initializes a new PHPCrawlerCookieDescriptor-object.
  *
  * @param string $source_url URL the cookie was sent from.
  * @param string $name Cookie-name
  * @param string $value Cookie-value
  * @param string $expires Expire-string, e.g. "Sat, 08-Aug-2020 23:59:08 GMT"
  * @param string $path Cookie-path
  * @param string $domain Cookie-domain
  * @internal
  */
 public function __construct($source_url, $name, $value, $expires = null, $path = null, $domain = null)
 {
     // For cookie-specs, see e.g. http://curl.haxx.se/rfc/cookie_spec.html
     $this->name = $name;
     $this->value = $value;
     $this->expires = $expires;
     $this->path = $path;
     $this->domain = $domain;
     $source_url_parts = PHPCrawlerUtils::splitURL($source_url);
     // Source-domain
     $this->source_domain = $source_url_parts["domain"];
     // Source-URL
     $this->source_url = $source_url;
     // Send-time
     $this->cookie_send_time = PHPCrawlerBenchmark::getmicrotime();
      // Convert expire-date to timestamp
     if ($this->expires != null) {
         $this->expire_timestamp = @strtotime($this->expires);
     }
     // If domain doesn't start with "." -> add it (see RFC)
     if ($this->domain != null && substr($this->domain, 0, 1) != ".") {
         $this->domain = "." . $this->domain;
     }
      // Complete missing values
      // If domain not set -> domain is the host of the source-url WITHOUT leading "."! (see RFC)
     if ($this->domain == null) {
         $this->domain = $source_url_parts["host"];
     }
     // If path not set
     if ($this->path == null) {
         $this->path = $source_url_parts["path"];
     }
 }
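  // Usage sketch (not part of the class): how the constructor completes missing cookie
  // attributes. The URL and cookie values below are illustrative assumptions.
  $cookie = new PHPCrawlerCookieDescriptor(
      "http://www.example.com/shop/index.php", // URL the cookie was sent from
      "sessionid",                             // cookie-name
      "abc123",                                // cookie-value
      "Sat, 08-Aug-2020 23:59:08 GMT"          // expire-string, converted to a timestamp
  );
  // No path and no domain were given, so the constructor falls back to the source-URL:
  // $cookie->domain is "www.example.com" and, assuming PHPCrawlerUtils::splitURL() returns
  // the directory part of the URL as "path", $cookie->path is "/shop/".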
 /**
   * Returns all cookies from the cache that are addressed to the given URL
   *
   * @param string $target_url The target-URL
   * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     $url_parts = PHPCrawlerUtils::splitURL($target_url);
     $target_domain = $url_parts["domain"];
     // e.g. acme.com
     $return_cookies = array();
      // Iterate over all cookies cached for this domain (if there are any)
      if (isset($this->cookies[$target_domain])) {
          foreach ($this->cookies[$target_domain] as $Cookie) {
              // Does the cookie-domain match?
              // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
              // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
              // ".acme.com" should also match "anvil.acme.com", so strip the leading dot
              $Cookie->domain = preg_replace("#^\.#", "", $Cookie->domain);
              if ($Cookie->domain == $url_parts["host"] || preg_match("#" . preg_quote($Cookie->domain, "#") . "\$#", $url_parts["host"])) {
                  // Does the path match?
                  if (preg_match("#^" . preg_quote($Cookie->path, "#") . "#", $url_parts["path"])) {
                      // Use the cookie-name as index to avoid duplicate cookies
                      $return_cookies[$Cookie->name] = $Cookie;
                  }
              }
          }
      }
     // Convert to numeric array
     $return_cookies = array_values($return_cookies);
     return $return_cookies;
 }
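  // Illustrative call (the owning cookie-cache class, here $CookieCache, and the way it was
  // filled beforehand are assumptions; the URL is made up): a cached cookie with domain
  // ".acme.com" and path "/" tail-matches the host "anvil.acme.com" and is therefore returned.
  $matching = $CookieCache->getCookiesForUrl("http://anvil.acme.com/shipping/track.php");
  foreach ($matching as $Cookie) {
      echo $Cookie->name . "=" . $Cookie->value . "\n";
  }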
 /**
   * Returns the PHPCrawlerUrlPartsDescriptor-object for the given URL.
   *
   * @param string $url The URL
   * @return PHPCrawlerUrlPartsDescriptor
  */
 public static function fromURL($url)
 {
     $parts = PHPCrawlerUtils::splitURL($url);
     $tmp = new PHPCrawlerUrlPartsDescriptor();
     $tmp->protocol = $parts["protocol"];
     $tmp->host = $parts["host"];
     $tmp->path = $parts["path"];
     $tmp->file = $parts["file"];
     $tmp->domain = $parts["domain"];
     $tmp->port = $parts["port"];
     $tmp->auth_username = $parts["auth_username"];
     $tmp->auth_password = $parts["auth_password"];
     return $tmp;
 }
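  // Minimal sketch of fromURL() in use (the URL is illustrative; the expected values assume
  // the usual PHPCrawlerUtils::splitURL() conventions seen elsewhere in this library):
  $parts = PHPCrawlerUrlPartsDescriptor::fromURL("http://user:pass@www.example.com:8080/dir/page.php");
  // $parts->protocol      == "http://"
  // $parts->host          == "www.example.com"
  // $parts->port          == 8080
  // $parts->path          == "/dir/"
  // $parts->file          == "page.php"
  // $parts->auth_username == "user"
  // $parts->auth_password == "pass"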
 /**
  * Checks whether the hostname of the given URL is already cached
  *
  * @param PHPCrawlerURLDescriptor $URL The URL
  * @return bool
  */
 public function urlHostInCache(PHPCrawlerURLDescriptor $URL)
 {
     $url_parts = PHPCrawlerUtils::splitURL($URL->url_rebuild);
     return $this->hostInCache($url_parts["host"]);
 }
 /**
   * Sets the port to connect to for crawling the starting-URL set in setURL().
  *
  * The default port is 80.
  *
  * Note:
  * <code>
   * $crawler->setURL("http://www.foo.com");
   * $crawler->setPort(443);
   * </code>
   * has the same effect as
   *
   * <code>
   * $crawler->setURL("http://www.foo.com:443");
  * </code>
  *
  * @param int $port The port
  * @return bool
  * @section 1 Basic settings
  */
 public function setPort($port)
 {
     // Check port
     if (!preg_match("#^[0-9]{1,5}\$#", $port)) {
         return false;
     }
     // Add port to the starting-URL
     $url_parts = PHPCrawlerUtils::splitURL($this->starting_url);
     $url_parts["port"] = $port;
     $this->starting_url = PHPCrawlerUtils::buildURLFromParts($url_parts, true);
     return true;
 }
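  // Usage sketch ($crawler is assumed to be a PHPCrawler instance; the URL is illustrative):
  // setPort() simply rewrites the port inside the starting-URL that was set via setURL(),
  // so both variants below end up requesting port 8080.
  $crawler->setURL("http://www.example.com/");
  $crawler->setPort(8080);
  // ...is equivalent to:
  $crawler->setURL("http://www.example.com:8080/");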
 /**
  * Checks whether a given URL matches the rules applied to the URLFilter.
  *
   * @param PHPCrawlerURLDescriptor $url The URL as a PHPCrawlerURLDescriptor-object
  * @return bool TRUE if the URL matches the defined rules.
  */
 protected function urlMatchesRules(PHPCrawlerURLDescriptor $url)
 {
     // URL-parts of the URL to check against the filter-rules
     $url_parts = PHPCrawlerUtils::splitURL($url->url_rebuild);
     // Kick out all links that are NOT of protocol "http" or "https"
     if ($url_parts["protocol"] != "http://" && $url_parts["protocol"] != "https://") {
         return false;
     }
     // Kick out URLs exceeding the maximum crawling-depth
     if ($this->max_crawling_depth !== null && $url->url_link_depth > $this->max_crawling_depth) {
         return false;
     }
     // If meta-tag "robots"->"nofollow" is present and obey_nofollow_tags is TRUE -> always kick out URL
     if ($this->obey_nofollow_tags == true && isset($this->CurrentDocumentInfo->meta_attributes["robots"]) && preg_match("#nofollow# i", $this->CurrentDocumentInfo->meta_attributes["robots"])) {
         return false;
     }
     // If linkcode contains "rel='nofollow'" and obey_nofollow_tags is TRUE -> always kick out URL
     if ($this->obey_nofollow_tags == true) {
         if (preg_match("#^<[^>]*rel\\s*=\\s*(?|\"\\s*nofollow\\s*\"|'\\s*nofollow\\s*'|\\s*nofollow\\s*)[^>]*>#", $url->linkcode)) {
             return false;
         }
     }
     // Filter URLs to other domains if wanted
     if ($this->general_follow_mode >= 1) {
         if ($url_parts["domain"] != $this->starting_url_parts["domain"]) {
             return false;
         }
     }
     // Filter URLs to other hosts if wanted
     if ($this->general_follow_mode >= 2) {
         // Ignore "www." at the beginning of the host, because "www.foo.com" is the same host as "foo.com"
         if (preg_replace("#^www\\.#", "", $url_parts["host"]) != preg_replace("#^www\\.#", "", $this->starting_url_parts["host"])) {
             return false;
         }
     }
     // Filter URLs leading path-up if wanted
     if ($this->general_follow_mode == 3) {
         if ($url_parts["protocol"] != $this->starting_url_parts["protocol"] || preg_replace("#^www\\.#", "", $url_parts["host"]) != preg_replace("#^www\\.#", "", $this->starting_url_parts["host"]) || substr($url_parts["path"], 0, strlen($this->starting_url_parts["path"])) != $this->starting_url_parts["path"]) {
             return false;
         }
     }
     // Filter URLs by url_filter_rules
     for ($x = 0; $x < count($this->url_filter_rules); $x++) {
         if (preg_match($this->url_filter_rules[$x], $url->url_rebuild)) {
             return false;
         }
     }
     // Filter URLs by url_follow_rules
     if (count($this->url_follow_rules) > 0) {
         $match_found = false;
         for ($x = 0; $x < count($this->url_follow_rules); $x++) {
             if (preg_match($this->url_follow_rules[$x], $url->url_rebuild)) {
                 $match_found = true;
                 break;
             }
         }
         if ($match_found == false) {
             return false;
         }
     }
     return true;
 }
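  // Sketch of the public settings that feed into urlMatchesRules() (method names are taken
  // from the PHPCrawl API; the regexes and URLs are illustrative): stay on the starting host,
  // never follow image-links, and only follow URLs below /docs/.
  $crawler->setURL("http://www.example.com/docs/index.html");
  $crawler->setFollowMode(2);                                       // same host only ("www." is ignored)
  $crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)\$#i");          // kick out URLs matching this rule
  $crawler->addURLFollowRule("#^http://www\.example\.com/docs/#");  // only follow URLs matching this rule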
 /**
  * Returns the default Robots.txt-URL related to the given URL
  *
  * @param string $url The URL
   * @return string URL of the related robots.txt file
  */
 public static function getRobotsTxtURL($url)
 {
     $url_parts = PHPCrawlerUtils::splitURL($url);
     $robots_txt_url = $url_parts["protocol"] . $url_parts["host"] . ":" . $url_parts["port"] . "/robots.txt";
     return $robots_txt_url;
 }
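  // Minimal sketch (the owning class is not visible in this excerpt, so the static call below
  // uses a hypothetical RobotsTxtHandler; the URL is illustrative): the path of the given URL
  // is ignored, only protocol, host and port are used.
  $robots_url = RobotsTxtHandler::getRobotsTxtURL("http://www.example.com/some/deep/page.html");
  // Assuming splitURL() reports port 80 for a plain http-URL, $robots_url is
  // "http://www.example.com:80/robots.txt"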
 /**
   * Returns all cookies from the cache that are addressed to the given URL
   *
   * @param string $target_url The target-URL
   * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     PHPCrawlerBenchmark::start("getting_cookies_from_cache");
     $url_parts = PHPCrawlerUtils::splitURL($target_url);
     $return_cookies = array();
      $Result = $this->PDO->query("SELECT * FROM cookies WHERE source_domain = " . $this->PDO->quote($url_parts["domain"]) . ";");
     $rows = $Result->fetchAll(PDO::FETCH_ASSOC);
     $Result->closeCursor();
     $cnt = count($rows);
     for ($x = 0; $x < $cnt; $x++) {
         // Does the cookie-domain match?
         // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
         // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
         if ($rows[$x]["domain"] == $url_parts["host"] || preg_match("#" . preg_quote($rows[$x]["domain"]) . "\$#", $url_parts["host"])) {
             // Does the path match?
             if (preg_match("#^" . preg_quote($rows[$x]["path"]) . "#", $url_parts["path"])) {
                 $Cookie = new PHPCrawlerCookieDescriptor($rows[$x]["source_url"], $rows[$x]["name"], $rows[$x]["value"], $rows[$x]["expires"], $rows[$x]["path"], $rows[$x]["domain"]);
                 $return_cookies[$Cookie->name] = $Cookie;
                 // Use cookie-name as index to avoid double-cookies
             }
         }
     }
     // Convert to numeric array
     $return_cookies = array_values($return_cookies);
     PHPCrawlerBenchmark::stop("getting_cookies_from_cache");
     return $return_cookies;
 }
 /**
  * Sets the URL for the request.
  *
   * @param PHPCrawlerURLDescriptor $UrlDescriptor A PHPCrawlerURLDescriptor-object containing the URL to request
  */
 public function setUrl(PHPCrawlerURLDescriptor $UrlDescriptor)
 {
     $this->UrlDescriptor = $UrlDescriptor;
     // Split the URL into its parts
     $this->url_parts = PHPCrawlerUtils::splitURL($UrlDescriptor->url_rebuild);
 }
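  // Minimal sketch (the owning request class is an assumption, as is passing only the rebuilt
  // URL to the PHPCrawlerURLDescriptor constructor):
  $Request = new PHPCrawlerHTTPRequest();
  $Request->setUrl(new PHPCrawlerURLDescriptor("http://www.example.com/index.html"));
  // After this call, the request's url_parts member holds protocol, host, path etc. of the URL.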