Ejemplos de PHPCrawl\Utils PHPCrawlerUtils::splitURL en PHP

Lenguaje de programación: PHP

Namespace/Package Name: PHPCrawl\Utils

Clase / Tipo: PHPCrawlerUtils

Método / Función: splitURL

Ejemplos en hotexamples.com: 9

PHP PHPCrawl\Utils PHPCrawlerUtils::splitURL - 9 ejemplos encontrados. Estos son los ejemplos en PHP del mundo real mejor valorados de PHPCrawl\Utils\PHPCrawlerUtils::splitURL extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Métodos usados con frecuencia

Mostrar Ocultar

splitURL(9)

checkRegexPattern(4)

normalizeURL(2)

getHeaderValue(2)

getSystemTempDir(1)

sort2dArray(1)

serializeToFile(1)

rmDir(1)

isValidUrlString(1)

getURIContent(1)

buildURLFromLink(1)

getRootUrl(1)

buildURLFromParts(1)

getMetaTagAttributes(1)

getHTTPStatusCode(1)

getCookiesFromHeader(1)

getBaseUrlFromMetaTag(1)

deserializeFromFile(1)

checkStringAgainstRegexArray(1)

getRedirectURLFromHeader(1)

Ejemplo n.º 1

Mostrar archivo

Archivo: PHPCrawlerCookieDescriptor.php Proyecto: dawid-z/phpcrawl

 /**
  * Builds a cookie descriptor from raw cookie attributes and the URL that sent the cookie.
  *
  * Attributes that were not supplied are completed from the source URL: a missing
  * domain falls back to the host of the source URL, a missing path falls back to
  * the path of the source URL.
  *
  * @param string $source_url URL the cookie was sent from.
  * @param string $name       Cookie name.
  * @param string $value      Cookie value.
  * @param string $expires    Expire string, e.g. "Sat, 08-Aug-2020 23:59:08 GMT".
  * @param string $path       Cookie path.
  * @param string $domain     Cookie domain.
  * @internal
  */
 public function __construct($source_url, $name, $value, $expires = null, $path = null, $domain = null)
 {
     // For cookie specs, see e.g. http://curl.haxx.se/rfc/cookie_spec.html
     $source_parts = PHPCrawlerUtils::splitURL($source_url);

     $this->name = $name;
     $this->value = $value;
     $this->expires = $expires;

     // No path given -> use the path of the source URL
     $this->path = $path == null ? $source_parts["path"] : $path;

     if ($domain == null) {
         // No domain given -> use the host of the source URL WITHOUT a leading "." (see RFC)
         $this->domain = $source_parts["host"];
     } elseif (substr($domain, 0, 1) != ".") {
         // An explicitly given domain always carries a leading "." (see RFC)
         $this->domain = "." . $domain;
     } else {
         $this->domain = $domain;
     }

     // Origin of the cookie and the moment it was received
     $this->source_domain = $source_parts["domain"];
     $this->source_url = $source_url;
     $this->cookie_send_time = PHPCrawlerBenchmark::getmicrotime();

     // Expire string -> timestamp (only when an expire string was given)
     if ($expires != null) {
         $this->expire_timestamp = @strtotime($expires);
     }
 }

Ejemplo n.º 2

Mostrar archivo

Archivo: PHPCrawlerMemoryCookieCache.php Proyecto: dawid-z/phpcrawl

 /**
  * Returns all cached cookies that are addressed to the given URL.
  *
  * Only cookies stored under the domain of the target URL are considered. A cookie
  * is returned when its domain tail-matches the target host and its path is a
  * prefix of the target path. If several matching cookies share a name, the one
  * iterated last wins.
  *
  * The cached cookie objects are not modified by this lookup.
  *
  * @param string $target_url The target URL
  * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     $url_parts = PHPCrawlerUtils::splitURL($target_url);
     $target_domain = $url_parts["domain"]; // e.g. acme.com
     $return_cookies = array();

     // Nothing cached for this domain -> nothing to return
     if (!isset($this->cookies[$target_domain]) || !is_array($this->cookies[$target_domain])) {
         return $return_cookies;
     }

     foreach ($this->cookies[$target_domain] as $Cookie) {
         // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
         // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com".
         // ".acme.com" should also match "anvil.acme.com", so a single leading dot is dropped for the comparison.
         // The dot is escaped (only a literal "." is removed) and the result is kept in a local variable,
         // so the cached cookie keeps its original domain across repeated lookups.
         $match_domain = preg_replace("#^\\.#", "", $Cookie->domain);

         $domain_matches = $match_domain == $url_parts["host"]
             || preg_match("#" . preg_quote($match_domain, "#") . "\$#", $url_parts["host"]);
         if (!$domain_matches) {
             continue;
         }

         // The cookie path has to be a prefix of the target path
         if (preg_match("#^" . preg_quote($Cookie->path, "#") . "#", $url_parts["path"])) {
             // Use cookie-name as index to avoid double-cookies
             $return_cookies[$Cookie->name] = $Cookie;
         }
     }

     // Convert to numeric array
     return array_values($return_cookies);
 }

Ejemplo n.º 3

Mostrar archivo

Archivo: PHPCrawlerUrlPartsDescriptor.php Proyecto: dawid-z/phpcrawl

 /**
  * Creates a PHPCrawlerUrlPartsDescriptor-object populated with the parts of the given URL.
  *
  * The URL is split with PHPCrawlerUtils::splitURL() and each part is copied
  * to the descriptor property of the same name.
  *
  * @param string $url The URL to describe
  * @return PHPCrawlerUrlPartsDescriptor
  */
 public static function fromURL($url)
 {
     $url_parts = PHPCrawlerUtils::splitURL($url);
     $descriptor = new PHPCrawlerUrlPartsDescriptor();

     // Part names are identical on both sides, so copy them over one by one
     $fields = array("protocol", "host", "path", "file", "domain", "port", "auth_username", "auth_password");
     foreach ($fields as $field) {
         $descriptor->{$field} = $url_parts[$field];
     }

     return $descriptor;
 }

Ejemplo n.º 4

Mostrar archivo

Archivo: PHPCrawlerDNSCache.php Proyecto: dawid-z/phpcrawl

 /**
  * Tells whether the host of the given URL is already present in the cache.
  *
  * The host is taken from the rebuilt URL of the descriptor and handed to hostInCache().
  *
  * @param PHPCrawlerURLDescriptor $URL The URL
  * @return bool Result of hostInCache() for the host of the URL
  */
 public function urlHostInCache(PHPCrawlerURLDescriptor $URL)
 {
     $parts = PHPCrawlerUtils::splitURL($URL->url_rebuild);
     $host = $parts["host"];

     return $this->hostInCache($host);
 }

Ejemplo n.º 5

Mostrar archivo

Archivo: PHPCrawler.php Proyecto: dawid-z/phpcrawl

 /**
  * Sets the port to connect to for crawling the starting-url set in setUrl().
  *
  * The default port is 80.
  *
  * Note:
  * <code>
  * $crawler->setURL("http://www.foo.com");
  * $crawler->setPort(443);
  * </code>
  * has the same effect as
  *
  * <code>
  * $crawler->setURL("http://www.foo.com:443");
  * </code>
  *
  * @param int $port The port, a decimal number between 1 and 65535
  * @return bool TRUE if the port was applied to the starting-URL, FALSE if it is not a valid port
  * @section 1 Basic settings
  */
 public function setPort($port)
 {
     // Only plain decimal digits are accepted; "\z" (unlike "$") also rejects a trailing newline
     $port_string = (string) $port;
     if (!preg_match("#^[0-9]{1,5}\\z#", $port_string)) {
         return false;
     }
     // Five digits alone would still allow 0 and 65536-99999, which are not usable ports
     $port_number = (int) $port_string;
     if ($port_number < 1 || $port_number > 65535) {
         return false;
     }
     // Add port to the starting-URL
     $url_parts = PHPCrawlerUtils::splitURL($this->starting_url);
     $url_parts["port"] = $port;
     $this->starting_url = PHPCrawlerUtils::buildURLFromParts($url_parts, true);
     return true;
 }

Ejemplo n.º 6

Mostrar archivo

Archivo: PHPCrawlerURLFilter.php Proyecto: dawid-z/phpcrawl

 /**
  * Decides whether a URL passes all rules configured on this URLFilter.
  *
  * The checks run in this order, and the first failing one rejects the URL:
  * protocol (http/https only), maximum crawling depth, "nofollow" handling,
  * domain/host/path restrictions depending on general_follow_mode,
  * url_filter_rules (any match rejects) and url_follow_rules (at least one
  * must match, if any are defined).
  *
  * @param PHPCrawlerURLDescriptor $url The URL to check
  * @return bool TRUE if the URL matches the defined rules.
  */
 protected function urlMatchesRules(PHPCrawlerURLDescriptor $url)
 {
     // Parts of the candidate URL the rules are checked against
     $candidate = PHPCrawlerUtils::splitURL($url->url_rebuild);

     // Only links of protocol "http" or "https" are accepted
     if (!($candidate["protocol"] == "http://" || $candidate["protocol"] == "https://")) {
         return false;
     }

     // Links beyond the maximum crawling-depth are rejected
     if ($this->max_crawling_depth !== null && $url->url_link_depth > $this->max_crawling_depth) {
         return false;
     }

     if ($this->obey_nofollow_tags == true) {
         // Meta-tag "robots" of the current document contains "nofollow" -> reject
         if (isset($this->CurrentDocumentInfo->meta_attributes["robots"]) && preg_match("#nofollow# i", $this->CurrentDocumentInfo->meta_attributes["robots"])) {
             return false;
         }
         // The link-code itself carries rel="nofollow" -> reject
         if (preg_match("#^<[^>]*rel\\s*=\\s*(?|\"\\s*nofollow\\s*\"|'\\s*nofollow\\s*'|\\s*nofollow\\s*)[^>]*>#", $url->linkcode)) {
             return false;
         }
     }

     // Follow-mode 1 and above: stay on the domain of the starting-URL
     if ($this->general_follow_mode >= 1 && $candidate["domain"] != $this->starting_url_parts["domain"]) {
         return false;
     }

     // Follow-mode 2 and above: stay on the host of the starting-URL.
     // A leading "www." is ignored, because "www.foo.com" is the same host as "foo.com"
     if ($this->general_follow_mode >= 2 && preg_replace("#^www\\.#", "", $candidate["host"]) != preg_replace("#^www\\.#", "", $this->starting_url_parts["host"])) {
         return false;
     }

     // Follow-mode 3: protocol and host must equal those of the starting-URL and
     // the path must begin with the path of the starting-URL (no path-up)
     if ($this->general_follow_mode == 3) {
         $stays_below_start = $candidate["protocol"] == $this->starting_url_parts["protocol"]
             && preg_replace("#^www\\.#", "", $candidate["host"]) == preg_replace("#^www\\.#", "", $this->starting_url_parts["host"])
             && substr($candidate["path"], 0, strlen($this->starting_url_parts["path"])) == $this->starting_url_parts["path"];
         if (!$stays_below_start) {
             return false;
         }
     }

     // A single matching filter-rule rejects the URL
     $filter_rule_count = count($this->url_filter_rules);
     for ($i = 0; $i < $filter_rule_count; $i++) {
         if (preg_match($this->url_filter_rules[$i], $url->url_rebuild)) {
             return false;
         }
     }

     // If follow-rules are defined, at least one of them has to match
     $follow_rule_count = count($this->url_follow_rules);
     if ($follow_rule_count > 0) {
         $followed = false;
         for ($i = 0; $i < $follow_rule_count && !$followed; $i++) {
             $followed = (bool) preg_match($this->url_follow_rules[$i], $url->url_rebuild);
         }
         return $followed;
     }

     return true;
 }

Ejemplo n.º 7

Mostrar archivo

Archivo: PHPCrawlerRobotsTxtParser.php Proyecto: dawid-z/phpcrawl

 /**
  * Builds the default robots.txt-URL belonging to the given URL.
  *
  * The result consists of protocol, host and port of the given URL followed
  * by "/robots.txt"; the port is always written out explicitly.
  *
  * @param string $url The URL
  * @return string URL of the related robots.txt file
  */
 public static function getRobotsTxtURL($url)
 {
     $parts = PHPCrawlerUtils::splitURL($url);

     return sprintf("%s%s:%s/robots.txt", $parts["protocol"], $parts["host"], $parts["port"]);
 }

Ejemplo n.º 8

Mostrar archivo

Archivo: PHPCrawlerSQLiteCookieCache.php Proyecto: dawid-z/phpcrawl

 /**
  * Returns all cached cookies that are addressed to the given URL.
  *
  * Rows are selected from the "cookies" table by the domain of the target URL.
  * A row is turned into a cookie object when its domain equals or tail-matches
  * the target host and its path is a prefix of the target path. If several
  * matching rows share a cookie name, the row read last wins.
  *
  * @param string $target_url The target URL
  * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     PHPCrawlerBenchmark::start("getting_cookies_from_cache");
     $url_parts = PHPCrawlerUtils::splitURL($target_url);
     $return_cookies = array();

     // The domain is derived from a crawled URL, so it is bound as a parameter
     // instead of being concatenated into the SQL string
     $Statement = $this->PDO->prepare("SELECT * FROM cookies WHERE source_domain = ?;");
     $Statement->execute(array($url_parts["domain"]));
     $rows = $Statement->fetchAll(PDO::FETCH_ASSOC);
     $Statement->closeCursor();

     foreach ($rows as $row) {
         // Does the cookie-domain match?
         // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
         // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
         // preg_quote() gets the "#" delimiter so that a "#" in the stored value cannot break the pattern
         $domain_matches = $row["domain"] == $url_parts["host"]
             || preg_match("#" . preg_quote($row["domain"], "#") . "\$#", $url_parts["host"]);
         if (!$domain_matches) {
             continue;
         }

         // Does the path match?
         if (preg_match("#^" . preg_quote($row["path"], "#") . "#", $url_parts["path"])) {
             $Cookie = new PHPCrawlerCookieDescriptor($row["source_url"], $row["name"], $row["value"], $row["expires"], $row["path"], $row["domain"]);
             // Use cookie-name as index to avoid double-cookies
             $return_cookies[$Cookie->name] = $Cookie;
         }
     }

     // Convert to numeric array
     $return_cookies = array_values($return_cookies);
     PHPCrawlerBenchmark::stop("getting_cookies_from_cache");
     return $return_cookies;
 }

Ejemplo n.º 9

Mostrar archivo

Archivo: PHPCrawlerHTTPRequest.php Proyecto: dawid-z/phpcrawl

 /**
  * Assigns the URL this request will be sent to.
  *
  * Stores the given descriptor and, in addition, the parts of its rebuilt URL
  * as returned by PHPCrawlerUtils::splitURL().
  *
  * @param PHPCrawlerURLDescriptor $UrlDescriptor A PHPCrawlerURLDescriptor-object containing the URL to request
  */
 public function setUrl(PHPCrawlerURLDescriptor $UrlDescriptor)
 {
     // Keep the descriptor itself ...
     $this->UrlDescriptor = $UrlDescriptor;

     // ... together with the individual parts of its URL
     $rebuilt_url = $UrlDescriptor->url_rebuild;
     $this->url_parts = PHPCrawlerUtils::splitURL($rebuilt_url);
 }