/**
 * Builds regular expressions corresponding to the given
 * robots.txt-style "Disallow"-lines.
 *
 * @param array  &$applying_lines Numeric array containing "Disallow"-lines.
 * @param string $base_url        Base-URL the robots.txt-file was found in.
 *
 * @return array Numeric array containing one regular expression for each
 *               non-empty "Disallow"-path.
 */
protected function buildRegExpressions(&$applying_lines, $base_url)
{
    // First, collect all "Disallow:"-paths.
    // One combined, case-insensitive match replaces the original
    // check + capture pair: the check was case-insensitive but the capture
    // was not, so lowercase "disallow:" lines were silently dropped.
    $disallow_paths = array();
    $line_count = count($applying_lines);
    for ($x = 0; $x < $line_count; $x++) {
        if (preg_match("#^Disallow:[ ]*(.*)#i", $applying_lines[$x], $match)) {
            $path = trim($match[1]);
            // An empty disallow-path means "allow everything" -> ignore it
            if ($path == "") {
                continue;
            }
            // Ensure a leading slash so the concatenation below stays valid
            // even for slightly malformed directives like "Disallow: bla/"
            // (the original blindly stripped the first character instead).
            if (substr($path, 0, 1) != "/") {
                $path = "/" . $path;
            }
            $disallow_paths[] = $path;
        }
    }

    // Works like this:
    // The base-url is http://www.foo.com.
    // The directive says: "Disallow: /bla/"
    // This means: The nonFollowMatch is "#^http://www\.foo\.com/bla/#"
    $normalized_base_url = PHPCrawlerUtils::normalizeURL($base_url);
    $non_follow_expressions = array();
    $path_count = count($disallow_paths);
    for ($x = 0; $x < $path_count; $x++) {
        // "http://www.foo.com/bla/"
        $non_follow_path_complete = $normalized_base_url . $disallow_paths[$x];
        // preg_quote() escapes regex meta-characters; "#" is the delimiter.
        // -> "#^http://www\.foo\.com/bla/#"
        $non_follow_expressions[] = "#^" . preg_quote($non_follow_path_complete, "#") . "#";
    }

    return $non_follow_expressions;
}
 /**
  * Returns an array containig regular-expressions corresponding
  * to the given robots.txt-style "Disallow"-lines
  *
  * @param array &$applying_lines Numeric array containing "disallow"-lines.
  * @param string $base_url       Base-URL the robots.txt-file was found in.
  *
  * @return array  Numeric array containing regular-expresseions created for each "disallow"-line.
  */
 protected function buildRegExpressions($applying_lines, $base_url)
 {
     // First, get all "Disallow:"-pathes
     $disallow_pathes = array();
     $cnt = count($applying_lines);
     for ($x = 0; $x < $cnt; $x++) {
         preg_match("#^Disallow:\\s*(.*)# i", $applying_lines[$x], $match);
         if (!empty($match[1])) {
             $path = trim($match[1]);
             // Add leading slash
             if (substr($path, 0, 1) != "/") {
                 $path = "/" . $path;
             }
             $disallow_pathes[] = $path;
         }
     }
     // Works like this:
     // The base-url is http://www.foo.com.
     // The driective says: "Disallow: /bla/"
     // This means: The nonFollowMatch is "#^http://www\.foo\.com/bla/#"
     $normalized_base_url = PHPCrawlerUtils::normalizeURL($base_url);
     $non_follow_expressions = array();
     $cnt = count($disallow_pathes);
     for ($x = 0; $x < $cnt; $x++) {
         $non_follow_path_complpete = $normalized_base_url . $disallow_pathes[$x];
         // "http://www.foo.com/bla/"
         $non_follow_exp = preg_quote($non_follow_path_complpete, "#");
         // "http://www\.foo\.com/bla/"
         $non_follow_exp = "#^" . $non_follow_exp . "#";
         // "#^http://www\.foo\.com/bla/#"
         $non_follow_expressions[] = $non_follow_exp;
     }
     return $non_follow_expressions;
 }
Exemplo n.º 3
0
 /**
  * Sets the URL of the first page the crawler should crawl (root-page).
  *
  * The given url may contain the protocol (http://www.foo.com or https://www.foo.com), the port (http://www.foo.com:4500/index.php)
  * and/or basic-authentication-data (http://loginname:passwd@www.foo.com)
  *
  * This url has to be set before calling the {@link go()}-method (of course)!
  * If this root-page doesn't contain any further links, the crawling-process will stop immediately.
  *
  * @param string $url The URL
  * @return bool
  *
  * @section 1 Basic settings
  */
 public function setURL($url)
 {
     $url = trim($url);
     if ($url != "" && is_string($url)) {
         $this->starting_url = PHPCrawlerUtils::normalizeURL($url);
         return true;
     } else {
         return false;
     }
 }
Exemplo n.º 4
0
 function setURL($url)
 {
     $this->initCrawler();
     $url = trim($url);
     if ($url != "" && is_string($url)) {
         if (substr($url, 0, 7) != "http://" && substr($url, 0, 8) != "https://") {
             $url = "http://" . $url;
         }
         $this->url_to_crawl = PHPCrawlerUtils::normalizeURL($url);
         return true;
     } else {
         return false;
     }
 }