/**
 * Returns an array containing regular expressions corresponding
 * to the given robots.txt-style "Disallow"-lines.
 *
 * @param array  $applying_lines Numeric array containing "Disallow"-lines.
 * @param string $base_url       Base-URL the robots.txt-file was found in.
 *
 * @return array Numeric array containing a regular expression created for each "Disallow"-line.
 */
protected function buildRegExpressions($applying_lines, $base_url)
{
    // First, extract all "Disallow:"-paths from the given lines.
    $disallow_paths = array();

    foreach ($applying_lines as $line) {
        // Case-insensitive match; capture everything after "Disallow:".
        // An empty capture (a bare "Disallow:" meaning "allow all") is skipped.
        preg_match("#^Disallow:\\s*(.*)#i", $line, $match);

        if (!empty($match[1])) {
            $path = trim($match[1]);

            // Ensure a leading slash so the path concatenates cleanly
            // with the normalized base-URL below.
            if (substr($path, 0, 1) != "/") {
                $path = "/" . $path;
            }

            $disallow_paths[] = $path;
        }
    }

    // Works like this:
    // The base-url is http://www.foo.com.
    // The directive says: "Disallow: /bla/"
    // This means: The nonFollowMatch is "#^http://www\.foo\.com/bla/#"
    $normalized_base_url = PHPCrawlerUtils::normalizeURL($base_url);

    $non_follow_expressions = array();

    foreach ($disallow_paths as $path) {
        $non_follow_path_complete = $normalized_base_url . $path;           // "http://www.foo.com/bla/"
        $non_follow_exp = preg_quote($non_follow_path_complete, "#");       // "http://www\.foo\.com/bla/"
        $non_follow_expressions[] = "#^" . $non_follow_exp . "#";           // "#^http://www\.foo\.com/bla/#"
    }

    return $non_follow_expressions;
}
/**
 * Sets the URL of the first page the crawler should crawl (root-page).
 *
 * The given url may contain the protocol (http://www.foo.com or https://www.foo.com), the port
 * (http://www.foo.com:4500/index.php) and/or basic-authentication-data (http://loginname:passwd@www.foo.com)
 *
 * This url has to be set before calling the {@link go()}-method (of course)!
 * If this root-page doesn't contain any further links, the crawling-process will stop immediately.
 *
 * @param string $url The URL
 * @return bool TRUE if the URL was accepted, FALSE for an empty or non-string value.
 *
 * @section 1 Basic settings
 */
public function setURL($url)
{
    // Validate the type BEFORE calling trim(): on PHP 8, passing a
    // non-string to trim() raises a TypeError instead of letting this
    // method return FALSE as its bool contract promises.
    if (!is_string($url)) {
        return false;
    }

    $url = trim($url);

    if ($url === "") {
        return false;
    }

    $this->starting_url = PHPCrawlerUtils::normalizeURL($url);
    return true;
}