/**
 * Returns an array containing regular expressions corresponding
 * to the given robots.txt-style "Disallow"-lines.
 *
 * @param array  &$applying_lines Numeric array containing "Disallow"-lines.
 * @param string $base_url        Base-URL the robots.txt-file was found in.
 *
 * @return array Numeric array containing the regular expressions created for each "Disallow"-line.
 */
protected function buildRegExpressions(&$applying_lines, $base_url)
{
    // First, get all "Disallow:"-paths
    $disallow_paths = array();
    for ($x = 0; $x < count($applying_lines); $x++) {
        if (preg_match("#^Disallow:#i", $applying_lines[$x])) {
            // Capture everything after "Disallow:" (case-insensitive, like the check above)
            preg_match("#^Disallow:[ ]*(.*)#i", $applying_lines[$x], $match);

            if (isset($match[1])) {
                $disallow_paths[] = trim($match[1]);
            }
        }
    }

    // Works like this:
    // The base-URL is http://www.foo.com.
    // The directive says: "Disallow: /bla/"
    // This means: The nonFollowMatch is "#^http://www\.foo\.com/bla/#"
    $normalized_base_url = PHPCrawlerUtils::normalizeURL($base_url);

    $non_follow_expressions = array();

    for ($x = 0; $x < count($disallow_paths); $x++) {
        // If the disallow-path is empty -> simply ignore it
        if ($disallow_paths[$x] == "") {
            continue;
        }

        // Append the path to the base-URL (the path's own leading "/" is dropped
        // and replaced by the one added here, so paths are assumed to start with "/")
        $non_follow_path_complete = $normalized_base_url . "/" . substr($disallow_paths[$x], 1); // "http://www.foo.com/bla/"
        $non_follow_exp = preg_quote($non_follow_path_complete, "#");                            // "http://www\.foo\.com/bla/"
        $non_follow_exp = "#^" . $non_follow_exp . "#";                                          // "#^http://www\.foo\.com/bla/#"

        $non_follow_expressions[] = $non_follow_exp;
    }

    return $non_follow_expressions;
}
/**
 * Returns an array containing regular expressions corresponding
 * to the given robots.txt-style "Disallow"-lines.
 *
 * @param array  $applying_lines Numeric array containing "Disallow"-lines.
 * @param string $base_url       Base-URL the robots.txt-file was found in.
 *
 * @return array Numeric array containing the regular expressions created for each "Disallow"-line.
 */
protected function buildRegExpressions($applying_lines, $base_url)
{
    // First, get all "Disallow:"-paths
    $disallow_paths = array();
    $cnt = count($applying_lines);
    for ($x = 0; $x < $cnt; $x++) {
        preg_match("#^Disallow:\\s*(.*)#i", $applying_lines[$x], $match);

        if (!empty($match[1])) {
            $path = trim($match[1]);

            // Add a leading slash if the path doesn't start with one
            if (substr($path, 0, 1) != "/") {
                $path = "/" . $path;
            }

            $disallow_paths[] = $path;
        }
    }

    // Works like this:
    // The base-URL is http://www.foo.com.
    // The directive says: "Disallow: /bla/"
    // This means: The nonFollowMatch is "#^http://www\.foo\.com/bla/#"
    $normalized_base_url = PHPCrawlerUtils::normalizeURL($base_url);

    $non_follow_expressions = array();

    $cnt = count($disallow_paths);
    for ($x = 0; $x < $cnt; $x++) {
        $non_follow_path_complete = $normalized_base_url . $disallow_paths[$x]; // "http://www.foo.com/bla/"
        $non_follow_exp = preg_quote($non_follow_path_complete, "#");           // "http://www\.foo\.com/bla/"
        $non_follow_exp = "#^" . $non_follow_exp . "#";                         // "#^http://www\.foo\.com/bla/#"

        $non_follow_expressions[] = $non_follow_exp;
    }

    return $non_follow_expressions;
}
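// A minimal usage sketch (not part of the original source): how the
// expressions returned by buildRegExpressions() could be applied to decide
// whether a URL is blocked by a "Disallow"-rule. The function name
// urlMatchesDisallowRules() and its parameters are illustrative assumptions.
function urlMatchesDisallowRules($url, $non_follow_expressions)
{
    $cnt = count($non_follow_expressions);
    for ($x = 0; $x < $cnt; $x++) {
        // Each expression is anchored at the beginning of the URL, e.g.
        // "#^http://www\.foo\.com/bla/#", so every URL below a disallowed
        // path matches as well.
        if (preg_match($non_follow_expressions[$x], $url)) {
            return true; // URL is covered by a "Disallow"-rule
        }
    }

    return false; // no rule applies, the URL may be followed
}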
/**
 * Sets the URL of the first page the crawler should crawl (root-page).
 *
 * The given URL may contain the protocol (http://www.foo.com or https://www.foo.com), the port
 * (http://www.foo.com:4500/index.php) and/or basic-authentication-data (http://loginname:passwd@www.foo.com).
 *
 * This URL has to be set before calling the {@link go()}-method (of course)!
 * If this root-page doesn't contain any further links, the crawling-process will stop immediately.
 *
 * @param string $url The URL
 * @return bool
 *
 * @section 1 Basic settings
 */
public function setURL($url)
{
    // Check the type before calling trim() (trim() on a non-string fails in PHP 8+)
    if (!is_string($url)) {
        return false;
    }

    $url = trim($url);

    if ($url != "") {
        $this->starting_url = PHPCrawlerUtils::normalizeURL($url);
        return true;
    } else {
        return false;
    }
}
function setURL($url)
{
    $this->initCrawler();

    // Check the type before calling trim() (trim() on a non-string fails in PHP 8+)
    if (!is_string($url)) {
        return false;
    }

    $url = trim($url);

    if ($url != "") {
        // Prepend the protocol if the given URL doesn't contain one
        if (substr($url, 0, 7) != "http://" && substr($url, 0, 8) != "https://") {
            $url = "http://" . $url;
        }

        $this->url_to_crawl = PHPCrawlerUtils::normalizeURL($url);
        return true;
    } else {
        return false;
    }
}
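// A minimal usage sketch (an assumption, not from the original source): the
// root-URL gets set before the crawling-process is started. MyCrawler is a
// hypothetical subclass of the crawler class; go() is the method the
// docblock above refers to.
$crawler = new MyCrawler();

if ($crawler->setURL("www.foo.com") == true) { // "http://" gets prepended automatically
    $crawler->go();
}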