/**
 * Parses a robots.txt-file and returns regular-expression-rules corresponding to the contained "disallow"-rules
 * that are addressed to the given user-agent.
 *
 * @param PHPCrawlerURLDescriptor $BaseUrl           The root-URL all rules from the robots.txt-file should relate to
 * @param string                  $user_agent_string The user-agent all rules from the robots.txt-file should relate to
 * @param string                  $robots_txt_uri    Optional. The location of the robots.txt-file as URI.
 *                                                   If not set, the default robots.txt-file for the given BaseUrl gets parsed.
 *
 * @return array Numeric array containing regular-expressions for each "disallow"-rule defined in the robots.txt-file
 *               that's addressed to the given user-agent.
 */
public function parseRobotsTxt(PHPCrawlerURLDescriptor $BaseUrl, $user_agent_string, $robots_txt_uri = null)
{
  PHPCrawlerBenchmark::start("processing_robotstxt");

  // If robots_txt_uri is not given, use the default one for the given BaseUrl
  if ($robots_txt_uri === null)
  {
    $robots_txt_uri = self::getRobotsTxtURL($BaseUrl->url_rebuild);
  }

  // Get robots.txt-content
  $robots_txt_content = PHPCrawlerUtils::getURIContent($robots_txt_uri, $user_agent_string);

  $non_follow_reg_exps = array();

  // If content was found
  if ($robots_txt_content != null)
  {
    // Get all lines in the robots.txt-content that are addressed to our user-agent.
    $applying_lines = $this->getUserAgentLines($robots_txt_content, $user_agent_string);

    // Get valid reg-expressions for the given disallow-paths.
    $non_follow_reg_exps = $this->buildRegExpressions($applying_lines, PHPCrawlerUtils::getRootUrl($BaseUrl->url_rebuild));
  }

  PHPCrawlerBenchmark::stop("processing_robotstxt");

  return $non_follow_reg_exps;
}
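
/*
 * Example usage (illustrative sketch only): the surrounding class is not shown in
 * this excerpt, so the parser class name (PHPCrawlerRobotsTxtParser) and the
 * PHPCrawlerURLDescriptor constructor call below are assumptions.
 *
 *   $parser = new PHPCrawlerRobotsTxtParser();
 *   $base_url = new PHPCrawlerURLDescriptor("http://www.example.com/");
 *
 *   // Fetch and parse http://www.example.com/robots.txt for the given user-agent
 *   $disallow_reg_exps = $parser->parseRobotsTxt($base_url, "MyCrawler/1.0");
 *
 *   // Each returned entry is a regular expression; a URL matching any of them
 *   // is disallowed for this user-agent.
 *   foreach ($disallow_reg_exps as $reg_exp)
 *   {
 *     if (preg_match($reg_exp, "http://www.example.com/private/page.html"))
 *     {
 *       // URL is disallowed by robots.txt, skip it
 *     }
 *   }
 */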