/**
  * Parses the robots.txt-file related to the given URL and returns regular-expression-rules
  * corresponding to the "disallow"-rules it contains that are addressed to the given user-agent.
  *
  * @param PHPCrawlerURLDescriptor $Url The URL
  * @param string $user_agent_string User-agent.
  *
  * @return array Numeric array containing regular-expressions for each "disallow"-rule defined in the robots.txt-file
  *               that's addressed to the given user-agent.
  */
 public function parseRobotsTxt(PHPCrawlerURLDescriptor $Url, $user_agent_string)
 {
     // One identifier shared by start() and stop(). The original passed
     // "processing_robots.txt" to stop(), which differs from the identifier
     // given to start(), so the started benchmark was not the one being stopped.
     $benchmark_id = "processing_robotstxt";
     PHPCrawlerBenchmark::start($benchmark_id);

     // Resolve the robots.txt location for the given URL and fetch its content.
     $RobotsTxtUrl = self::getRobotsTxtURL($Url);
     $robots_txt_content = $this->getRobotsTxtContent($RobotsTxtUrl);

     // Stays empty when no robots.txt-content could be obtained.
     $non_follow_reg_exps = array();

     // Loose comparison on purpose (kept from the original): besides null it
     // also skips an empty string or false coming back from the fetch.
     if ($robots_txt_content != null) {
         // Keep only the lines that are addressed to our user-agent ...
         $applying_lines = $this->getApplyingLines($robots_txt_content, $user_agent_string);

         // ... and turn their disallow-paths into regular expressions relative to the root-URL.
         $root_url = PHPCrawlerUtils::getRootUrl($Url->url_rebuild);
         $non_follow_reg_exps = $this->buildRegExpressions($applying_lines, $root_url);
     }

     PHPCrawlerBenchmark::stop($benchmark_id);

     return $non_follow_reg_exps;
 }
 /**
  * Parses a robots.txt-file and returns regular-expression-rules corresponding to the "disallow"-rules it contains
  * that are addressed to the given user-agent.
  *
  * @param PHPCrawlerURLDescriptor $BaseUrl           The root-URL all rules from the robots-txt-file should relate to
  * @param string                  $user_agent_string The useragent all rules from the robots-txt-file should relate to
  * @param string                  $robots_txt_uri    Optional. The location of the robots.txt-file as URI.
  *                                                   If not set, the default robots.txt-file for the given BaseUrl gets parsed.
  *
  * @return array Numeric array containing regular-expressions for each "disallow"-rule defined in the robots.txt-file
  *               that's addressed to the given user-agent.
  */
 public function parseRobotsTxt(PHPCrawlerURLDescriptor $BaseUrl, $user_agent_string, $robots_txt_uri = null)
 {
     // One identifier shared by start() and stop(). The original passed
     // "processing_robots.txt" to stop(), which differs from the identifier
     // given to start(), so the started benchmark was not the one being stopped.
     $benchmark_id = "processing_robotstxt";
     PHPCrawlerBenchmark::start($benchmark_id);

     // If no robots.txt-URI was given, fall back to the default one for the given BaseUrl.
     if ($robots_txt_uri === null) {
         $robots_txt_uri = self::getRobotsTxtURL($BaseUrl->url_rebuild);
     }

     // Fetch the robots.txt-content; the user-agent string is passed along to the fetch helper.
     $robots_txt_content = PHPCrawlerUtils::getURIContent($robots_txt_uri, $user_agent_string);

     // Stays empty when no robots.txt-content could be obtained.
     $non_follow_reg_exps = array();

     // Loose comparison on purpose (kept from the original): besides null it
     // also skips an empty string or false coming back from the fetch.
     if ($robots_txt_content != null) {
         // Keep only the lines that are addressed to our user-agent ...
         $applying_lines = $this->getUserAgentLines($robots_txt_content, $user_agent_string);

         // ... and turn their disallow-paths into regular expressions relative to the root-URL.
         $root_url = PHPCrawlerUtils::getRootUrl($BaseUrl->url_rebuild);
         $non_follow_reg_exps = $this->buildRegExpressions($applying_lines, $root_url);
     }

     PHPCrawlerBenchmark::stop($benchmark_id);

     return $non_follow_reg_exps;
 }