/**
  * A wildcard "Allow: /" rule must permit every path.
  *
  * @link https://github.com/t1gor/Robots.txt-Parser-Class/issues/22
  */
 public function testAllowWildcard()
 {
     $robotsTxt = "\n\t\t\tUser-agent: *\n\t\t\tAllow: /\n\t\t";
     $parser = new RobotsTxtParser($robotsTxt);

     // Both the root and an arbitrary path must be allowed.
     foreach (array('/index', '/') as $path) {
         $this->assertFalse($parser->isDisallowed($path));
         $this->assertTrue($parser->isAllowed($path));
     }
 }
 /**
  * Verifies that a Crawl-delay directive is parsed into the rule set.
  *
  * @dataProvider generateDataForTest
  * @covers RobotsTxtParser::isDisallowed
  * @covers RobotsTxtParser::checkRule
  * @param string $robotsTxtContent
  */
 public function testCrawlDelay($robotsTxtContent)
 {
     $parser = new RobotsTxtParser($robotsTxtContent);
     $this->assertInstanceOf('RobotsTxtParser', $parser);
     $this->assertObjectHasAttribute('rules', $parser);

     $rules = $parser->getRules();
     $this->assertArrayHasKey('ahrefsbot', $rules);

     $botRules = $rules['ahrefsbot'];
     $this->assertArrayHasKey('crawl-delay', $botRules);
     $this->assertEquals(1.5, $botRules['crawl-delay']);
 }
 /**
  * Verifies that a Clean-param directive is parsed into the '*' rule set.
  *
  * @link https://help.yandex.ru/webmaster/controlling-robot/robots-txt.xml#clean-param
  *
  * @dataProvider generateDataForTest
  * @covers RobotsTxtParser::isDisallowed
  * @covers RobotsTxtParser::checkRule
  * @param string $robotsTxtContent
  */
 public function testCleanParam($robotsTxtContent, $message = NULL)
 {
     $parser = new RobotsTxtParser($robotsTxtContent);
     $this->assertInstanceOf('RobotsTxtParser', $parser);
     $this->assertObjectHasAttribute('rules', $parser);

     $rules = $parser->getRules();
     $this->assertArrayHasKey('*', $rules);

     $defaultRules = $rules['*'];
     $this->assertArrayHasKey('clean-param', $defaultRules);
     $this->assertEquals(array('utm_source&utm_medium&utm.campaign'), $defaultRules['clean-param'], $message);
 }
 /**
  * Verifies that a Host directive is parsed into the '*' rule set.
  *
  * @dataProvider generateDataForTest
  * @covers RobotsTxtParser::isDisallowed
  * @covers RobotsTxtParser::checkRule
  * @param string $robotsTxtContent
  */
 public function testHost($robotsTxtContent)
 {
     $parser = new RobotsTxtParser($robotsTxtContent);
     $this->assertInstanceOf('RobotsTxtParser', $parser);
     $this->assertObjectHasAttribute('rules', $parser);

     $rules = $parser->getRules();
     $this->assertArrayHasKey('*', $rules);

     $defaultRules = $rules['*'];
     $this->assertArrayHasKey('host', $defaultRules);
     $this->assertEquals('www.example.com', $defaultRules['host']);
 }
/**
 * Fetches the current site's robots.txt and echoes one HTML table row
 * reporting whether GoogleBot may crawl the site root ('/').
 *
 * Columns: check description | expected (' Enable ') | actual status
 * (green 'Enable' / red 'Blocked') | verdict ('Passed' / 'FAILED').
 *
 * Echoes nothing when the RobotsTxtParser class is unavailable or the
 * robots.txt file cannot be fetched.
 *
 * @return void
 */
function get_robots_tester()
{
    // HTTP_HOST originates from the client request — treat it as untrusted.
    $root = (!empty($_SERVER['HTTPS']) ? 'https' : 'http') . '://' . $_SERVER['HTTP_HOST'] . '/';
    if (!class_exists('\\RobotsTxtParser')) {
        return;
    }
    $robotsTxt = file_get_contents($root . 'robots.txt');
    if ($robotsTxt === false) {
        // Fetch failed; previously `false` was silently fed to the parser.
        return;
    }
    $parser = new RobotsTxtParser($robotsTxt);
    $parser->setUserAgent('GoogleBot');
    // Evaluate the rule once instead of re-checking for every column.
    $allowed = $parser->isAllowed('/');
    $status = $allowed ? 'Enable' : 'Blocked';
    // Escape the attacker-controllable host before embedding it in HTML.
    $safeRoot = htmlspecialchars($root, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    $output = '<tr>';
    $output .= '<td>' . 'Robots Checking ' . $safeRoot . 'robots.txt' . '</td><td>' . ' Enable ' . '</td>';
    $output .= '<td  style="color:' . ($allowed ? 'green' : 'red') . ';">' . $status . '</td>';
    $output .= '<td>' . ($allowed ? 'Passed' : 'FAILED') . '</td>';
    $output .= '</tr>';
    echo $output;
}
// Example #6
 /**
  * Returns content of URL
  *
  * @param string $url Any valid URL
  * @param string $actionType "GET", "POST", any other...
  * @param bool $listenRobotsDotTxt Whether to honour the stored robots.txt rules
  * @return null|\Symfony\Component\DomCrawler\Crawler
  */
 protected function getContentOfUrl($url, $actionType = 'GET', $listenRobotsDotTxt = true)
 {
     if (!$url) {
         return null;
     }
     // Respect robots.txt when requested and rules are available.
     if ($listenRobotsDotTxt && $this->robotsTxtContent) {
         $robotsParser = new \RobotsTxtParser($this->robotsTxtContent);
         // $robotsParser->setUserAgent('VeiktDotComBot'); // ???
         if ($robotsParser->isDisallowed($url)) {
             return null;
         }
     }
     // Build the HTTP client with the class-level timeout settings.
     $curlOptions = array(
         'curl' => array(
             CURLOPT_TIMEOUT => $this::CURLOPT_TIMEOUT,
             CURLOPT_CONNECTTIMEOUT => $this::CURLOPT_CONNECTTIMEOUT,
         ),
     );
     $client = new GoutteClient();
     $client->setClient(new GuzzleClient($curlOptions));

     $crawler = $client->request($actionType, $url);

     // Normalise any falsy result to null.
     return $crawler ?: null;
 }
// Example #7
 /**
  * Tells whether the robots.txt rules for $url permit crawling it.
  *
  * @param string $url
  * @return bool
  */
 public function allowCrawl($url)
 {
     $robotsTxt = robot_parser::getRobotFile($url);

     return (new RobotsTxtParser($robotsTxt))->isAllowed($url);
 }
 /**
  * An empty robots.txt must allow every path.
  *
  * @covers RobotsTxtParser::checkRule
  * @link https://github.com/t1gor/Robots.txt-Parser-Class/issues/23
  */
 public function testEmptyRulesAllow()
 {
     $emptyParser = new RobotsTxtParser('');

     $this->assertFalse($emptyParser->isDisallowed('/foo'));
     $this->assertTrue($emptyParser->isAllowed('/foo'));
 }