/**
 * A wildcard user-agent with a blanket "Allow: /" rule must permit every path.
 *
 * @link https://github.com/t1gor/Robots.txt-Parser-Class/issues/22
 */
public function testAllowWildcard()
{
    $robotsTxt = "\n\t\t\tUser-agent: *\n\t\t\tAllow: /\n\t\t";
    $parser = new RobotsTxtParser($robotsTxt);

    // Both the root and an arbitrary path must be allowed (and not disallowed).
    foreach (array('/index', '/') as $path) {
        $this->assertFalse($parser->isDisallowed($path));
        $this->assertTrue($parser->isAllowed($path));
    }
}
/**
 * Verifies that a "Crawl-delay" directive is parsed into the rule set for the
 * matching user agent ("ahrefsbot") with the float value 1.5.
 *
 * Note: the previous @covers annotations (isDisallowed/checkRule) were
 * copy-pasted from another test; this test only exercises getRules().
 *
 * @dataProvider generateDataForTest
 * @covers RobotsTxtParser::getRules
 * @param string $robotsTxtContent robots.txt body supplied by the data provider
 */
public function testCrawlDelay($robotsTxtContent)
{
    $parser = new RobotsTxtParser($robotsTxtContent);
    $rules = $parser->getRules();

    $this->assertInstanceOf('RobotsTxtParser', $parser);
    $this->assertObjectHasAttribute('rules', $parser);
    $this->assertArrayHasKey('ahrefsbot', $rules);
    $this->assertArrayHasKey('crawl-delay', $rules['ahrefsbot']);
    $this->assertEquals(1.5, $rules['ahrefsbot']['crawl-delay']);
}
/**
 * Verifies that Yandex's "Clean-param" directive is collected under the
 * wildcard ('*') agent as an array of raw parameter strings.
 *
 * NOTE(review): the expected value 'utm_source&utm_medium&utm.campaign'
 * (with a dot in "utm.campaign") is assumed to match the data provider's
 * fixture verbatim — confirm against generateDataForTest.
 *
 * The previous @covers annotations (isDisallowed/checkRule) were copy-pasted
 * from another test; this test only exercises getRules().
 *
 * @link https://help.yandex.ru/webmaster/controlling-robot/robots-txt.xml#clean-param
 * @dataProvider generateDataForTest
 * @covers RobotsTxtParser::getRules
 * @param string      $robotsTxtContent robots.txt body supplied by the data provider
 * @param string|null $message          optional assertion failure message
 */
public function testCleanParam($robotsTxtContent, $message = null)
{
    $parser = new RobotsTxtParser($robotsTxtContent);
    $rules = $parser->getRules();

    $this->assertInstanceOf('RobotsTxtParser', $parser);
    $this->assertObjectHasAttribute('rules', $parser);
    $this->assertArrayHasKey('*', $rules);
    $this->assertArrayHasKey('clean-param', $rules['*']);
    $this->assertEquals(array('utm_source&utm_medium&utm.campaign'), $rules['*']['clean-param'], $message);
}
/**
 * Verifies that a "Host" directive is parsed into the wildcard ('*') agent's
 * rule set as the bare host name.
 *
 * Note: the previous @covers annotations (isDisallowed/checkRule) were
 * copy-pasted from another test; this test only exercises getRules().
 *
 * @dataProvider generateDataForTest
 * @covers RobotsTxtParser::getRules
 * @param string $robotsTxtContent robots.txt body supplied by the data provider
 */
public function testHost($robotsTxtContent)
{
    $parser = new RobotsTxtParser($robotsTxtContent);
    $rules = $parser->getRules();

    $this->assertInstanceOf('RobotsTxtParser', $parser);
    $this->assertObjectHasAttribute('rules', $parser);
    $this->assertArrayHasKey('*', $rules);
    $this->assertArrayHasKey('host', $rules['*']);
    $this->assertEquals('www.example.com', $rules['*']['host']);
}
/**
 * Prints one HTML table row reporting whether GoogleBot may crawl the site
 * root according to the site's own robots.txt.
 *
 * Side effects: echoes HTML; fetches <root>/robots.txt over HTTP.
 * Emits nothing when the RobotsTxtParser class is not loaded.
 */
function get_robots_tester()
{
    $scheme = !empty($_SERVER['HTTPS']) ? 'https' : 'http';
    $root = $scheme . '://' . $_SERVER['HTTP_HOST'] . '/';

    if (!class_exists('\\RobotsTxtParser')) {
        return;
    }

    // file_get_contents() returns false on failure; fall back to an empty
    // robots.txt (which allows everything) instead of passing false onward.
    $robotsTxt = file_get_contents($root . 'robots.txt');
    if ($robotsTxt === false) {
        $robotsTxt = '';
    }

    $parser = new RobotsTxtParser($robotsTxt);
    $parser->setUserAgent('GoogleBot');

    // Evaluate once instead of three separate isAllowed('/') calls.
    $allowed = $parser->isAllowed('/');
    $status = $allowed ? 'Enable' : 'Blocked';

    // HTTP_HOST is client-supplied; escape before echoing it back (XSS).
    $safeRoot = htmlspecialchars($root, ENT_QUOTES, 'UTF-8');

    $output = '<tr>';
    $output .= '<td>' . 'Robots Checking ' . $safeRoot . 'robots.txt' . '</td><td>' . ' Enable ' . '</td>';
    $output .= '<td style="color:' . ($allowed ? 'green' : 'red') . ';">' . $status . '</td>';
    $output .= '<td>' . ($allowed ? 'Passed' : 'FAILED') . '</td>';
    $output .= '</tr>';
    echo $output;
}
/**
 * Fetches a URL and returns a DOM crawler for its content.
 *
 * When $listenRobotsDotTxt is true and robots.txt content is available on
 * the instance, the URL is checked against it first and skipped if
 * disallowed.
 *
 * @param string $url Any valid URL
 * @param string $actionType HTTP verb: "GET", "POST", any other...
 * @param bool $listenRobotsDotTxt Whether to honour robots.txt rules
 * @return null|\Symfony\Component\DomCrawler\Crawler
 */
protected function getContentOfUrl($url, $actionType = 'GET', $listenRobotsDotTxt = true)
{
    if (!$url) {
        return null;
    }

    // Bail out early when robots.txt forbids this URL.
    if ($listenRobotsDotTxt && $this->robotsTxtContent) {
        $robotsParser = new \RobotsTxtParser($this->robotsTxtContent);
        // $robotsParser->setUserAgent('VeiktDotComBot'); // ???
        if ($robotsParser->isDisallowed($url)) {
            return null;
        }
    }

    $curlOptions = array(
        'curl' => array(
            CURLOPT_TIMEOUT => $this::CURLOPT_TIMEOUT,
            CURLOPT_CONNECTTIMEOUT => $this::CURLOPT_CONNECTTIMEOUT,
        ),
    );

    $client = new GoutteClient();
    $client->setClient(new GuzzleClient($curlOptions));

    $crawler = $client->request($actionType, $url);

    // Normalise any falsy result to null, as documented in the return type.
    return $crawler ?: null;
}
/**
 * Returns true when the robots.txt governing the given URL permits crawling it.
 *
 * @param string $url URL to check
 * @return bool
 */
public function allowCrawl($url)
{
    $robotsTxt = robot_parser::getRobotFile($url);
    $robotsParser = new RobotsTxtParser($robotsTxt);

    return $robotsParser->isAllowed($url);
}
/**
 * An empty robots.txt must allow everything (issue #23).
 *
 * Note: fixed the "@covder" typo, which silently disabled the coverage
 * annotation.
 *
 * @covers RobotsTxtParser::checkRule
 * @link https://github.com/t1gor/Robots.txt-Parser-Class/issues/23
 */
public function testEmptyRulesAllow()
{
    $parser = new RobotsTxtParser('');

    $this->assertTrue($parser->isAllowed('/foo'));
    $this->assertFalse($parser->isDisallowed('/foo'));
}