/**
 * Breadth-first crawl stops at the downloader's limit, not the depth cap.
 *
 * @covers VDB\Spider\Spider
 */
public function testCrawlBFSMaxQueueSize()
{
    // Traverse breadth-first with an effectively unlimited depth so the
    // download limit below is the only thing that ends the crawl.
    $queueManager = $this->spider->getQueueManager();
    $queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);

    $this->spider->getDiscovererSet()->maxDepth = 1000;

    $downloader = $this->spider->getDownloader();
    $downloader->setDownloadLimit(3);

    $this->spider->crawl();

    // With a limit of 3, the persisted resources should be exactly these
    // three fixture links (in BFS discovery order).
    $expectedUris = [$this->linkA, $this->linkB, $this->linkC];
    $this->compareUriArray($expectedUris, $downloader->getPersistenceHandler());
}
<?php

use Symfony\Component\EventDispatcher\Event;
use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Event\SpiderEvents;
use VDB\Spider\Spider;
use VDB\Spider\StatsHandler;

require_once __DIR__ . '/../vendor/autoload.php';

// Build the spider with its seed URI.
$spider = new Spider('http://www.dmoz.org');

// Register a URI discoverer — without one the spider finds nothing.
// This one collects <a> tags from a specific <div> on each page.
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//div[@id='catalogs']//a"));

// Keep this example small: follow links one level deep from the start
// page, and cap the queue at 10 URIs.
$spider->getDiscovererSet()->maxDepth = 1;
$spider->getQueueManager()->maxQueueSize = 10;

// Give the user a way to stop the crawl; the listener exits the script.
$spider->getDispatcher()->addListener(
    SpiderEvents::SPIDER_CRAWL_USER_STOPPED,
    function (Event $event) {
        echo "\nCrawl aborted by user.\n";
        exit;
    }
);

// Collect stats from both the spider and the queue manager.
// Other components dispatch events that could be subscribed to as well.
$stats = new StatsHandler();
$spider->getQueueManager()->getDispatcher()->addSubscriber($stats);
$spider->getDispatcher()->addSubscriber($stats);

// Run the crawl.
$spider->crawl();

// Print a short summary of what happened.
echo "\n ENQUEUED: " . count($stats->getQueued());
echo "\n SKIPPED: " . count($stats->getFiltered());
echo "\n FAILED: " . count($stats->getFailed());