/**
 * Breadth-first traversal must honour the queue-size cap: with a max
 * queue size of 3, only the first three discovered URIs (hrefA, hrefB,
 * hrefC) are enqueued, in BFS discovery order.
 */
public function testCrawlBFSMaxQueueSize()
{
    $this->spider->setTraversalAlgorithm(Spider::ALGORITHM_BREADTH_FIRST);
    $this->spider->setMaxDepth(1000);
    $this->spider->setMaxQueueSize(3);

    $this->spider->crawl();

    // The queue must contain exactly the first three URIs, in order.
    $expectedUris = array($this->hrefA, $this->hrefB, $this->hrefC);
    $statsHandler = $this->spider->getStatsHandler();
    foreach ($statsHandler->getQueued() as $position => $queuedUri) {
        $this->assertEquals($expectedUris[$position], $queuedUri->toString());
    }
}
require_once 'example_complex_bootstrap.php';

// Seed URI: crawling starts here.
$seed = 'http://www.dmoz.org/Computers/Internet/';

// Permit every subdomain of dmoz.org, not just the exact host.
$allowSubDomains = true;

// Build the spider around the seed.
$spider = new Spider($seed);

// Sane defaults for this example: only the first level below the seed,
// and stop enqueueing after 10 resources.
$spider->setMaxDepth(1);
$spider->setMaxQueueSize(10);

// A URI discoverer is required — without one the spider never gets
// past the seed resource.
$spider->addDiscoverer(new XPathExpressionDiscoverer("//div[@class='dir-1 borN'][2]//a"));

// Persist every fetched resource to the filesystem.
$spider->setPersistenceHandler(
    new \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler(__DIR__ . '/results')
);

// Use breadth-first traversal this time (depth-first is the default).
$spider->setTraversalAlgorithm(Spider::ALGORITHM_BREADTH_FIRST);

// Prefetch filters run before a resource is requested: the more of
// these, the fewer HTTP requests and the less work for the processors.
$spider->addPreFetchFilter(new AllowedSchemeFilter(array('http')));
$spider->addPreFetchFilter(new AllowedHostsFilter(array($seed), $allowSubDomains));
$spider->addPreFetchFilter(new UriWithHashFragmentFilter());
$spider->addPreFetchFilter(new UriWithQueryStringFilter());

// We add an eventlistener to the crawler that implements a politeness policy.
// We wait 450ms between every request to the same domain (politeness
// policy, enforced before each request is sent).
// NOTE(review): this prose was previously outside any comment marker,
// which is a PHP parse error — it is now commented out.
$politenessPolicyEventListener = new PolitenessPolicyListener(450);
$spider->getDispatcher()->addListener(
    SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
    array($politenessPolicyEventListener, 'onCrawlPreRequest')
);

// Let's add a CLI progress meter for fun: print a dot per enqueued URI.
echo "\nCrawling";
$spider->getDispatcher()->addListener(
    SpiderEvents::SPIDER_CRAWL_PRE_ENQUEUE,
    function (Event $event) {
        echo '.';
    }
);

//// Set up some caching, logging and profiling on the HTTP client of the spider