示例#1
0
 /**
  * Runs a breadth-first crawl with the downloader limited to three downloads,
  * then compares the expected URIs (linkA, linkB, linkC) against the
  * downloader's persistence handler via compareUriArray().
  *
  * NOTE(review): despite "MaxQueueSize" in the method name, the limit set
  * here is the downloader's download limit — confirm the name is intended.
  *
  * @covers VDB\Spider\Spider
  */
 public function testCrawlBFSMaxQueueSize()
 {
     $spider = $this->spider;

     // Configure traversal order and depth before the crawl starts.
     $spider->getQueueManager()->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
     // presumably large enough that depth is not the limiting factor — TODO confirm
     $spider->getDiscovererSet()->maxDepth = 1000;

     // The only restrictive setting in this test: a download limit of 3.
     $spider->getDownloader()->setDownloadLimit(3);

     $spider->crawl();

     $this->compareUriArray(
         array($this->linkA, $this->linkB, $this->linkC),
         $spider->getDownloader()->getPersistenceHandler()
     );
 }
示例#2
0
<?php

use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use Symfony\Component\EventDispatcher\Event;
use VDB\Spider\Event\SpiderEvents;
use VDB\Spider\StatsHandler;
use VDB\Spider\Spider;
require_once __DIR__ . '/../vendor/autoload.php';
// Build the spider, seeded with the URI the crawl starts from.
$spider = new Spider('http://www.dmoz.org');

// Register a URI discoverer; without one the spider does nothing.
// This one selects <a> elements nested inside the <div> whose id is "catalogs".
$discovererSet = $spider->getDiscovererSet();
$discovererSet->set(new XPathExpressionDiscoverer("//div[@id='catalogs']//a"));

// Keep the example small: discovery depth of 1 and a queue size capped at 10.
$discovererSet->maxDepth = 1;
$queueManager = $spider->getQueueManager();
$queueManager->maxQueueSize = 10;

// Allow the script to be stopped: when the SPIDER_CRAWL_USER_STOPPED event is
// dispatched, print a notice and terminate.
$spiderDispatcher = $spider->getDispatcher();
$onUserStopped = function (Event $event) {
    echo "\nCrawl aborted by user.\n";
    exit;
};
$spiderDispatcher->addListener(SpiderEvents::SPIDER_CRAWL_USER_STOPPED, $onUserStopped);

// Subscribe a single StatsHandler to both the QueueManager's dispatcher and the
// Spider's dispatcher so it can collect stats from each.
// There are more components that dispatch events you can use.
$statsHandler = new StatsHandler();
$queueManager->getDispatcher()->addSubscriber($statsHandler);
$spiderDispatcher->addSubscriber($statsHandler);

// Run the crawl.
$spider->crawl();

// Print a short report: one line per category with the number of collected entries.
$queuedCount = count($statsHandler->getQueued());
echo "\n  ENQUEUED:  " . $queuedCount;

$filteredCount = count($statsHandler->getFiltered());
echo "\n  SKIPPED:   " . $filteredCount;

$failedCount = count($statsHandler->getFailed());
echo "\n  FAILED:    " . $failedCount;