/**
 * Sets up the fixture, for example, opens a network connection.
 * This method is called before a test is executed.
 *
 * Builds a seven-node link graph (A..G): for every node there is an
 * hrefX string, a linkX Uri and a responseX loaded from an HTML fixture
 * file, plus a mocked request handler that routes through doTestRequest().
 */
protected function setUp()
{
    $this->spider = new Spider('http://php-spider.org/A');
    $this->requestHandler = $this->getMock('VDB\\Spider\\RequestHandler\\RequestHandler');

    // One pass per graph node instead of seven copy-pasted stanzas:
    // fills $this->hrefA..G, $this->linkA..G and $this->responseA..G.
    foreach (range('A', 'G') as $node) {
        $href = 'http://php-spider.org/' . $node;
        $this->{'href' . $node} = $href;
        $this->{'link' . $node} = new Uri($href);

        $html = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResource' . $node . '.html');
        $this->{'response' . $node} = new Response(200, null, $html);
    }

    // Every request() call on the mock is answered by our doTestRequest()
    // callback, which serves the canned responses built above.
    $this->requestHandler
        ->expects($this->any())
        ->method('request')
        ->will($this->returnCallback(array($this, 'doTestRequest')));
    $this->spider->setRequestHandler($this->requestHandler);

    $this->spider->addDiscoverer(new XPathExpressionDiscoverer('//a'));
}
<?php

use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Spider;

require_once __DIR__ . '/../vendor/autoload.php';

// Create Spider
$spider = new Spider('http://www.dmoz.org');

// Add a URI discoverer. Without it, the spider does nothing. In this case, we want <a> tags from a certain <div>
$spider->addDiscoverer(new XPathExpressionDiscoverer("//div[@id='catalogs']//a"));

// Set some sane options for this example. In this case, we only get the first 10 items from the start page.
$spider->setMaxDepth(1);
$spider->setMaxQueueSize(10);

// Execute crawl
$spider->crawl();

// Report what happened, straight from the spider's stats handler
$statsHandler = $spider->getStatsHandler();
echo "\nSPIDER ID: " . $statsHandler->getSpiderId();
echo "\n ENQUEUED: " . count($statsHandler->getQueued());
echo "\n SKIPPED:  " . count($statsHandler->getFiltered());
echo "\n FAILED:   " . count($statsHandler->getFailed());

// Finally we could do some processing on the downloaded resources
// In this example, we will echo the title of all resources
echo "\n\nDOWNLOADED RESOURCES: ";
foreach ($spider->getPersistenceHandler() as $downloadedResource) {
    echo "\n - " . $downloadedResource->getCrawler()->filterXpath('//title')->text();
}
// NOTE(review): added the four previously missing imports below
// (XPathExpressionDiscoverer, SpiderEvents, PolitenessPolicyListener,
// AllowedHostsFilter). `use` statements are per-file in PHP, so the
// bootstrap require cannot supply them; without these the unqualified
// class names resolve to the global namespace and the script fatals.
use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Event\SpiderEvents;
use VDB\Spider\EventListener\PolitenessPolicyListener;
use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
use VDB\Spider\Filter\Prefetch\AllowedSchemeFilter;
use VDB\Spider\Filter\Prefetch\UriWithHashFragmentFilter;
use VDB\Spider\Filter\Prefetch\UriWithQueryStringFilter;
use VDB\Spider\Spider;

require_once 'example_complex_bootstrap.php';

// The URI we want to start crawling with
$seed = 'http://www.dmoz.org/Computers/Internet/';

// We want to allow all subdomains of dmoz.org
$allowSubDomains = true;

// Create spider
$spider = new Spider($seed);

// Set some sane defaults for this example. We only visit the first level of www.dmoz.org. We stop at 10 queued resources
$spider->setMaxDepth(1);
$spider->setMaxQueueSize(10);

// We add an URI discoverer. Without it, the spider wouldn't get past the seed resource.
$spider->addDiscoverer(new XPathExpressionDiscoverer("//div[@class='dir-1 borN'][2]//a"));

// Let's tell the spider to save all found resources on the filesystem
$spider->setPersistenceHandler(
    new \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler(__DIR__ . '/results')
);

// This time, we set the traversal algorithm to breadth-first. The default is depth-first
$spider->setTraversalAlgorithm(Spider::ALGORITHM_BREADTH_FIRST);

// Add some prefetch filters. These are executed before a resource is requested.
// The more you have of these, the less HTTP requests and work for the processors
$spider->addPreFetchFilter(new AllowedSchemeFilter(array('http')));
$spider->addPreFetchFilter(new AllowedHostsFilter(array($seed), $allowSubDomains));
$spider->addPreFetchFilter(new UriWithHashFragmentFilter());
$spider->addPreFetchFilter(new UriWithQueryStringFilter());

// We add an eventlistener to the crawler that implements a politeness policy. We wait 450ms between every request to the same domain
$politenessPolicyEventListener = new PolitenessPolicyListener(450);
$spider->getDispatcher()->addListener(
    SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
    array($politenessPolicyEventListener, 'onCrawlPreRequest')
);

// Let's add a CLI progress meter for fun
echo "\nCrawling";