示例#1
0
 /**
  * Builds the fixture shared by the tests: a Spider seeded at resource A whose
  * request handler is a mock, plus URIs, Uri objects and canned responses for
  * the resources A through G.
  *
  * This method is called before a test is executed.
  *
  * @throws \RuntimeException when one of the HTML fixture files cannot be read
  */
 protected function setUp()
 {
     $this->spider = new Spider('http://php-spider.org/A');
     $this->requestHandler = $this->getMock('VDB\\Spider\\RequestHandler\\RequestHandler');

     // Plain URI strings of the seven test resources.
     $this->hrefA = 'http://php-spider.org/A';
     $this->hrefB = 'http://php-spider.org/B';
     $this->hrefC = 'http://php-spider.org/C';
     $this->hrefD = 'http://php-spider.org/D';
     $this->hrefE = 'http://php-spider.org/E';
     $this->hrefF = 'http://php-spider.org/F';
     $this->hrefG = 'http://php-spider.org/G';

     // The same URIs wrapped in Uri objects.
     $this->linkA = new Uri($this->hrefA);
     $this->linkB = new Uri($this->hrefB);
     $this->linkC = new Uri($this->hrefC);
     $this->linkD = new Uri($this->hrefD);
     $this->linkE = new Uri($this->hrefE);
     $this->linkF = new Uri($this->hrefF);
     $this->linkG = new Uri($this->hrefG);

     // One canned 200 response per resource, with the body read from the matching HTML fixture.
     $this->responseA = $this->createFixtureResponse('A');
     $this->responseB = $this->createFixtureResponse('B');
     $this->responseC = $this->createFixtureResponse('C');
     $this->responseD = $this->createFixtureResponse('D');
     $this->responseE = $this->createFixtureResponse('E');
     $this->responseF = $this->createFixtureResponse('F');
     $this->responseG = $this->createFixtureResponse('G');

     // Any number of request() calls on the mock are answered by $this->doTestRequest().
     $this->requestHandler->expects($this->any())->method('request')->will($this->returnCallback(array($this, 'doTestRequest')));
     $this->spider->setRequestHandler($this->requestHandler);

     // Discover links through every <a> element of a fetched document.
     $this->spider->addDiscoverer(new XPathExpressionDiscoverer('//a'));
 }

 /**
  * Creates a 200 response whose body is the HTML fixture of one test resource.
  *
  * @param string $letter Resource letter; selects Fixtures/SpiderTestHTMLResource<letter>.html
  *
  * @return Response
  *
  * @throws \RuntimeException when the fixture file cannot be read
  */
 private function createFixtureResponse($letter)
 {
     $path = __DIR__ . '/Fixtures/SpiderTestHTMLResource' . $letter . '.html';
     $html = file_get_contents($path);

     // file_get_contents() returns false on failure; never let that become a response body.
     if (false === $html) {
         throw new \RuntimeException('Unable to read test fixture: ' . $path);
     }

     return new Response(200, null, $html);
 }
示例#2
0
<?php

use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Spider;
require_once __DIR__ . '/../vendor/autoload.php';

// Create the spider, seeded with the URI the crawl starts from.
$spider = new Spider('http://www.dmoz.org');

// Add a URI discoverer. Without it, the spider does nothing. In this case, we want <a> tags from a certain <div>
$spider->addDiscoverer(new XPathExpressionDiscoverer("//div[@id='catalogs']//a"));

// Set some sane options for this example. In this case, we only get the first 10 items from the start page.
$spider->setMaxDepth(1);
$spider->setMaxQueueSize(10);

// Execute the crawl.
$spider->crawl();

// Report the crawl statistics.
$stats = $spider->getStatsHandler();
echo "\nSPIDER ID: " . $stats->getSpiderId();
echo "\n  ENQUEUED:  " . count($stats->getQueued());
echo "\n  SKIPPED:   " . count($stats->getFiltered());
echo "\n  FAILED:    " . count($stats->getFailed());

// Finally we could do some processing on the downloaded resources.
// In this example, we will echo the title of all resources.
echo "\n\nDOWNLOADED RESOURCES: ";
foreach ($spider->getPersistenceHandler() as $resource) {
    $titleNodes = $resource->getCrawler()->filterXpath('//title');

    // text() throws \InvalidArgumentException on an empty node list, so a single
    // page without a <title> would otherwise abort the whole report.
    echo "\n - " . (count($titleNodes) > 0 ? $titleNodes->text() : '(no title)');
}
示例#3
0
// Every class referenced below must be imported: without a `use` line an
// unqualified class name resolves to the global namespace and instantiating it
// fails with "Class not found".
use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Event\SpiderEvents;
use VDB\Spider\EventListener\PolitenessPolicyListener;
use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
use VDB\Spider\Filter\Prefetch\AllowedSchemeFilter;
use VDB\Spider\Filter\Prefetch\UriWithHashFragmentFilter;
use VDB\Spider\Filter\Prefetch\UriWithQueryStringFilter;
use VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler;
use VDB\Spider\Spider;

require_once 'example_complex_bootstrap.php';

// The URI we want to start crawling with
$seed = 'http://www.dmoz.org/Computers/Internet/';

// We want to allow all subdomains of dmoz.org
$allowSubDomains = true;

// Create spider
$spider = new Spider($seed);

// Set some sane defaults for this example. We only visit the first level of www.dmoz.org. We stop at 10 queued resources
$spider->setMaxDepth(1);
$spider->setMaxQueueSize(10);

// We add a URI discoverer. Without it, the spider wouldn't get past the seed resource.
$spider->addDiscoverer(new XPathExpressionDiscoverer("//div[@class='dir-1 borN'][2]//a"));

// Let's tell the spider to save all found resources on the filesystem
$spider->setPersistenceHandler(new FileSerializedResourcePersistenceHandler(__DIR__ . '/results'));

// This time, we set the traversal algorithm to breadth-first. The default is depth-first
$spider->setTraversalAlgorithm(Spider::ALGORITHM_BREADTH_FIRST);

// Add some prefetch filters. These are executed before a resource is requested.
// The more of these you have, the fewer HTTP requests are made and the less work is left for the processors
$spider->addPreFetchFilter(new AllowedSchemeFilter(array('http')));
$spider->addPreFetchFilter(new AllowedHostsFilter(array($seed), $allowSubDomains));
$spider->addPreFetchFilter(new UriWithHashFragmentFilter());
$spider->addPreFetchFilter(new UriWithQueryStringFilter());

// We add an event listener to the crawler that implements a politeness policy. We wait 450ms between every request to the same domain
$politenessPolicyEventListener = new PolitenessPolicyListener(450);
$spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($politenessPolicyEventListener, 'onCrawlPreRequest'));

// Let's add a CLI progress meter for fun
echo "\nCrawling";