Example #1
 /**
  * @param Spider $spider
  * @param Resource $document
  * @return UriInterface[]
  */
 public function discover(Spider $spider, Resource $document)
 {
     $crawler = $document->getCrawler()->filter($this->cssSelector);
     $uris = array();
     foreach ($crawler as $node) {
         try {
             $uris[] = new Uri($node->getAttribute('href'), $document->getUri()->toString());
         } catch (UriSyntaxException $e) {
             $spider->getStatsHandler()->addToFailed($node->getAttribute('href'), 'Invalid URI: ' . $e->getMessage());
         }
     }
     return $uris;
 }
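This discover() method is the body of a custom discoverer. A minimal sketch of how it could be wrapped in a class and registered with a spider; the class name and constructor are illustrative (not part of the library), and the older Spider::addDiscoverer() API from Examples #3 and #6 is assumed.

<?php
// Sketch only: CssSelectorDiscoverer is an assumed, illustrative class name.
use VDB\Spider\Spider;

class CssSelectorDiscoverer // would implement the library's discoverer interface
{
    /** @var string CSS selector used to locate link nodes */
    private $cssSelector;

    public function __construct($cssSelector)
    {
        $this->cssSelector = $cssSelector;
    }

    // the discover() method from Example #1 goes here unchanged
}

// Register the discoverer so every downloaded resource is scanned for new URIs
$spider = new Spider('http://www.dmoz.org');
$spider->addDiscoverer(new CssSelectorDiscoverer('div#catalogs a'));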
Example #2
 /**
  * @covers VDB\Spider\Spider::crawl
  */
 public function testCrawlFailedRequest()
 {
     $this->requestHandler->expects($this->any())->method('request')->will($this->throwException(new Exception('Failed mock request!')));
     $this->spider->crawl();
     $stats = $this->spider->getStatsHandler();
     $this->assertCount(0, $stats->getFiltered(), 'Filtered count');
     $this->assertCount(0, $stats->getQueued(), 'Queued count');
     $this->assertCount(1, $stats->getFailed(), 'Failed count');
 }
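This test relies on a fixture in which the request handler is a mock injected into the spider. A hypothetical setUp() sketch follows; the mocked interface name and the setter used to inject it are assumptions and may differ between php-spider versions.

 protected function setUp()
 {
     $this->spider = new Spider('http://example.com');

     // Mock the request handler so no real HTTP request is made
     // (assumption: this interface name; older versions expose a different one)
     $this->requestHandler = $this->getMockBuilder('VDB\Spider\RequestHandler\RequestHandlerInterface')
         ->getMock();

     // Assumption: this version of the spider accepts the handler via a setter
     $this->spider->setRequestHandler($this->requestHandler);
 }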
Example #3
<?php

use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Spider;
require_once __DIR__ . '/../vendor/autoload.php';
// Create Spider
$spider = new Spider('http://www.dmoz.org');
// Add a URI discoverer. Without it, the spider does nothing. In this case, we want <a> tags from a certain <div>
$spider->addDiscoverer(new XPathExpressionDiscoverer("//div[@id='catalogs']//a"));
// Set some sane options for this example. In this case, we only get the first 10 items from the start page.
$spider->setMaxDepth(1);
$spider->setMaxQueueSize(10);
// Execute crawl
$spider->crawl();
// Report
$stats = $spider->getStatsHandler();
echo "\nSPIDER ID: " . $stats->getSpiderId();
echo "\n  ENQUEUED:  " . count($stats->getQueued());
echo "\n  SKIPPED:   " . count($stats->getFiltered());
echo "\n  FAILED:    " . count($stats->getFailed());
// Finally we could do some processing on the downloaded resources
// In this example, we will echo the title of all resources
echo "\n\nDOWNLOADED RESOURCES: ";
foreach ($spider->getPersistenceHandler() as $resource) {
    echo "\n - " . $resource->getCrawler()->filterXpath('//title')->text();
}
Example #4
use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
use VDB\Spider\Filter\Prefetch\AllowedSchemeFilter;
use VDB\Spider\Filter\Prefetch\UriWithHashFragmentFilter;
use VDB\Spider\Filter\Prefetch\UriWithQueryStringFilter;
use VDB\Spider\QueueManager\InMemoryQueueManager;
use VDB\Spider\Spider;
use VDB\Spider\StatsHandler;
use VDB\Spider\LogHandler;
use GuzzleHttp\Middleware;
require_once 'example_complex_bootstrap.php';
// The URI we want to start crawling with
$seed = 'http://www.dmoz.org/Computers/Internet/';
// We want to allow all subdomains of dmoz.org
$allowSubDomains = true;
// Create spider
$spider = new Spider($seed);
$spider->getDownloader()->setDownloadLimit(10);
$statsHandler = new StatsHandler();
$LogHandler = new LogHandler();
$queueManager = new InMemoryQueueManager();
$queueManager->getDispatcher()->addSubscriber($statsHandler);
$queueManager->getDispatcher()->addSubscriber($LogHandler);
// Set some sane defaults for this example. We only visit the first level of www.dmoz.org. We stop at 10 queued resources
$spider->getDiscovererSet()->maxDepth = 1;
// This time, we set the traversal algorithm to breadth-first. The default is depth-first
$queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
$spider->setQueueManager($queueManager);
// We add a URI discoverer. Without it, the spider wouldn't get past the seed resource.
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//div[@class='dir-1 borN'][2]//a"));
// Let's tell the spider to save all found resources on the filesystem
$spider->getDownloader()->setPersistenceHandler(new \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler(__DIR__ . '/results'));
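Example #4 imports several prefetch filters but the snippet stops before they are attached or the crawl is run. Below is a sketch of a likely continuation (not the original file's ending); the addFilter() and reporting calls mirror ones that appear in the later examples on this page.

// Attach the imported prefetch filters so unwanted URIs are never requested
$spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter(array('http')));
$spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($seed), $allowSubDomains));
$spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter());
$spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter());

// Subscribe the stats handler to the spider as well, then crawl and report
$spider->getDispatcher()->addSubscriber($statsHandler);
$spider->crawl();
echo "\n  ENQUEUED:  " . count($statsHandler->getQueued());
echo "\n  SKIPPED:   " . count($statsHandler->getFiltered());
echo "\n  FAILED:    " . count($statsHandler->getFailed());
echo "\n  PERSISTED: " . count($statsHandler->getPersisted());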
Example #5
 /**
  * @covers VDB\Spider\Spider
  */
 public function testCrawlFailedRequest()
 {
     $this->requestHandler->expects($this->any())->method('request')->will($this->throwException(new Exception('Failed mock request!')));
     $this->spider->crawl();
     $this->assertCount(0, $this->spider->getDownloader()->getPersistenceHandler(), 'Persisted count');
 }
Example #6
use Symfony\Component\EventDispatcher\Event;
use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Event\SpiderEvents;
use VDB\Spider\EventListener\PolitenessPolicyListener;
use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
use VDB\Spider\Filter\Prefetch\AllowedSchemeFilter;
use VDB\Spider\Filter\Prefetch\UriWithHashFragmentFilter;
use VDB\Spider\Filter\Prefetch\UriWithQueryStringFilter;
use VDB\Spider\Spider;
require_once 'example_complex_bootstrap.php';
// The URI we want to start crawling with
$seed = 'http://www.dmoz.org/Computers/Internet/';
// We want to allow all subdomains of dmoz.org
$allowSubDomains = true;
// Create spider
$spider = new Spider($seed);
// Set some sane defaults for this example. We only visit the first level of www.dmoz.org. We stop at 10 queued resources
$spider->setMaxDepth(1);
$spider->setMaxQueueSize(10);
// We add a URI discoverer. Without it, the spider wouldn't get past the seed resource.
$spider->addDiscoverer(new XPathExpressionDiscoverer("//div[@class='dir-1 borN'][2]//a"));
// Let's tell the spider to save all found resources on the filesystem
$spider->setPersistenceHandler(new \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler(__DIR__ . '/results'));
// This time, we set the traversal algorithm to breadth-first. The default is depth-first
$spider->setTraversalAlgorithm(Spider::ALGORITHM_BREADTH_FIRST);
// Add some prefetch filters. These are executed before a resource is requested.
// The more you have of these, the less HTTP requests and work for the processors
$spider->addPreFetchFilter(new AllowedSchemeFilter(array('http')));
$spider->addPreFetchFilter(new AllowedHostsFilter(array($seed), $allowSubDomains));
$spider->addPreFetchFilter(new UriWithHashFragmentFilter());
$spider->addPreFetchFilter(new UriWithQueryStringFilter());
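Example #6 also stops before the crawl. The following sketch shows how it would likely finish, reusing the old-style reporting calls from Example #3; the continuation itself is an assumption.

// Execute the crawl
$spider->crawl();

// Report using the spider's built-in stats handler, as in Example #3
$stats = $spider->getStatsHandler();
echo "\n  ENQUEUED:  " . count($stats->getQueued());
echo "\n  SKIPPED:   " . count($stats->getFiltered());
echo "\n  FAILED:    " . count($stats->getFailed());

// The persisted resources can then be processed, e.g. printing each title
foreach ($spider->getPersistenceHandler() as $resource) {
    echo "\n - " . $resource->getCrawler()->filterXpath('//title')->text();
}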
Example #7
<?php

use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use Symfony\Component\EventDispatcher\Event;
use VDB\Spider\Event\SpiderEvents;
use VDB\Spider\StatsHandler;
use VDB\Spider\Spider;
require_once __DIR__ . '/../vendor/autoload.php';
// Create Spider
$spider = new Spider('http://www.dmoz.org');
// Add a URI discoverer. Without it, the spider does nothing. In this case, we want <a> tags from a certain <div>
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//div[@id='catalogs']//a"));
// Set some sane options for this example. In this case, we only get the first 10 items from the start page.
$spider->getDiscovererSet()->maxDepth = 1;
$spider->getQueueManager()->maxQueueSize = 10;
// Let's add something to enable us to stop the script
$spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_USER_STOPPED, function (Event $event) {
    echo "\nCrawl aborted by user.\n";
    exit;
});
// Add a listener to both the Spider and the QueueManager to collect stats.
// Other components dispatch events you can subscribe to as well.
$statsHandler = new StatsHandler();
$spider->getQueueManager()->getDispatcher()->addSubscriber($statsHandler);
$spider->getDispatcher()->addSubscriber($statsHandler);
// Execute crawl
$spider->crawl();
// Build a report
echo "\n  ENQUEUED:  " . count($statsHandler->getQueued());
echo "\n  SKIPPED:   " . count($statsHandler->getFiltered());
echo "\n  FAILED:    " . count($statsHandler->getFailed());
Example #8
 /**
  * Start parsing URLs!
  */
 public function startParser()
 {
     $start = microtime(true); // float timestamp, needed for the elapsed-time calculation below
     $spider = new Spider($this->seed);
     if ($this->downloadLimit > 0) {
         $spider->getDownloader()->setDownloadLimit($this->downloadLimit);
     }
     $statsHandler = new StatsHandler();
     $LogHandler = new Logger(\Pimcore::inDebugMode());
     $queueManager = new InMemoryQueueManager();
     $queueManager->getDispatcher()->addSubscriber($statsHandler);
     $queueManager->getDispatcher()->addSubscriber($LogHandler);
     $spider->getDiscovererSet()->maxDepth = $this->maxLinkDepth;
     $queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
     $spider->setQueueManager($queueManager);
     $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//link[@hreflang]|//a[not(@rel='nofollow')]"));
     $spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter($this->allowedSchemes));
     $spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($this->seed), $this->allowSubDomains));
     $spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter());
     $spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter());
     $spider->getDiscovererSet()->addFilter(new UriFilter($this->invalidLinkRegexes));
     $spider->getDiscovererSet()->addFilter(new NegativeUriFilter($this->validLinkRegexes));
     $politenessPolicyEventListener = new PolitenessPolicyListener(20);
     //CHANGE TO 100 !!!!
     $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($politenessPolicyEventListener, 'onCrawlPreRequest'));
     $spider->getDispatcher()->addSubscriber($statsHandler);
     $spider->getDispatcher()->addSubscriber($LogHandler);
     $abortListener = new Listener\Abort($spider);
     $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($abortListener, 'checkCrawlerState'));
     $spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_USER_STOPPED, array($abortListener, 'stopCrawler'));
     if ($this->useAuth) {
         $authListener = new Listener\Auth($this->authUserName, $this->authPassword);
         $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($authListener, 'setAuth'));
     }
     $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, function (Event $event) {
         //echo 'crawling: ' . $event->getArgument('uri')->toString() . "\n";
     });
     // Execute the crawl
     $spider->crawl();
     \Pimcore\Logger::debug("SPIDER ID: " . $statsHandler->getSpiderId());
     \Pimcore\Logger::debug("SPIDER ID: " . $statsHandler->getSpiderId());
     \Pimcore\Logger::debug("ENQUEUED:  " . count($statsHandler->getQueued()));
     \Pimcore\Logger::debug("SKIPPED:   " . count($statsHandler->getFiltered()));
     \Pimcore\Logger::debug("FAILED:    " . count($statsHandler->getFailed()));
     \Pimcore\Logger::debug("PERSISTED: " . count($statsHandler->getPersisted()));
     $peakMem = round(memory_get_peak_usage(TRUE) / 1024 / 1024, 2);
     $totalTime = round(microtime(TRUE) - $start, 2);
     $totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2);
     \Pimcore\Logger::debug("PEAK MEM USAGE:       " . $peakMem . 'MB');
     \Pimcore\Logger::debug("TOTAL TIME:           " . $totalTime . 's');
     \Pimcore\Logger::debug("POLITENESS WAIT TIME: " . $totalDelay . 's');
     $downloaded = $spider->getDownloader()->getPersistenceHandler();
     //parse all resources!
     foreach ($downloaded as $resource) {
         $this->parseResponse($resource);
     }
 }