Example #1
 /**
  * @covers VDB\Spider\Spider::crawl
  */
 public function testCrawlFailedRequest()
 {
     // Make every request throw so the spider records exactly one failure
     $this->requestHandler
         ->expects($this->any())
         ->method('request')
         ->will($this->throwException(new Exception('Failed mock request!')));

     $this->spider->crawl();

     $stats = $this->spider->getStatsHandler();
     $this->assertCount(0, $stats->getFiltered(), 'Filtered count');
     $this->assertCount(0, $stats->getQueued(), 'Queued count');
     $this->assertCount(1, $stats->getFailed(), 'Failed count');
 }
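This test relies on fixtures built elsewhere in the test case. A minimal sketch of the setUp() it assumes, using the older php-spider API in which the Spider itself exposes the request handler (the interface name and the setter are assumptions, not confirmed by this snippet):

 use VDB\Spider\RequestHandler\RequestHandlerInterface;

 protected function setUp(): void
 {
     $this->spider = new Spider('http://example.com');
     // Mock the request handler so the test never performs real HTTP traffic
     $this->requestHandler = $this->createMock(RequestHandlerInterface::class);
     $this->spider->setRequestHandler($this->requestHandler);
 }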
Example #2
<?php

use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Spider;
require_once __DIR__ . '/../vendor/autoload.php';
// Create Spider
$spider = new Spider('http://www.dmoz.org');
// Add a URI discoverer. Without it, the spider does nothing. In this case, we want <a> tags from a certain <div>
$spider->addDiscoverer(new XPathExpressionDiscoverer("//div[@id='catalogs']//a"));
// Set some sane options for this example. In this case, we only get the first 10 items from the start page.
$spider->setMaxDepth(1);
$spider->setMaxQueueSize(10);
// Execute crawl
$spider->crawl();
// Report
$stats = $spider->getStatsHandler();
echo "\nSPIDER ID: " . $stats->getSpiderId();
echo "\n  ENQUEUED:  " . count($stats->getQueued());
echo "\n  SKIPPED:   " . count($stats->getFiltered());
echo "\n  FAILED:    " . count($stats->getFailed());
// Finally we could do some processing on the downloaded resources
// In this example, we will echo the title of all resources
echo "\n\nDOWNLOADED RESOURCES: ";
foreach ($spider->getPersistenceHandler() as $resource) {
    echo "\n - " . $resource->getCrawler()->filterXpath('//title')->text();
}
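By default the spider keeps downloaded resources in memory. A sketch of persisting them to disk instead, assuming a php-spider release that ships FileSerializedResourcePersistenceHandler and the getDownloader() accessor:

use VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler;

// Write each downloaded resource to disk so large crawls don't exhaust memory
$spider->getDownloader()->setPersistenceHandler(
    new FileSerializedResourcePersistenceHandler(sys_get_temp_dir())
);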
Example #3
 /**
  * @covers VDB\Spider\Spider
  */
 public function testCrawlFailedRequest()
 {
     // Make every request throw; a failed resource must never be persisted
     $this->requestHandler
         ->expects($this->any())
         ->method('request')
         ->will($this->throwException(new Exception('Failed mock request!')));

     $this->spider->crawl();

     $this->assertCount(0, $this->spider->getDownloader()->getPersistenceHandler(), 'Persisted count');
 }
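This variant uses the newer API, where counts come from a StatsHandler subscribed to the relevant dispatchers rather than from the spider itself. A sketch of asserting the failure through stats as well (the wiring mirrors Example #4 below; treat it as an assumption):

 $statsHandler = new \VDB\Spider\StatsHandler();
 $this->spider->getDispatcher()->addSubscriber($statsHandler);
 $this->spider->getDownloader()->getDispatcher()->addSubscriber($statsHandler);

 $this->spider->crawl();

 // The mocked request failure should surface in the stats
 $this->assertCount(1, $statsHandler->getFailed(), 'Failed count');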
Example #4
// Let's add something to enable us to stop the script
$spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_USER_STOPPED, function (Event $event) {
    echo "\nCrawl aborted by user.\n";
    exit;
});
// Let's add a CLI progress meter for fun
echo "\nCrawling";
$spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, function (Event $event) {
    echo '.';
});
// Set up profiling on the HTTP client of the spider
// (a stand-in for the $timerMiddleware used here is sketched after this snippet)
$guzzleClient = $spider->getDownloader()->getRequestHandler()->getClient();
$tapMiddleware = Middleware::tap([$timerMiddleware, 'onRequest'], [$timerMiddleware, 'onResponse']);
$guzzleClient->getConfig('handler')->push($tapMiddleware, 'timer');
// Execute the crawl
$spider->crawl();
// Report (the $statsHandler subscriber is wired up earlier in the full example)
echo "\n\nSPIDER ID: " . $statsHandler->getSpiderId();
echo "\n  ENQUEUED:  " . count($statsHandler->getQueued());
echo "\n  SKIPPED:   " . count($statsHandler->getFiltered());
echo "\n  FAILED:    " . count($statsHandler->getFailed());
echo "\n  PERSISTED: " . count($statsHandler->getPersisted());
// With the information from some of the plugins and listeners, we can derive a few metrics
$peakMem = round(memory_get_peak_usage(true) / 1024 / 1024, 2);
$totalTime = round(microtime(true) - $start, 2);
$totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2);
echo "\n\nMETRICS:";
echo "\n  PEAK MEM USAGE:       " . $peakMem . 'MB';
echo "\n  TOTAL TIME:           " . $totalTime . 's';
echo "\n  REQUEST TIME:         " . $timerMiddleware->getTotal() . 's';
echo "\n  POLITENESS WAIT TIME: " . $totalDelay . 's';
 /**
  * Start parsing URLs.
  */
 public function startParser()
 {
     $start = microtime(true); // pass true to get a float, so the elapsed-time subtraction below works
     $spider = new Spider($this->seed);
     if ($this->downloadLimit > 0) {
         $spider->getDownloader()->setDownloadLimit($this->downloadLimit);
     }
     $statsHandler = new StatsHandler();
     $logHandler = new Logger(\Pimcore::inDebugMode());
     $queueManager = new InMemoryQueueManager();
     $queueManager->getDispatcher()->addSubscriber($statsHandler);
     $queueManager->getDispatcher()->addSubscriber($logHandler);
     $spider->getDiscovererSet()->maxDepth = $this->maxLinkDepth;
     $queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
     $spider->setQueueManager($queueManager);
     $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//link[@hreflang]|//a[not(@rel='nofollow')]"));
     $spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter($this->allowedSchemes));
     $spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($this->seed), $this->allowSubDomains));
     $spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter());
     $spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter());
     $spider->getDiscovererSet()->addFilter(new UriFilter($this->invalidLinkRegexes));
     $spider->getDiscovererSet()->addFilter(new NegativeUriFilter($this->validLinkRegexes));
     $politenessPolicyEventListener = new PolitenessPolicyListener(20); // minimum delay between requests in milliseconds; consider a higher value (e.g. 100) for production crawls
     $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($politenessPolicyEventListener, 'onCrawlPreRequest'));
     $spider->getDispatcher()->addSubscriber($statsHandler);
     $spider->getDispatcher()->addSubscriber($logHandler);
     $abortListener = new Listener\Abort($spider);
     $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($abortListener, 'checkCrawlerState'));
     $spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_USER_STOPPED, array($abortListener, 'stopCrawler'));
     if ($this->useAuth) {
         $authListener = new Listener\Auth($this->authUserName, $this->authPassword);
         $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($authListener, 'setAuth'));
     }
     $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, function (Event $event) {
         // Progress hook: uncomment to log every crawled URI
         //echo 'crawling: ' . $event->getArgument('uri')->toString() . "\n";
     });
     // Execute the crawl
     $spider->crawl();
     \Pimcore\Logger::debug("SPIDER ID: " . $statsHandler->getSpiderId());
     \Pimcore\Logger::debug("SPIDER ID: " . $statsHandler->getSpiderId());
     \Pimcore\Logger::debug("ENQUEUED:  " . count($statsHandler->getQueued()));
     \Pimcore\Logger::debug("SKIPPED:   " . count($statsHandler->getFiltered()));
     \Pimcore\Logger::debug("FAILED:    " . count($statsHandler->getFailed()));
     \Pimcore\Logger::debug("PERSISTED: " . count($statsHandler->getPersisted()));
     $peakMem = round(memory_get_peak_usage(true) / 1024 / 1024, 2);
     $totalTime = round(microtime(true) - $start, 2);
     $totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2);
     \Pimcore\Logger::debug("PEAK MEM USAGE:       " . $peakMem . 'MB');
     \Pimcore\Logger::debug("TOTAL TIME:           " . $totalTime . 's');
     \Pimcore\Logger::debug("POLITENESS WAIT TIME: " . $totalDelay . 's');
     $downloaded = $spider->getDownloader()->getPersistenceHandler();
     // Parse all downloaded resources
     foreach ($downloaded as $resource) {
         $this->parseResponse($resource);
     }
 }
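The chain above uses only built-in filters. A minimal custom pre-fetch filter (a sketch; it assumes the PreFetchFilterInterface contract used by the filters above, where match() returning true means the URI is skipped):

// Hypothetical filter that skips links to PDF files
class PdfUriFilter implements \VDB\Spider\Filter\PreFetchFilterInterface
{
    public function match($uri)
    {
        // $uri is the discovered URI object handed to every pre-fetch filter
        return substr($uri->getPath(), -4) === '.pdf';
    }
}

// Registered exactly like the built-in filters:
$spider->getDiscovererSet()->addFilter(new PdfUriFilter());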