/**
 * Verifies that a request handler that throws on every request results in
 * one failed URI and nothing queued or filtered in the stats handler.
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlFailedRequest()
{
    // Make every request the spider performs blow up with an exception.
    $this->requestHandler
        ->expects($this->any())
        ->method('request')
        ->will($this->throwException(new Exception('Failed mock request!')));

    $this->spider->crawl();

    // The single seed request failed, so exactly one failure is recorded.
    $statsHandler = $this->spider->getStatsHandler();
    $this->assertCount(0, $statsHandler->getFiltered(), 'Filtered count');
    $this->assertCount(0, $statsHandler->getQueued(), 'Queued count');
    $this->assertCount(1, $statsHandler->getFailed(), 'Failed count');
}
<?php

use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Spider;

require_once __DIR__ . '/../vendor/autoload.php';

// Build the spider with its seed URI.
$spider = new Spider('http://www.dmoz.org');

// A discoverer is mandatory: it extracts follow-up URIs from each page.
// Here we take the <a> tags inside the div with id "catalogs".
$spider->addDiscoverer(new XPathExpressionDiscoverer("//div[@id='catalogs']//a"));

// Keep the example small: one level deep, at most 10 queued URIs.
$spider->setMaxDepth(1);
$spider->setMaxQueueSize(10);

// Run the crawl.
$spider->crawl();

// Print crawl statistics.
$statsHandler = $spider->getStatsHandler();
echo "\nSPIDER ID: " . $statsHandler->getSpiderId();
echo "\n ENQUEUED: " . count($statsHandler->getQueued());
echo "\n SKIPPED: " . count($statsHandler->getFiltered());
echo "\n FAILED: " . count($statsHandler->getFailed());

// Post-process the downloaded resources: print each page title.
echo "\n\nDOWNLOADED RESOURCES: ";
foreach ($spider->getPersistenceHandler() as $downloadedResource) {
    echo "\n - " . $downloadedResource->getCrawler()->filterXpath('//title')->text();
}
/**
 * Verifies that nothing is persisted when every request fails.
 *
 * @covers VDB\Spider\Spider
 */
public function testCrawlFailedRequest()
{
    // All requests issued during the crawl throw.
    $this->requestHandler
        ->expects($this->any())
        ->method('request')
        ->will($this->throwException(new Exception('Failed mock request!')));

    $this->spider->crawl();

    // A failed download must never reach the persistence handler.
    $persistenceHandler = $this->spider->getDownloader()->getPersistenceHandler();
    $this->assertCount(0, $persistenceHandler, 'Persisted count');
}
// NOTE(review): this is a fragment of a larger example script — $timerMiddleware,
// $statsHandler, $start and $politenessPolicyEventListener are defined earlier,
// outside this chunk. Verify against the full script.

// Let's add something to enable us to stop the script
$spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_USER_STOPPED, function (Event $event) { echo "\nCrawl aborted by user.\n"; exit; });

// Let's add a CLI progress meter for fun
echo "\nCrawling";
$spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, function (Event $event) { echo '.'; });

// Set up some caching, logging and profiling on the HTTP client of the spider
$guzzleClient = $spider->getDownloader()->getRequestHandler()->getClient();
// Tap middleware calls the timer's onRequest/onResponse hooks around each HTTP call.
$tapMiddleware = Middleware::tap([$timerMiddleware, 'onRequest'], [$timerMiddleware, 'onResponse']);
// NOTE(review): Guzzle's getConfig() is deprecated as of Guzzle 7 — consider
// building the client with an explicit HandlerStack instead. TODO confirm Guzzle version.
$guzzleClient->getConfig('handler')->push($tapMiddleware, 'timer');

// Execute the crawl
$result = $spider->crawl();

// Report the crawl statistics collected by the stats handler.
echo "\n\nSPIDER ID: " . $statsHandler->getSpiderId();
echo "\n ENQUEUED: " . count($statsHandler->getQueued());
echo "\n SKIPPED: " . count($statsHandler->getFiltered());
echo "\n FAILED: " . count($statsHandler->getFailed());
echo "\n PERSISTED: " . count($statsHandler->getPersisted());

// With the information from some of plugins and listeners, we can determine some metrics
$peakMem = round(memory_get_peak_usage(true) / 1024 / 1024, 2);
$totalTime = round(microtime(true) - $start, 2);
// totalDelay is accumulated in microseconds; convert to seconds.
$totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2);
echo "\n\nMETRICS:";
echo "\n PEAK MEM USAGE: " . $peakMem . 'MB';
echo "\n TOTAL TIME: " . $totalTime . 's';
echo "\n REQUEST TIME: " . $timerMiddleware->getTotal() . 's';
echo "\n POLITENESS WAIT TIME: " . $totalDelay . 's';
/**
 * Start parsing URLs.
 *
 * Configures a VDB Spider against $this->seed (breadth-first, bounded by
 * $this->maxLinkDepth and $this->downloadLimit, with scheme/host/regex URI
 * filters and optional basic auth), executes the crawl, logs statistics via
 * \Pimcore\Logger, and finally hands every downloaded resource to
 * $this->parseResponse().
 */
public function startParser()
{
    // BUGFIX: microtime() without `true` returns a "msec sec" STRING, which
    // made the elapsed-time subtraction below meaningless. Use the float form.
    $start = microtime(true);

    $spider = new Spider($this->seed);
    if ($this->downloadLimit > 0) {
        $spider->getDownloader()->setDownloadLimit($this->downloadLimit);
    }

    $statsHandler = new StatsHandler();
    $logHandler = new Logger(\Pimcore::inDebugMode());

    $queueManager = new InMemoryQueueManager();
    $queueManager->getDispatcher()->addSubscriber($statsHandler);
    $queueManager->getDispatcher()->addSubscriber($logHandler);

    $spider->getDiscovererSet()->maxDepth = $this->maxLinkDepth;
    $queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
    $spider->setQueueManager($queueManager);

    // Follow hreflang alternates plus every anchor that is not rel="nofollow".
    $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//link[@hreflang]|//a[not(@rel='nofollow')]"));

    $spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter($this->allowedSchemes));
    $spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($this->seed), $this->allowSubDomains));
    $spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter());
    $spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter());
    $spider->getDiscovererSet()->addFilter(new UriFilter($this->invalidLinkRegexes));
    $spider->getDiscovererSet()->addFilter(new NegativeUriFilter($this->validLinkRegexes));

    // Wait 20ms between requests to the same host.
    $politenessPolicyEventListener = new PolitenessPolicyListener(20); // CHANGE TO 100 !!!!
    $spider->getDownloader()->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
        array($politenessPolicyEventListener, 'onCrawlPreRequest')
    );

    $spider->getDispatcher()->addSubscriber($statsHandler);
    $spider->getDispatcher()->addSubscriber($logHandler);

    // Allow the crawl to be aborted from outside (checked before each request).
    $abortListener = new Listener\Abort($spider);
    $spider->getDownloader()->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
        array($abortListener, 'checkCrawlerState')
    );
    $spider->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_USER_STOPPED,
        array($abortListener, 'stopCrawler')
    );

    if ($this->useAuth) {
        $authListener = new Listener\Auth($this->authUserName, $this->authPassword);
        $spider->getDownloader()->getDispatcher()->addListener(
            SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
            array($authListener, 'setAuth')
        );
    }

    $spider->getDownloader()->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_POST_REQUEST,
        function (Event $event) {
            //echo 'crawling: ' . $event->getArgument('uri')->toString() . "\n";
        }
    );

    // Execute the crawl
    $spider->crawl();

    // Report crawl statistics (BUGFIX: the "SPIDER ID" line was logged twice).
    \Pimcore\Logger::debug("SPIDER ID: " . $statsHandler->getSpiderId());
    \Pimcore\Logger::debug("ENQUEUED: " . count($statsHandler->getQueued()));
    \Pimcore\Logger::debug("SKIPPED: " . count($statsHandler->getFiltered()));
    \Pimcore\Logger::debug("FAILED: " . count($statsHandler->getFailed()));
    \Pimcore\Logger::debug("PERSISTED: " . count($statsHandler->getPersisted()));

    $peakMem = round(memory_get_peak_usage(true) / 1024 / 1024, 2);
    $totalTime = round(microtime(true) - $start, 2);
    // totalDelay is accumulated in microseconds; convert to seconds.
    $totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2);

    \Pimcore\Logger::debug("PEAK MEM USAGE: " . $peakMem . 'MB');
    \Pimcore\Logger::debug("TOTAL TIME: " . $totalTime . 's');
    \Pimcore\Logger::debug("POLITENESS WAIT TIME: " . $totalDelay . 's');

    $downloaded = $spider->getDownloader()->getPersistenceHandler();

    // Parse all downloaded resources.
    foreach ($downloaded as $resource) {
        $this->parseResponse($resource);
    }
}