/**
 * @param Spider $spider
 * @param Resource $document
 * @return UriInterface[]
 */
public function discover(Spider $spider, Resource $document)
{
    $crawler = $document->getCrawler()->filter($this->cssSelector);
    $uris = array();
    foreach ($crawler as $node) {
        try {
            // Resolve the (possibly relative) href against the URI of the
            // document it was found in
            $uris[] = new Uri($node->getAttribute('href'), $document->getUri()->toString());
        } catch (UriSyntaxException $e) {
            $spider->getStatsHandler()->addToFailed(
                $node->getAttribute('href'),
                'Invalid URI: ' . $e->getMessage()
            );
        }
    }

    return $uris;
}
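The two-argument Uri constructor above is what resolves relative hrefs against a base URI. A minimal sketch of that behavior in isolation (the namespace is an assumption; the vdb/uri package used by this library provides a Uri class with this constructor):

// Hedged sketch of the resolution discover() performs per node.
// Namespace assumed: some versions ship this as VDB\Uri\Uri.
use VDB\Uri\Uri;

$base = 'http://example.com/dir/index.html';
$uri  = new Uri('../about.html', $base);

// A relative href is resolved against the containing document's URI
echo $uri->toString(); // expected: http://example.com/about.html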
/**
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlFailedRequest()
{
    $this->requestHandler
        ->expects($this->any())
        ->method('request')
        ->will($this->throwException(new Exception('Failed mock request!')));

    $this->spider->crawl();

    $stats = $this->spider->getStatsHandler();
    $this->assertCount(0, $stats->getFiltered(), 'Filtered count');
    $this->assertCount(0, $stats->getQueued(), 'Queued count');
    $this->assertCount(1, $stats->getFailed(), 'Failed count');
}
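This test depends on a fixture that is not shown here. A hedged sketch of a setUp() that would satisfy it, assuming the Spider accepts an injected request handler (the setUp body, the seed URI, and the setRequestHandler() injection point are all assumptions, not the library's documented test fixture):

// Hedged fixture sketch: $this->requestHandler and $this->spider are not
// defined in the test above. Assumes a mockable request handler interface
// and a setRequestHandler() setter on the Spider.
protected function setUp(): void
{
    $this->requestHandler = $this->getMockBuilder('VDB\Spider\RequestHandler\RequestHandlerInterface')
        ->getMock();

    $this->spider = new Spider('http://example.com');
    $this->spider->setRequestHandler($this->requestHandler);
}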
<?php

use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Spider;

require_once __DIR__ . '/../vendor/autoload.php';

// Create Spider
$spider = new Spider('http://www.dmoz.org');

// Add a URI discoverer. Without it, the spider does nothing. In this case, we want <a> tags from a certain <div>
$spider->addDiscoverer(new XPathExpressionDiscoverer("//div[@id='catalogs']//a"));

// Set some sane options for this example. In this case, we only get the first 10 items from the start page.
$spider->setMaxDepth(1);
$spider->setMaxQueueSize(10);

// Execute crawl
$spider->crawl();

// Report
$stats = $spider->getStatsHandler();
echo "\nSPIDER ID: " . $stats->getSpiderId();
echo "\n ENQUEUED: " . count($stats->getQueued());
echo "\n SKIPPED: " . count($stats->getFiltered());
echo "\n FAILED: " . count($stats->getFailed());

// Finally we could do some processing on the downloaded resources.
// In this example, we will echo the title of all resources.
echo "\n\nDOWNLOADED RESOURCES: ";
foreach ($spider->getPersistenceHandler() as $resource) {
    echo "\n - " . $resource->getCrawler()->filterXpath('//title')->text();
}
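The XPath discoverer above has a CSS-selector counterpart in the same namespace. Assuming CssSelectorDiscoverer is available in your version of the library, the discoverer line could equivalently read:

// Equivalent discoverer using a CSS selector instead of an XPath
// expression (assumes CssSelectorDiscoverer exists in your version):
use VDB\Spider\Discoverer\CssSelectorDiscoverer;

$spider->addDiscoverer(new CssSelectorDiscoverer('div#catalogs a'));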
use GuzzleHttp\Middleware;
use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
use VDB\Spider\Filter\Prefetch\AllowedSchemeFilter;
use VDB\Spider\Filter\Prefetch\UriWithHashFragmentFilter;
use VDB\Spider\Filter\Prefetch\UriWithQueryStringFilter;
use VDB\Spider\LogHandler;
use VDB\Spider\QueueManager\InMemoryQueueManager;
use VDB\Spider\Spider;
use VDB\Spider\StatsHandler;

require_once 'example_complex_bootstrap.php';

// The URI we want to start crawling with
$seed = 'http://www.dmoz.org/Computers/Internet/';

// We want to allow all subdomains of dmoz.org
$allowSubDomains = true;

// Create spider
$spider = new Spider($seed);
$spider->getDownloader()->setDownloadLimit(10);

$statsHandler = new StatsHandler();
$logHandler = new LogHandler();

$queueManager = new InMemoryQueueManager();
$queueManager->getDispatcher()->addSubscriber($statsHandler);
$queueManager->getDispatcher()->addSubscriber($logHandler);

// Set some sane defaults for this example. We only visit the first level of www.dmoz.org. We stop at 10 queued resources.
$spider->getDiscovererSet()->maxDepth = 1;

// This time, we set the traversal algorithm to breadth-first. The default is depth-first.
$queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
$spider->setQueueManager($queueManager);

// We add a URI discoverer. Without it, the spider wouldn't get past the seed resource.
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//div[@class='dir-1 borN'][2]//a"));

// Let's tell the spider to save all found resources on the filesystem
$spider->getDownloader()->setPersistenceHandler(
    new \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler(__DIR__ . '/results')
);
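The snippet above configures the spider but stops before running it. A hedged continuation, mirroring the crawl-and-report pattern the other examples in this section use:

// Hedged continuation sketch, reusing only calls that appear elsewhere
// in this section ($statsHandler and $spider as configured above):
$spider->crawl();

echo "\n ENQUEUED: " . count($statsHandler->getQueued());
echo "\n SKIPPED: " . count($statsHandler->getFiltered());
echo "\n FAILED: " . count($statsHandler->getFailed());

// Iterate the persisted resources, as in the simple example
foreach ($spider->getDownloader()->getPersistenceHandler() as $resource) {
    echo "\n - " . $resource->getCrawler()->filterXpath('//title')->text();
}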
/**
 * @covers VDB\Spider\Spider
 */
public function testCrawlFailedRequest()
{
    $this->requestHandler
        ->expects($this->any())
        ->method('request')
        ->will($this->throwException(new Exception('Failed mock request!')));

    $this->spider->crawl();

    $this->assertCount(0, $this->spider->getDownloader()->getPersistenceHandler(), 'Persisted count');
}
use Symfony\Component\EventDispatcher\Event;
use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Event\SpiderEvents;
use VDB\Spider\EventListener\PolitenessPolicyListener;
use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
use VDB\Spider\Filter\Prefetch\AllowedSchemeFilter;
use VDB\Spider\Filter\Prefetch\UriWithHashFragmentFilter;
use VDB\Spider\Filter\Prefetch\UriWithQueryStringFilter;
use VDB\Spider\Spider;

require_once 'example_complex_bootstrap.php';

// The URI we want to start crawling with
$seed = 'http://www.dmoz.org/Computers/Internet/';

// We want to allow all subdomains of dmoz.org
$allowSubDomains = true;

// Create spider
$spider = new Spider($seed);

// Set some sane defaults for this example. We only visit the first level of www.dmoz.org. We stop at 10 queued resources.
$spider->setMaxDepth(1);
$spider->setMaxQueueSize(10);

// We add a URI discoverer. Without it, the spider wouldn't get past the seed resource.
$spider->addDiscoverer(new XPathExpressionDiscoverer("//div[@class='dir-1 borN'][2]//a"));

// Let's tell the spider to save all found resources on the filesystem
$spider->setPersistenceHandler(
    new \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler(__DIR__ . '/results')
);

// This time, we set the traversal algorithm to breadth-first. The default is depth-first.
$spider->setTraversalAlgorithm(Spider::ALGORITHM_BREADTH_FIRST);

// Add some prefetch filters. These are executed before a resource is requested.
// The more you have of these, the fewer HTTP requests and the less work for the processors.
$spider->addPreFetchFilter(new AllowedSchemeFilter(array('http')));
$spider->addPreFetchFilter(new AllowedHostsFilter(array($seed), $allowSubDomains));
$spider->addPreFetchFilter(new UriWithHashFragmentFilter());
$spider->addPreFetchFilter(new UriWithQueryStringFilter());
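All four filters above share one small contract. A hedged sketch of a custom prefetch filter, assuming the contract is a single match() method that returns true when a URI should be skipped (the interface name and method signature are inferred from the bundled filters, not quoted from the source):

// Hedged sketch of a custom prefetch filter. Assumption: filters
// implement PreFetchFilterInterface with a match() method returning
// true for URIs that should be skipped.
use VDB\Spider\Filter\PreFetchFilterInterface;
use VDB\Uri\UriInterface;

class MaxPathDepthFilter implements PreFetchFilterInterface
{
    public function match(UriInterface $uri)
    {
        // Skip URIs nested more than three path segments deep
        return substr_count($uri->getPath(), '/') > 3;
    }
}

$spider->addPreFetchFilter(new MaxPathDepthFilter());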
<?php

use Symfony\Component\EventDispatcher\Event;
use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Event\SpiderEvents;
use VDB\Spider\Spider;
use VDB\Spider\StatsHandler;

require_once __DIR__ . '/../vendor/autoload.php';

// Create Spider
$spider = new Spider('http://www.dmoz.org');

// Add a URI discoverer. Without it, the spider does nothing. In this case, we want <a> tags from a certain <div>
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//div[@id='catalogs']//a"));

// Set some sane options for this example. In this case, we only get the first 10 items from the start page.
$spider->getDiscovererSet()->maxDepth = 1;
$spider->getQueueManager()->maxQueueSize = 10;

// Let's add something to enable us to stop the script
$spider->getDispatcher()->addListener(
    SpiderEvents::SPIDER_CRAWL_USER_STOPPED,
    function (Event $event) {
        echo "\nCrawl aborted by user.\n";
        exit;
    }
);

// Add a listener to collect stats to the Spider and the QueueManager.
// There are more components that dispatch events you can use.
$statsHandler = new StatsHandler();
$spider->getQueueManager()->getDispatcher()->addSubscriber($statsHandler);
$spider->getDispatcher()->addSubscriber($statsHandler);

// Execute crawl
$spider->crawl();

// Build a report
echo "\n ENQUEUED: " . count($statsHandler->getQueued());
echo "\n SKIPPED: " . count($statsHandler->getFiltered());
echo "\n FAILED: " . count($statsHandler->getFailed());
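The SPIDER_CRAWL_USER_STOPPED listener above only fires if something dispatches that event. A hedged sketch of wiring Ctrl+C to it via POSIX signals, registered before crawl() is called (requires ext-pcntl; the dispatch() argument order follows the older Symfony EventDispatcher signature this example already uses):

// Hedged sketch: make Ctrl+C trigger the stop listener above.
// Register this before $spider->crawl(); requires ext-pcntl.
// Newer Symfony versions reverse the dispatch() argument order.
declare(ticks=1);
pcntl_signal(SIGINT, function () use ($spider) {
    $spider->getDispatcher()->dispatch(SpiderEvents::SPIDER_CRAWL_USER_STOPPED, new Event());
});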
/**
 * Start parsing URLs!
 */
public function startParser()
{
    // microtime(true) returns a float; without the argument the duration
    // math at the end of this method would operate on a string
    $start = microtime(true);

    $spider = new Spider($this->seed);
    if ($this->downloadLimit > 0) {
        $spider->getDownloader()->setDownloadLimit($this->downloadLimit);
    }

    $statsHandler = new StatsHandler();
    $logHandler = new Logger(\Pimcore::inDebugMode());

    $queueManager = new InMemoryQueueManager();
    $queueManager->getDispatcher()->addSubscriber($statsHandler);
    $queueManager->getDispatcher()->addSubscriber($logHandler);

    $spider->getDiscovererSet()->maxDepth = $this->maxLinkDepth;

    $queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
    $spider->setQueueManager($queueManager);

    // Follow hreflang alternates and all links not marked rel="nofollow"
    $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//link[@hreflang]|//a[not(@rel='nofollow')]"));

    $spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter($this->allowedSchemes));
    $spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($this->seed), $this->allowSubDomains));
    $spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter());
    $spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter());
    $spider->getDiscovererSet()->addFilter(new UriFilter($this->invalidLinkRegexes));
    $spider->getDiscovererSet()->addFilter(new NegativeUriFilter($this->validLinkRegexes));

    $politenessPolicyEventListener = new PolitenessPolicyListener(20); // CHANGE TO 100 !!!!
    $spider->getDownloader()->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
        array($politenessPolicyEventListener, 'onCrawlPreRequest')
    );

    $spider->getDispatcher()->addSubscriber($statsHandler);
    $spider->getDispatcher()->addSubscriber($logHandler);

    $abortListener = new Listener\Abort($spider);
    $spider->getDownloader()->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
        array($abortListener, 'checkCrawlerState')
    );
    $spider->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_USER_STOPPED,
        array($abortListener, 'stopCrawler')
    );

    if ($this->useAuth) {
        $authListener = new Listener\Auth($this->authUserName, $this->authPassword);
        $spider->getDownloader()->getDispatcher()->addListener(
            SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
            array($authListener, 'setAuth')
        );
    }

    $spider->getDownloader()->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_POST_REQUEST,
        function (Event $event) {
            // echo 'crawling: ' . $event->getArgument('uri')->toString() . "\n";
        }
    );

    // Execute the crawl
    $spider->crawl();

    \Pimcore\Logger::debug("SPIDER ID: " . $statsHandler->getSpiderId());
    \Pimcore\Logger::debug("ENQUEUED: " . count($statsHandler->getQueued()));
    \Pimcore\Logger::debug("SKIPPED: " . count($statsHandler->getFiltered()));
    \Pimcore\Logger::debug("FAILED: " . count($statsHandler->getFailed()));
    \Pimcore\Logger::debug("PERSISTED: " . count($statsHandler->getPersisted()));

    $peakMem = round(memory_get_peak_usage(true) / 1024 / 1024, 2);
    $totalTime = round(microtime(true) - $start, 2);
    $totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2);

    \Pimcore\Logger::debug("PEAK MEM USAGE: " . $peakMem . 'MB');
    \Pimcore\Logger::debug("TOTAL TIME: " . $totalTime . 's');
    \Pimcore\Logger::debug("POLITENESS WAIT TIME: " . $totalDelay . 's');

    $downloaded = $spider->getDownloader()->getPersistenceHandler();

    // Parse all downloaded resources
    foreach ($downloaded as $resource) {
        $this->parseResponse($resource);
    }
}
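The parseResponse() called at the end is referenced above but not shown in this document. A hypothetical skeleton, only to illustrate the Resource API already used elsewhere in this section; the real implementation will differ:

// Hypothetical skeleton (not the author's implementation): shows only
// the Resource accessors used in the other snippets in this section.
protected function parseResponse(Resource $resource)
{
    $uri = $resource->getUri()->toString();
    $title = $resource->getCrawler()->filterXpath('//title')->text();

    // ... index $uri and $title, extract body content, etc.
}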