use VDB\Spider\Spider;
use VDB\Spider\QueueManager\InMemoryQueueManager;
use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\EventListener\PolitenessPolicyListener;
use VDB\Spider\Event\SpiderEvents;
use VDB\Spider\Filter\Prefetch\AllowedSchemeFilter;
use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
use VDB\Spider\Filter\Prefetch\UriWithHashFragmentFilter;
use VDB\Spider\Filter\Prefetch\UriWithQueryStringFilter;
// StatsHandler and LogHandler come from php-spider's example code

$seed = 'http://www.dmoz.org/Computers/Internet/';

// We want to allow all subdomains of dmoz.org
$allowSubDomains = true;

// Create spider
$spider = new Spider($seed);
$spider->getDownloader()->setDownloadLimit(10);

$statsHandler = new StatsHandler();
$logHandler = new LogHandler();

$queueManager = new InMemoryQueueManager();
$queueManager->getDispatcher()->addSubscriber($statsHandler);
$queueManager->getDispatcher()->addSubscriber($logHandler);

// Set some sane defaults for this example: we only visit the first level below the seed
// and stop after 10 downloaded resources
$spider->getDiscovererSet()->maxDepth = 1;

// This time, we set the traversal algorithm to breadth-first. The default is depth-first
$queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
$spider->setQueueManager($queueManager);

// We add a URI discoverer. Without it, the spider wouldn't get past the seed resource.
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//div[@class='dir-1 borN'][2]//a"));

// Let's tell the spider to save all found resources on the filesystem
$spider->getDownloader()->setPersistenceHandler(
    new \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler(__DIR__ . '/results')
);

// Add some prefetch filters. These are executed before a resource is requested.
// The more you have of these, the fewer HTTP requests and the less work for the processors
$spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter(array('http')));
$spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($seed), $allowSubDomains));
$spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter());
$spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter());

// We add an event listener to the crawler that implements a politeness policy:
// wait 100ms between every two requests to the same domain
$politenessPolicyEventListener = new PolitenessPolicyListener(100);
$spider->getDownloader()->getDispatcher()->addListener(
    SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
    array($politenessPolicyEventListener, 'onCrawlPreRequest')
);

$spider->getDispatcher()->addSubscriber($statsHandler);
$spider->getDispatcher()->addSubscriber($logHandler);
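The block above only configures the spider; nothing is fetched until the crawl is started. As a minimal sketch, the run and a summary report could look like the following, using the same StatsHandler getters that startParser() below relies on:

// Execute the crawl, then report what happened via the StatsHandler
// subscribed above (getQueued/getFiltered/getFailed/getPersisted)
$spider->crawl();

echo "ENQUEUED:  " . count($statsHandler->getQueued()) . PHP_EOL;
echo "SKIPPED:   " . count($statsHandler->getFiltered()) . PHP_EOL;
echo "FAILED:    " . count($statsHandler->getFailed()) . PHP_EOL;
echo "PERSISTED: " . count($statsHandler->getPersisted()) . PHP_EOL;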
/**
 * Start parsing URLs!
 */
public function startParser()
{
    $start = microtime(true);

    $spider = new Spider($this->seed);
    if ($this->downloadLimit > 0) {
        $spider->getDownloader()->setDownloadLimit($this->downloadLimit);
    }

    $statsHandler = new StatsHandler();
    $logHandler = new Logger(\Pimcore::inDebugMode());

    $queueManager = new InMemoryQueueManager();
    $queueManager->getDispatcher()->addSubscriber($statsHandler);
    $queueManager->getDispatcher()->addSubscriber($logHandler);

    $spider->getDiscovererSet()->maxDepth = $this->maxLinkDepth;
    $queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
    $spider->setQueueManager($queueManager);

    // Follow hreflang alternates and any link not marked rel="nofollow"
    $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//link[@hreflang]|//a[not(@rel='nofollow')]"));

    $spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter($this->allowedSchemes));
    $spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($this->seed), $this->allowSubDomains));
    $spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter());
    $spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter());
    $spider->getDiscovererSet()->addFilter(new UriFilter($this->invalidLinkRegexes));
    $spider->getDiscovererSet()->addFilter(new NegativeUriFilter($this->validLinkRegexes));

    $politenessPolicyEventListener = new PolitenessPolicyListener(20); // TODO: change to 100
    $spider->getDownloader()->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
        array($politenessPolicyEventListener, 'onCrawlPreRequest')
    );

    $spider->getDispatcher()->addSubscriber($statsHandler);
    $spider->getDispatcher()->addSubscriber($logHandler);

    $abortListener = new Listener\Abort($spider);
    $spider->getDownloader()->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
        array($abortListener, 'checkCrawlerState')
    );
    $spider->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_USER_STOPPED,
        array($abortListener, 'stopCrawler')
    );

    if ($this->useAuth) {
        $authListener = new Listener\Auth($this->authUserName, $this->authPassword);
        $spider->getDownloader()->getDispatcher()->addListener(
            SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
            array($authListener, 'setAuth')
        );
    }

    $spider->getDownloader()->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_POST_REQUEST,
        function (Event $event) {
            //echo 'crawling: ' . $event->getArgument('uri')->toString() . "\n";
        }
    );

    // Execute the crawl
    $spider->crawl();

    \Pimcore\Logger::debug("SPIDER ID: " . $statsHandler->getSpiderId());
    \Pimcore\Logger::debug("ENQUEUED: " . count($statsHandler->getQueued()));
    \Pimcore\Logger::debug("SKIPPED: " . count($statsHandler->getFiltered()));
    \Pimcore\Logger::debug("FAILED: " . count($statsHandler->getFailed()));
    \Pimcore\Logger::debug("PERSISTED: " . count($statsHandler->getPersisted()));

    $peakMem = round(memory_get_peak_usage(true) / 1024 / 1024, 2);
    $totalTime = round(microtime(true) - $start, 2);
    $totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2);

    \Pimcore\Logger::debug("PEAK MEM USAGE: " . $peakMem . 'MB');
    \Pimcore\Logger::debug("TOTAL TIME: " . $totalTime . 's');
    \Pimcore\Logger::debug("POLITENESS WAIT TIME: " . $totalDelay . 's');

    $downloaded = $spider->getDownloader()->getPersistenceHandler();

    // Parse all downloaded resources
    foreach ($downloaded as $resource) {
        $this->parseResponse($resource);
    }
}
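startParser() ends by handing every persisted resource to $this->parseResponse(), which is not shown here. The real implementation is application-specific (for example, feeding a search index). Purely as a hypothetical sketch, assuming php-spider's VDB\Spider\Resource API (getUri(), and getCrawler() returning a Symfony DomCrawler), it could look like this:

use VDB\Spider\Resource;

/**
 * Hypothetical sketch only: extract the title of a crawled page and log it.
 * Not part of the original code above.
 */
protected function parseResponse(Resource $resource)
{
    $uri = $resource->getUri()->toString();

    // getCrawler() wraps the response body in a Symfony DomCrawler
    $crawler = $resource->getCrawler();
    $title = $crawler->filterXPath('//title')->text();

    \Pimcore\Logger::debug('parsed ' . $uri . ': ' . $title);
}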