Example #1
use VDB\Spider\QueueManager\InMemoryQueueManager;
use VDB\Spider\Spider;
use VDB\Spider\StatsHandler;
use VDB\Spider\LogHandler;
use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Filter\Prefetch\AllowedSchemeFilter;
use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
use GuzzleHttp\Middleware;
require_once 'example_complex_bootstrap.php';
// The URI we want to start crawling with
$seed = 'http://www.dmoz.org/Computers/Internet/';
// We want to allow all subdomains of dmoz.org
$allowSubDomains = true;
// Create spider
$spider = new Spider($seed);
$spider->getDownloader()->setDownloadLimit(10);
$statsHandler = new StatsHandler();
$logHandler = new LogHandler();
$queueManager = new InMemoryQueueManager();
$queueManager->getDispatcher()->addSubscriber($statsHandler);
$queueManager->getDispatcher()->addSubscriber($logHandler);
// Set some sane defaults for this example: we only visit the first level of www.dmoz.org and stop after 10 downloaded resources (see the download limit above)
$spider->getDiscovererSet()->maxDepth = 1;
// This time, we set the traversal algorithm to breadth-first. The default is depth-first
$queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
$spider->setQueueManager($queueManager);
// We add a URI discoverer. Without it, the spider wouldn't get past the seed resource.
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//div[@class='dir-1 borN'][2]//a"));
// Let's tell the spider to save all found resources on the filesystem
$spider->getDownloader()->setPersistenceHandler(new \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler(__DIR__ . '/results'));
// Add some prefetch filters. These are executed before a resource is requested.
// The more of these you have, the fewer HTTP requests and the less work for the processors.
$spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter(array('http')));
$spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($seed), $allowSubDomains));
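// The original snippet stops after configuration and never starts the crawl.
// A minimal continuation (a sketch, reusing the StatsHandler methods shown in the next example) could be:
$spider->crawl();
echo "ENQUEUED:  " . count($statsHandler->getQueued()) . PHP_EOL;
echo "SKIPPED:   " . count($statsHandler->getFiltered()) . PHP_EOL;
echo "FAILED:    " . count($statsHandler->getFailed()) . PHP_EOL;
echo "PERSISTED: " . count($statsHandler->getPersisted()) . PHP_EOL;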
Example #2
 /**
  * Start parsing URLs.
  */
 public function startParser()
 {
     $start = microtime(true); // float timestamp, so the elapsed-time calculation below is correct
     $spider = new Spider($this->seed);
     if ($this->downloadLimit > 0) {
         $spider->getDownloader()->setDownloadLimit($this->downloadLimit);
     }
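     // Track statistics and log crawl activity via event subscribers on the queue manager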
     $statsHandler = new StatsHandler();
     $logHandler = new Logger(\Pimcore::inDebugMode());
     $queueManager = new InMemoryQueueManager();
     $queueManager->getDispatcher()->addSubscriber($statsHandler);
     $queueManager->getDispatcher()->addSubscriber($logHandler);
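     // Limit the link depth and crawl breadth-first so pages closest to the seed are visited first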
     $spider->getDiscovererSet()->maxDepth = $this->maxLinkDepth;
     $queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
     $spider->setQueueManager($queueManager);
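     // Follow hreflang alternates and any anchors that are not marked rel="nofollow"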
     $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//link[@hreflang]|//a[not(@rel='nofollow')]"));
     $spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter($this->allowedSchemes));
     $spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($this->seed), $this->allowSubDomains));
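     // Skip URIs with hash fragments or query strings, and apply the configured invalid/valid link regexes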
     $spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter());
     $spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter());
     $spider->getDiscovererSet()->addFilter(new UriFilter($this->invalidLinkRegexes));
     $spider->getDiscovererSet()->addFilter(new NegativeUriFilter($this->validLinkRegexes));
     // Politeness policy: wait 20 ms between requests to the same domain (the leftover note suggests raising this to 100)
     $politenessPolicyEventListener = new PolitenessPolicyListener(20);
     $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($politenessPolicyEventListener, 'onCrawlPreRequest'));
     $spider->getDispatcher()->addSubscriber($statsHandler);
     $spider->getDispatcher()->addSubscriber($logHandler);
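     // The abort listener checks the crawler state before every request and stops the crawl on demand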
     $abortListener = new Listener\Abort($spider);
     $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($abortListener, 'checkCrawlerState'));
     $spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_USER_STOPPED, array($abortListener, 'stopCrawler'));
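     // Optionally authenticate every request with the configured credentials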
     if ($this->useAuth) {
         $authListener = new Listener\Auth($this->authUserName, $this->authPassword);
         $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($authListener, 'setAuth'));
     }
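     // Post-request hook; uncomment the echo below to print every crawled URI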
     $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, function (Event $event) {
         //echo 'crawling: ' . $event->getArgument('uri')->toString() . "\n";
     });
     // Execute the crawl
     $spider->crawl();
     \Pimcore\Logger::debug("SPIDER ID: " . $statsHandler->getSpiderId());
     \Pimcore\Logger::debug("ENQUEUED:  " . count($statsHandler->getQueued()));
     \Pimcore\Logger::debug("SKIPPED:   " . count($statsHandler->getFiltered()));
     \Pimcore\Logger::debug("FAILED:    " . count($statsHandler->getFailed()));
     \Pimcore\Logger::debug("PERSISTED: " . count($statsHandler->getPersisted()));
     $peakMem = round(memory_get_peak_usage(TRUE) / 1024 / 1024, 2);
     $totalTime = round(microtime(TRUE) - $start, 2);
     $totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2);
     \Pimcore\Logger::debug("PEAK MEM USAGE:       " . $peakMem . 'MB');
     \Pimcore\Logger::debug("TOTAL TIME:           " . $totalTime . 's');
     \Pimcore\Logger::debug("POLITENESS WAIT TIME: " . $totalDelay . 's');
     $downloaded = $spider->getDownloader()->getPersistenceHandler();
     //parse all resources!
     foreach ($downloaded as $resource) {
         $this->parseResponse($resource);
     }
 }
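 /**
  * Hypothetical sketch of the parseResponse() hook called above; the real
  * implementation is not shown in this example. Each persisted item is a
  * VDB\Spider\Resource, which exposes the crawled URI, the PSR-7 response
  * and a Symfony DomCrawler for the response body.
  */
 protected function parseResponse($resource)
 {
     // Resource::getUri() returns the discovered URI, Resource::getCrawler() a DomCrawler for the body
     $uri = $resource->getUri()->toString();
     $title = $resource->getCrawler()->filterXPath('//title')->text();
     \Pimcore\Logger::debug("PARSED: " . $uri . " (" . $title . ")");
 }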