/**
 * Sets up the fixture, for example, opens a network connection.
 * This method is called before a test is executed.
 *
 * Setting up the following structure:
 *
 * 0:      A
 *        /|\
 * 1:    B C E
 *      /| |  |
 * 2:  D F G  |
 *        |_ _|
 *
 * Note: E links to F.
 */
protected function setUp()
{
    $this->spider = new Spider('http://php-spider.org/A');

    // Stub out the HTTP layer; requests are answered from the fixture map built below.
    $this->requestHandler = $this->getMock('VDB\\Spider\\RequestHandler\\RequestHandlerInterface');

    // Node letter => depth at which the crawler discovers it (see diagram above).
    $fixtureDepths = ['A' => 0, 'B' => 1, 'C' => 1, 'D' => 2, 'E' => 1, 'F' => 2, 'G' => 2];

    foreach ($fixtureDepths as $letter => $depth) {
        $href = 'http://php-spider.org/' . $letter;
        $this->{'href' . $letter} = $href;

        $link = new DiscoveredUri(new Uri($href));
        $link->setDepthFound($depth);
        $this->{'link' . $letter} = $link;

        // Each node serves a canned HTML fixture from disk.
        $html = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResource' . $letter . '.html');
        $response = new Response(200, [], $html);
        $this->{'response' . $letter} = $response;

        // doTestRequest() resolves each URI to its canned response via this map.
        $this->linkToResponseMap[$link->toString()] = $response;
    }

    // Route every mocked request through doTestRequest(), which consults the map.
    $this->requestHandler
        ->expects($this->any())
        ->method('request')
        ->will($this->returnCallback(array($this, 'doTestRequest')));
    $this->spider->getDownloader()->setRequestHandler($this->requestHandler);

    // Follow every anchor so the whole fixture graph is reachable.
    $this->spider->getDiscovererSet()->set(new XPathExpressionDiscoverer('//a'));

    // Subscribe the stats handler to every dispatcher involved in a crawl.
    $this->statsHandler = new StatsHandler();
    $this->spider->getDispatcher()->addSubscriber($this->statsHandler);
    $this->spider->getQueueManager()->getDispatcher()->addSubscriber($this->statsHandler);
    $this->spider->getDownloader()->getDispatcher()->addSubscriber($this->statsHandler);

    // Same for the log handler.
    $this->logHandler = new LogHandler();
    $this->spider->getDispatcher()->addSubscriber($this->logHandler);
    $this->spider->getQueueManager()->getDispatcher()->addSubscriber($this->logHandler);
    $this->spider->getDownloader()->getDispatcher()->addSubscriber($this->logHandler);
}
// Traverse the queue breadth-first instead of the library's default depth-first.
$queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
$spider->setQueueManager($queueManager);

// We add an URI discoverer. Without it, the spider wouldn't get past the seed resource.
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//div[@class='dir-1 borN'][2]//a"));

// Let's tell the spider to save all found resources on the filesystem
$spider->getDownloader()->setPersistenceHandler(new \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler(__DIR__ . '/results'));

// Add some prefetch filters. These are executed before a resource is requested.
// The more you have of these, the less HTTP requests and work for the processors
$spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter(array('http')));
$spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($seed), $allowSubDomains));
$spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter());
$spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter());

// We add an eventlistener to the crawler that implements a politeness policy.
// We wait 100ms between every request to the same domain.
// NOTE(review): the comment previously claimed 450ms but the listener is constructed
// with 100 — presumably milliseconds, matching the 450/450ms pairing used elsewhere;
// confirm against PolitenessPolicyListener's constructor contract.
$politenessPolicyEventListener = new PolitenessPolicyListener(100);
$spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($politenessPolicyEventListener, 'onCrawlPreRequest'));

// Collect statistics and log crawl events.
// NOTE(review): $statsHandler and $LogHandler are assumed to be created earlier in
// this script, outside the visible span — verify.
$spider->getDispatcher()->addSubscriber($statsHandler);
$spider->getDispatcher()->addSubscriber($LogHandler);

// Let's add something to enable us to stop the script
$spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_USER_STOPPED, function (Event $event) {
    echo "\nCrawl aborted by user.\n";
    exit;
});

// Let's add a CLI progress meter for fun: one dot per completed request.
echo "\nCrawling";
$spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, function (Event $event) {
    echo '.';
});

// Set up some caching, logging and profiling on the HTTP client of the spider.
// NOTE(review): $timerMiddleware is assumed to be created earlier in this script,
// outside the visible span — verify.
$guzzleClient = $spider->getDownloader()->getRequestHandler()->getClient();
$tapMiddleware = Middleware::tap([$timerMiddleware, 'onRequest'], [$timerMiddleware, 'onResponse']);
$guzzleClient->getConfig('handler')->push($tapMiddleware, 'timer');
// Cap the crawl frontier: stop enqueueing once 10 URIs have been queued.
$spider->setMaxQueueSize(10);

// We add an URI discoverer. Without it, the spider wouldn't get past the seed resource.
$spider->addDiscoverer(new XPathExpressionDiscoverer("//div[@class='dir-1 borN'][2]//a"));

// Let's tell the spider to save all found resources on the filesystem
$spider->setPersistenceHandler(new \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler(__DIR__ . '/results'));

// This time, we set the traversal algorithm to breadth-first. The default is depth-first
$spider->setTraversalAlgorithm(Spider::ALGORITHM_BREADTH_FIRST);

// Add some prefetch filters. These are executed before a resource is requested.
// The more you have of these, the less HTTP requests and work for the processors
$spider->addPreFetchFilter(new AllowedSchemeFilter(array('http')));
$spider->addPreFetchFilter(new AllowedHostsFilter(array($seed), $allowSubDomains));
$spider->addPreFetchFilter(new UriWithHashFragmentFilter());
$spider->addPreFetchFilter(new UriWithQueryStringFilter());

// We add an eventlistener to the crawler that implements a politeness policy.
// We wait 450ms between every request to the same domain
$politenessPolicyEventListener = new PolitenessPolicyListener(450);
$spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($politenessPolicyEventListener, 'onCrawlPreRequest'));

// Let's add a CLI progress meter for fun: one dot per enqueued URI.
echo "\nCrawling";
$spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_ENQUEUE, function (Event $event) {
    echo '.';
});

// Set up some caching, logging and profiling on the HTTP client of the spider.
// NOTE(review): $logPlugin, $timerPlugin and $cachePlugin are assumed to be created
// earlier in this script, outside the visible span — verify.
$guzzleClient = $spider->getRequestHandler()->getClient();
$guzzleClient->addSubscriber($logPlugin);
$guzzleClient->addSubscriber($timerPlugin);
$guzzleClient->addSubscriber($cachePlugin);

// Set the user agent
$guzzleClient->setUserAgent('PHP-Spider');

// Execute the crawl; $result is presumably consumed by reporting code that
// follows this span — verify before renaming.
$result = $spider->crawl();

// Report
/**
 * Start Parsing Urls!
 *
 * Builds a breadth-first spider from the instance configuration (seed URI,
 * download/depth limits, link filters, optional HTTP auth), executes the
 * crawl, logs the collected statistics, then feeds every persisted resource
 * to parseResponse().
 */
public function startParser()
{
    // Bug fix: microtime() without arguments returns a "msec sec" STRING,
    // which cannot be meaningfully subtracted from the float produced by
    // microtime(true) in the elapsed-time calculation below.
    $start = microtime(true);

    $spider = new Spider($this->seed);
    if ($this->downloadLimit > 0) {
        $spider->getDownloader()->setDownloadLimit($this->downloadLimit);
    }

    $statsHandler = new StatsHandler();
    $logHandler = new Logger(\Pimcore::inDebugMode());

    // Queue events (enqueue/skip) are observed by both handlers.
    $queueManager = new InMemoryQueueManager();
    $queueManager->getDispatcher()->addSubscriber($statsHandler);
    $queueManager->getDispatcher()->addSubscriber($logHandler);

    $spider->getDiscovererSet()->maxDepth = $this->maxLinkDepth;

    $queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
    $spider->setQueueManager($queueManager);

    // Follow hreflang alternates plus every anchor that is not rel="nofollow".
    $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//link[@hreflang]|//a[not(@rel='nofollow')]"));

    // Prefetch filters: executed before a resource is requested, so every
    // filtered URI saves an HTTP round trip.
    $spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter($this->allowedSchemes));
    $spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($this->seed), $this->allowSubDomains));
    $spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter());
    $spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter());
    $spider->getDiscovererSet()->addFilter(new UriFilter($this->invalidLinkRegexes));
    $spider->getDiscovererSet()->addFilter(new NegativeUriFilter($this->validLinkRegexes));

    // Politeness policy: delay between requests to the same domain.
    $politenessPolicyEventListener = new PolitenessPolicyListener(20); // TODO: change to 100
    $spider->getDownloader()->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
        array($politenessPolicyEventListener, 'onCrawlPreRequest')
    );

    $spider->getDispatcher()->addSubscriber($statsHandler);
    $spider->getDispatcher()->addSubscriber($logHandler);

    // Allow the crawl to be aborted from outside via the crawler state.
    $abortListener = new Listener\Abort($spider);
    $spider->getDownloader()->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
        array($abortListener, 'checkCrawlerState')
    );
    $spider->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_USER_STOPPED,
        array($abortListener, 'stopCrawler')
    );

    // Optional HTTP authentication, injected on every request.
    if ($this->useAuth) {
        $authListener = new Listener\Auth($this->authUserName, $this->authPassword);
        $spider->getDownloader()->getDispatcher()->addListener(
            SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
            array($authListener, 'setAuth')
        );
    }

    // Per-request hook, kept as a no-op (uncomment for verbose crawl tracing).
    $spider->getDownloader()->getDispatcher()->addListener(
        SpiderEvents::SPIDER_CRAWL_POST_REQUEST,
        function (Event $event) {
            //echo 'crawling: ' . $event->getArgument('uri')->toString() . "\n";
        }
    );

    // Execute the crawl
    $spider->crawl();

    // Crawl statistics. (A duplicated "SPIDER ID" line was removed here.)
    \Pimcore\Logger::debug("SPIDER ID: " . $statsHandler->getSpiderId());
    \Pimcore\Logger::debug("ENQUEUED: " . count($statsHandler->getQueued()));
    \Pimcore\Logger::debug("SKIPPED: " . count($statsHandler->getFiltered()));
    \Pimcore\Logger::debug("FAILED: " . count($statsHandler->getFailed()));
    \Pimcore\Logger::debug("PERSISTED: " . count($statsHandler->getPersisted()));

    $peakMem = round(memory_get_peak_usage(true) / 1024 / 1024, 2);
    $totalTime = round(microtime(true) - $start, 2);
    // totalDelay is accumulated in microseconds; convert to seconds.
    $totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2);
    \Pimcore\Logger::debug("PEAK MEM USAGE: " . $peakMem . 'MB');
    \Pimcore\Logger::debug("TOTAL TIME: " . $totalTime . 's');
    \Pimcore\Logger::debug("POLITENESS WAIT TIME: " . $totalDelay . 's');

    // Parse all persisted resources.
    $downloaded = $spider->getDownloader()->getPersistenceHandler();
    foreach ($downloaded as $resource) {
        $this->parseResponse($resource);
    }
}