Example #1
 /**
  * Sets up the fixture, for example, opens a network connection.
  * This method is called before a test is executed.
  *
  * Setting up the following structure:
  *
  * 0:        A
  *          /|\
  * 1:      B C E
  *        /| | |
  * 2:    D F G |
  *         | _ |
  *
  * Note: E links to F.
  */
 protected function setUp()
 {
     $this->spider = new Spider('http://php-spider.org/A');
     $this->requestHandler = $this->getMock('VDB\\Spider\\RequestHandler\\RequestHandlerInterface');
     $this->hrefA = 'http://php-spider.org/A';
     $this->hrefB = 'http://php-spider.org/B';
     $this->hrefC = 'http://php-spider.org/C';
     $this->hrefD = 'http://php-spider.org/D';
     $this->hrefE = 'http://php-spider.org/E';
     $this->hrefF = 'http://php-spider.org/F';
     $this->hrefG = 'http://php-spider.org/G';
     $this->linkA = new DiscoveredUri(new Uri($this->hrefA));
     $this->linkB = new DiscoveredUri(new Uri($this->hrefB));
     $this->linkC = new DiscoveredUri(new Uri($this->hrefC));
     $this->linkD = new DiscoveredUri(new Uri($this->hrefD));
     $this->linkE = new DiscoveredUri(new Uri($this->hrefE));
     $this->linkF = new DiscoveredUri(new Uri($this->hrefF));
     $this->linkG = new DiscoveredUri(new Uri($this->hrefG));
     $this->linkA->setDepthFound(0);
     $this->linkB->setDepthFound(1);
     $this->linkC->setDepthFound(1);
     $this->linkD->setDepthFound(2);
     $this->linkE->setDepthFound(1);
     $this->linkF->setDepthFound(2);
     $this->linkG->setDepthFound(2);
     $htmlA = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResourceA.html');
     $this->responseA = new Response(200, [], $htmlA);
     $htmlB = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResourceB.html');
     $this->responseB = new Response(200, [], $htmlB);
     $htmlC = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResourceC.html');
     $this->responseC = new Response(200, [], $htmlC);
     $htmlD = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResourceD.html');
     $this->responseD = new Response(200, [], $htmlD);
     $htmlE = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResourceE.html');
     $this->responseE = new Response(200, [], $htmlE);
     $htmlF = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResourceF.html');
     $this->responseF = new Response(200, [], $htmlF);
     $htmlG = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResourceG.html');
     $this->responseG = new Response(200, [], $htmlG);
     $this->linkToResponseMap[$this->linkA->toString()] = $this->responseA;
     $this->linkToResponseMap[$this->linkB->toString()] = $this->responseB;
     $this->linkToResponseMap[$this->linkC->toString()] = $this->responseC;
     $this->linkToResponseMap[$this->linkD->toString()] = $this->responseD;
     $this->linkToResponseMap[$this->linkE->toString()] = $this->responseE;
     $this->linkToResponseMap[$this->linkF->toString()] = $this->responseF;
     $this->linkToResponseMap[$this->linkG->toString()] = $this->responseG;
     $this->requestHandler->expects($this->any())->method('request')->will($this->returnCallback(array($this, 'doTestRequest')));
     $this->spider->getDownloader()->setRequestHandler($this->requestHandler);
     $this->spider->getDiscovererSet()->set(new XPathExpressionDiscoverer('//a'));
     $this->statsHandler = new StatsHandler();
     $this->spider->getDispatcher()->addSubscriber($this->statsHandler);
     $this->spider->getQueueManager()->getDispatcher()->addSubscriber($this->statsHandler);
     $this->spider->getDownloader()->getDispatcher()->addSubscriber($this->statsHandler);
     $this->logHandler = new LogHandler();
     $this->spider->getDispatcher()->addSubscriber($this->logHandler);
     $this->spider->getQueueManager()->getDispatcher()->addSubscriber($this->logHandler);
     $this->spider->getDownloader()->getDispatcher()->addSubscriber($this->logHandler);
 }
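
The mocked request handler above is wired to a doTestRequest callback that this listing does not show. A minimal sketch of what it could look like, assuming request() is called with the DiscoveredUri and that the stubbed PSR-7 response is wrapped in the library's Resource (hypothetical reconstruction from the $linkToResponseMap built in setUp()):

 public function doTestRequest(DiscoveredUri $uri)
 {
     // Serve the response stubbed for this URI in setUp()
     if (array_key_exists($uri->toString(), $this->linkToResponseMap)) {
         return new Resource($uri, $this->linkToResponseMap[$uri->toString()]);
     }
     throw new \ErrorException('The URI ' . $uri->toString() . ' was not stubbed');
 }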
Example #2
$queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
$spider->setQueueManager($queueManager);
// We add a URI discoverer. Without it, the spider wouldn't get past the seed resource.
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//div[@class='dir-1 borN'][2]//a"));
// Let's tell the spider to save all found resources on the filesystem
$spider->getDownloader()->setPersistenceHandler(new \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler(__DIR__ . '/results'));
// Add some prefetch filters. These are executed before a resource is requested.
// The more of these you have, the fewer HTTP requests the spider makes and the less work is left for the processors
$spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter(array('http')));
$spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($seed), $allowSubDomains));
$spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter());
$spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter());
// We add an event listener to the crawler that implements a politeness policy: we wait 100ms between every two requests to the same domain
$politenessPolicyEventListener = new PolitenessPolicyListener(100);
$spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($politenessPolicyEventListener, 'onCrawlPreRequest'));
$spider->getDispatcher()->addSubscriber($statsHandler);
$spider->getDispatcher()->addSubscriber($LogHandler);
// Let's add something to enable us to stop the script
$spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_USER_STOPPED, function (Event $event) {
    echo "\nCrawl aborted by user.\n";
    exit;
});
// Let's add a CLI progress meter for fun
echo "\nCrawling";
$spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, function (Event $event) {
    echo '.';
});
// Set up some caching, logging and profiling on the HTTP client of the spider
$guzzleClient = $spider->getDownloader()->getRequestHandler()->getClient();
$tapMiddleware = Middleware::tap([$timerMiddleware, 'onRequest'], [$timerMiddleware, 'onResponse']);
$guzzleClient->getConfig('handler')->push($tapMiddleware, 'timer');
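
The snippet above references $queueManager, $spider, $seed, $allowSubDomains, $statsHandler, $LogHandler and $timerMiddleware without defining them; they are presumably set up earlier in the script. A minimal sketch of that setup, using the same classes as the other examples (the timer object with onRequest/onResponse hooks is a hypothetical stand-in for whatever Middleware::tap() is meant to observe):

$seed = 'http://example.com'; // hypothetical seed URI
$allowSubDomains = false;
$spider = new Spider($seed);
$queueManager = new InMemoryQueueManager();
$statsHandler = new StatsHandler();
$LogHandler = new LogHandler();

// Hypothetical timing middleware: Middleware::tap() invokes the first
// callable before the handler sends the request, and the second one right
// after, with the response promise as the third argument
$timerMiddleware = new class {
    private $start;
    public function onRequest($request, array $options)
    {
        $this->start = microtime(true);
    }
    public function onResponse($request, array $options, $promise)
    {
        $start = $this->start;
        // Print the elapsed time once the response actually arrives
        $promise->then(function ($response) use ($start) {
            echo ' [' . round(microtime(true) - $start, 3) . 's] ';
            return $response;
        });
    }
};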
Example #3
$spider->setMaxQueueSize(10);
// We add a URI discoverer. Without it, the spider wouldn't get past the seed resource.
$spider->addDiscoverer(new XPathExpressionDiscoverer("//div[@class='dir-1 borN'][2]//a"));
// Let's tell the spider to save all found resources on the filesystem
$spider->setPersistenceHandler(new \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler(__DIR__ . '/results'));
// This time, we set the traversal algorithm to breadth-first. The default is depth-first
$spider->setTraversalAlgorithm(Spider::ALGORITHM_BREADTH_FIRST);
// Add some prefetch filters. These are executed before a resource is requested.
// The more of these you have, the fewer HTTP requests the spider makes and the less work is left for the processors
$spider->addPreFetchFilter(new AllowedSchemeFilter(array('http')));
$spider->addPreFetchFilter(new AllowedHostsFilter(array($seed), $allowSubDomains));
$spider->addPreFetchFilter(new UriWithHashFragmentFilter());
$spider->addPreFetchFilter(new UriWithQueryStringFilter());
// We add an event listener to the crawler that implements a politeness policy: we wait 450ms between every two requests to the same domain
$politenessPolicyEventListener = new PolitenessPolicyListener(450);
$spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($politenessPolicyEventListener, 'onCrawlPreRequest'));
// Let's add a CLI progress meter for fun
echo "\nCrawling";
$spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_ENQUEUE, function (Event $event) {
    echo '.';
});
// Set up some caching, logging and profiling on the HTTP client of the spider
$guzzleClient = $spider->getRequestHandler()->getClient();
$guzzleClient->addSubscriber($logPlugin);
$guzzleClient->addSubscriber($timerPlugin);
$guzzleClient->addSubscriber($cachePlugin);
// Set the user agent
$guzzleClient->setUserAgent('PHP-Spider');
// Execute the crawl
$result = $spider->crawl();
// Report
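
The listing breaks off at the report step. A short sketch of what the report could print, assuming a $statsHandler with the accessors used in the next example (getQueued(), getFiltered(), getFailed(), getPersisted()):

echo "\n\nENQUEUED:  " . count($statsHandler->getQueued());
echo "\nSKIPPED:   " . count($statsHandler->getFiltered());
echo "\nFAILED:    " . count($statsHandler->getFailed());
echo "\nPERSISTED: " . count($statsHandler->getPersisted());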
Example #4

 /**
  * Start parsing URLs.
  */
 public function startParser()
 {
      $start = microtime(true); // float timestamp, so the duration math below works
     $spider = new Spider($this->seed);
     if ($this->downloadLimit > 0) {
         $spider->getDownloader()->setDownloadLimit($this->downloadLimit);
     }
     $statsHandler = new StatsHandler();
     $LogHandler = new Logger(\Pimcore::inDebugMode());
     $queueManager = new InMemoryQueueManager();
     $queueManager->getDispatcher()->addSubscriber($statsHandler);
     $queueManager->getDispatcher()->addSubscriber($LogHandler);
     $spider->getDiscovererSet()->maxDepth = $this->maxLinkDepth;
     $queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
     $spider->setQueueManager($queueManager);
     $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//link[@hreflang]|//a[not(@rel='nofollow')]"));
     $spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter($this->allowedSchemes));
     $spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($this->seed), $this->allowSubDomains));
     $spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter());
     $spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter());
     $spider->getDiscovererSet()->addFilter(new UriFilter($this->invalidLinkRegexes));
     $spider->getDiscovererSet()->addFilter(new NegativeUriFilter($this->validLinkRegexes));
      $politenessPolicyEventListener = new PolitenessPolicyListener(20); // TODO: change to 100
     $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($politenessPolicyEventListener, 'onCrawlPreRequest'));
     $spider->getDispatcher()->addSubscriber($statsHandler);
     $spider->getDispatcher()->addSubscriber($LogHandler);
     $abortListener = new Listener\Abort($spider);
     $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($abortListener, 'checkCrawlerState'));
     $spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_USER_STOPPED, array($abortListener, 'stopCrawler'));
     if ($this->useAuth) {
         $authListener = new Listener\Auth($this->authUserName, $this->authPassword);
         $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($authListener, 'setAuth'));
     }
     $spider->getDownloader()->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, function (Event $event) {
         //echo 'crawling: ' . $event->getArgument('uri')->toString() . "\n";
     });
     // Execute the crawl
     $spider->crawl();
     \Pimcore\Logger::debug("SPIDER ID: " . $statsHandler->getSpiderId());
     \Pimcore\Logger::debug("SPIDER ID: " . $statsHandler->getSpiderId());
     \Pimcore\Logger::debug("ENQUEUED:  " . count($statsHandler->getQueued()));
     \Pimcore\Logger::debug("SKIPPED:   " . count($statsHandler->getFiltered()));
     \Pimcore\Logger::debug("FAILED:    " . count($statsHandler->getFailed()));
     \Pimcore\Logger::debug("PERSISTED: " . count($statsHandler->getPersisted()));
      $peakMem = round(memory_get_peak_usage(true) / 1024 / 1024, 2);
      $totalTime = round(microtime(true) - $start, 2);
     $totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2);
     \Pimcore\Logger::debug("PEAK MEM USAGE:       " . $peakMem . 'MB');
     \Pimcore\Logger::debug("TOTAL TIME:           " . $totalTime . 's');
     \Pimcore\Logger::debug("POLITENESS WAIT TIME: " . $totalDelay . 's');
     $downloaded = $spider->getDownloader()->getPersistenceHandler();
      // Parse all persisted resources
     foreach ($downloaded as $resource) {
         $this->parseResponse($resource);
     }
 }
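
parseResponse() is defined elsewhere in this class. A hypothetical sketch of such a method, assuming the php-spider Resource API (getUri(), and getCrawler() returning a Symfony DomCrawler):

 /**
  * Hypothetical: log the title of each persisted resource.
  */
 protected function parseResponse($resource)
 {
     $uri = $resource->getUri()->toString();
     // getCrawler() exposes the response body as a Symfony DomCrawler
     $titleNodes = $resource->getCrawler()->filterXPath('//title');
     $title = $titleNodes->count() ? trim($titleNodes->text()) : '(no title)';
     \Pimcore\Logger::debug("PARSED: " . $uri . " - " . $title);
 }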