Exemple #1
0
 /**
  * Checks that Domain::getUrl() returns 'http://codebuster.de/' both when
  * the domain is given as a bare host and when it is given with a scheme.
  */
 public function testGetUrl()
 {
     $expectedUrl = 'http://codebuster.de/';

     // Same host, spelled once without and once with the "http://" prefix.
     foreach (array('codebuster.de', 'http://codebuster.de') as $input) {
         $subject = new Domain($input);
         $this->assertEquals($expectedUrl, $subject->getUrl());
     }
 }
 /**
  * Runs a crawl over the given domain.
  *
  * A Domain is built from the arguments and stored on this crawler, its URL
  * is queued as the seed link, and links are then popped from the queue one
  * at a time until the process stops running or the queue returns false.
  * Events are dispatched at start, on every pop, after every processed link
  * and at finish.
  *
  * @param  string $domain     Domain to crawl; passed to the Domain constructor.
  * @param  string $startPoint Second Domain constructor argument, defaults to '/'.
  * @return void
  */
 public function crawl($domain, $startPoint = '/')
 {
     $crawlerProcess = new CrawlerProcess();
     $crawlerProcess->start();

     $originDomain = new Domain($domain, $startPoint);
     $this->domain = $originDomain;

     $this->dispatcher->dispatch(
         CrawlerEvents::onStart,
         new FilterCrawlerProcessEvent($crawlerProcess)
     );

     // Seed the queue with the URL of the domain itself.
     $seed = new Link($originDomain->getUrl());
     $seed->setOriginDomain($originDomain);
     $this->pushLinkToQueue($seed);

     // Work through the queue. The running check comes first so that nothing
     // is popped once the process has been stopped.
     while ($crawlerProcess->isRunning()) {
         $current = $this->queue->pop();
         if ($current === false) {
             break;
         }

         $this->dispatcher->dispatch(
             CrawlerEvents::onPopLinkFromQueue,
             new FilterLinkEvent($current, $crawlerProcess)
         );

         // Fetch the page; the response is attached to the link even when it is null.
         $pageResponse = $this->downloadPage($current, $crawlerProcess);
         $current->setResponse($pageResponse);

         if ($pageResponse !== null) {
             // Copy status code and page title from the response onto the link.
             $statusCode = $pageResponse->getStatusCode();
             $current->setStatusCode($statusCode);

             $pageTitle = $this->finder->getTitle($current->getHtml());
             $current->setPageTitle($pageTitle);

             // Collect the links found on this page and queue them.
             $this->findLinksAndAddToQueue($current, $crawlerProcess);
         }

         $this->dispatcher->dispatch(
             CrawlerEvents::onLinkProcessed,
             new FilterLinkEvent($current, $crawlerProcess)
         );

         // Pause between requests so the crawled server is not flooded.
         sleep($this->wait_time);
     }

     $crawlerProcess->done();
     $this->dispatcher->dispatch(
         CrawlerEvents::onFinish,
         new FilterCrawlerProcessEvent($crawlerProcess)
     );
 }