예제 #1
0
 /**
  * Builds a Link for the given URL and attaches its origin domain.
  *
  * A Domain object is constructed from $domain and set on the new Link
  * through setOriginDomain() before the Link is returned.
  *
  * @param  mixed $url    value handed to the Link constructor
  * @param  mixed $domain value handed to the Domain constructor
  * @return Link
  */
 public function newLink($url, $domain)
 {
     $originDomain = new Domain($domain);

     $created = new Link($url);
     $created->setOriginDomain($originDomain);

     return $created;
 }
예제 #2
0
 /**
  * Creates a Link from extracted link data found on an origin Link.
  *
  * The 'href' entry of $linkData and the origin's link href are passed to
  * an AbsoluteUrlDeriver; the string form of its getAbsoluteUrl() result
  * becomes the URL of the new Link.
  *
  * @param  array  $linkData must provide the keys 'href' and 'text'
  * @param  Domain $domain   set as the origin domain of the new Link
  * @param  Link   $origin   the Link the data was found on
  * @return Link
  */
 public function getLink(array $linkData, Domain $domain, Link $origin)
 {
     $foundHref = $linkData['href'];
     $originHref = $origin->getLinkHref();
     $urlDeriver = new \webignition\AbsoluteUrlDeriver\AbsoluteUrlDeriver($foundHref, $originHref);
     $absoluteUrl = (string) $urlDeriver->getAbsoluteUrl();

     $derived = new Link($absoluteUrl);
     $derived->setOriginDomain($domain);
     $derived->setLinkText($linkData['text']);
     $derived->setOrigin($origin);

     return $derived;
 }
예제 #3
0
 /**
  * A "callto:" link must not be reported as leaving the origin domain.
  *
  * NOTE(review): the URL literal ends with a double quote character;
  * confirm whether that is intentional test data.
  */
 public function testIsLeavingOriginDomain_LinkHasNoHost()
 {
     $hostlessLink = new Link('callto:1337donotcare"');
     $originDomain = new Domain('http://codebuster.de');
     $hostlessLink->setOriginDomain($originDomain);

     $isLeaving = $hostlessLink->isLeavingOriginDomain();

     $this->assertFalse($isLeaving);
 }
예제 #4
0
 /**
  * Crawls a domain, starting from the given start point.
  *
  * A CrawlerProcess is started, a seed Link built from the Domain's URL is
  * pushed to the queue, and links are then popped and processed until the
  * process stops running or the queue's pop() returns false. The events
  * onStart, onPopLinkFromQueue, onLinkProcessed and onFinish are dispatched
  * along the way.
  *
  * @param  string $domain
  * @param  string $startPoint
  * @return void
  */
 public function crawl($domain, $startPoint = '/')
 {
     $process = new CrawlerProcess();
     $process->start();

     $target = new Domain($domain, $startPoint);
     $this->domain = $target;

     $this->dispatcher->dispatch(CrawlerEvents::onStart, new FilterCrawlerProcessEvent($process));

     // Seed the queue with the domain's own URL.
     $seed = new Link($target->getUrl());
     $seed->setOriginDomain($target);
     $this->pushLinkToQueue($seed);

     // Work through the queue while the process is still running.
     while ($process->isRunning()) {
         $current = $this->queue->pop();
         if ($current === false) {
             // pop() returned false: stop processing.
             break;
         }

         $this->dispatcher->dispatch(CrawlerEvents::onPopLinkFromQueue, new FilterLinkEvent($current, $process));

         // Download the page; the response is stored on the link even when it is null.
         $response = $this->downloadPage($current, $process);
         $current->setResponse($response);

         if ($response !== null) {
             // Copy the status code and page title onto the link.
             $statusCode = $response->getStatusCode();
             $current->setStatusCode($statusCode);

             $pageTitle = $this->finder->getTitle($current->getHtml());
             $current->setPageTitle($pageTitle);

             // Extract further links from the response and queue them.
             $this->findLinksAndAddToQueue($current, $process);
         }

         $this->dispatcher->dispatch(CrawlerEvents::onLinkProcessed, new FilterLinkEvent($current, $process));

         // Pause after every link so the crawled server is not flooded.
         sleep($this->wait_time);
     }

     $process->done();
     $this->dispatcher->dispatch(CrawlerEvents::onFinish, new FilterCrawlerProcessEvent($process));
 }