/**
 * Checks that Domain::getUrl() yields 'http://codebuster.de/' both for a
 * bare host name and for a host that already carries the http scheme.
 */
public function testGetUrl()
{
    $expectedUrl = 'http://codebuster.de/';

    // Each input is constructed and asserted in turn, so a failure on the
    // first input stops the test before the second one is tried.
    foreach (array('codebuster.de', 'http://codebuster.de') as $input) {
        $subject = new Domain($input);
        $this->assertEquals($expectedUrl, $subject->getUrl());
    }
}
/**
 * Crawls a single domain, starting from the given start point.
 *
 * A new CrawlerProcess is started, the Domain built from the arguments is
 * stored on $this->domain, and a seed Link created from the domain's URL is
 * pushed to the queue. Links are then popped one by one until the process
 * stops running or the queue's pop() returns false.
 *
 * Events dispatched, in order: CrawlerEvents::onStart once, then for every
 * link CrawlerEvents::onPopLinkFromQueue and CrawlerEvents::onLinkProcessed,
 * and finally CrawlerEvents::onFinish after the process is marked done.
 *
 * @param string $domain
 * @param string $startPoint
 * @return void
 */
public function crawl($domain, $startPoint = '/')
{
    $crawlProcess = new CrawlerProcess();
    $crawlProcess->start();

    $targetDomain = new Domain($domain, $startPoint);
    $this->domain = $targetDomain;

    $this->dispatcher->dispatch(
        CrawlerEvents::onStart,
        new FilterCrawlerProcessEvent($crawlProcess)
    );

    // Seed the queue with the URL of the domain itself.
    $seed = new Link($targetDomain->getUrl());
    $seed->setOriginDomain($targetDomain);
    $this->pushLinkToQueue($seed);

    // Work through the queue; the running check comes first so nothing is
    // popped once the process has been stopped.
    while ($crawlProcess->isRunning()) {
        $current = $this->queue->pop();
        if ($current === false) {
            break;
        }

        $this->dispatcher->dispatch(
            CrawlerEvents::onPopLinkFromQueue,
            new FilterLinkEvent($current, $crawlProcess)
        );

        // Fetch the page; the response is attached to the link even when null.
        $page = $this->downloadPage($current, $crawlProcess);
        $current->setResponse($page);

        if (null !== $page) {
            // Copy status code and page title from the response onto the link.
            $current->setStatusCode($page->getStatusCode());
            $title = $this->finder->getTitle($current->getHtml());
            $current->setPageTitle($title);

            // Collect the links found on this page and enqueue them.
            $this->findLinksAndAddToQueue($current, $crawlProcess);
        }

        $this->dispatcher->dispatch(
            CrawlerEvents::onLinkProcessed,
            new FilterLinkEvent($current, $crawlProcess)
        );

        // Pause after every link so the target server is not flooded.
        sleep($this->wait_time);
    }

    $crawlProcess->done();

    $this->dispatcher->dispatch(
        CrawlerEvents::onFinish,
        new FilterCrawlerProcessEvent($crawlProcess)
    );
}