/**
 * Creates a Link for the given URL and attaches a freshly built origin domain.
 *
 * @param mixed $url    Value handed to the Link constructor.
 * @param mixed $domain Value handed to the Domain constructor.
 *
 * @return Link The new link with its origin domain set.
 */
public function newLink($url, $domain)
{
    // The Domain is constructed before the Link, as in the original flow.
    $originDomain = new Domain($domain);

    $createdLink = new Link($url);
    $createdLink->setOriginDomain($originDomain);

    return $createdLink;
}
/**
 * Builds a Link from extracted link data, resolving its href against the
 * href of the link it was found on.
 *
 * @param array  $linkData Link data; the 'href' and 'text' keys are read.
 * @param Domain $domain   Domain set as the new link's origin domain.
 * @param Link   $origin   Link the data was found on; its href is the base
 *                         for deriving the absolute URL.
 *
 * @return Link The new link with origin domain, link text and origin set.
 */
public function getLink(array $linkData, Domain $domain, Link $origin)
{
    $urlDeriver = new \webignition\AbsoluteUrlDeriver\AbsoluteUrlDeriver(
        $linkData['href'],
        $origin->getLinkHref()
    );

    // The deriver's result is cast to a string before it reaches Link.
    $absoluteUrl = (string) $urlDeriver->getAbsoluteUrl();

    $derivedLink = new Link($absoluteUrl);
    $derivedLink->setOriginDomain($domain);
    $derivedLink->setLinkText($linkData['text']);
    $derivedLink->setOrigin($origin);

    return $derivedLink;
}
/**
 * A "callto:" link whose origin domain is http://codebuster.de must not be
 * reported as leaving the origin domain.
 */
public function testIsLeavingOriginDomain_LinkHasNoHost()
{
    $hostlessLink = new Link('callto:1337donotcare"');
    $originDomain = new Domain('http://codebuster.de');
    $hostlessLink->setOriginDomain($originDomain);

    $isLeaving = $hostlessLink->isLeavingOriginDomain();

    $this->assertFalse($isLeaving);
}
/**
 * Crawls the given domain.
 *
 * Starts a CrawlerProcess, queues a seed link built from the domain's URL,
 * then handles queued links one at a time until the process stops running
 * or the queue's pop() returns false. Events are dispatched on start, on
 * every popped link, on every processed link and on finish.
 *
 * @param string $domain     Value handed to the Domain constructor.
 * @param string $startPoint Second Domain constructor argument; defaults to '/'.
 *
 * @return void
 */
public function crawl($domain, $startPoint = '/')
{
    $process = new CrawlerProcess();
    $process->start();

    $crawlDomain = new Domain($domain, $startPoint);
    $this->domain = $crawlDomain;

    $this->dispatcher->dispatch(
        CrawlerEvents::onStart,
        new FilterCrawlerProcessEvent($process)
    );

    // Seed the queue with the domain's own URL.
    $seed = new Link($crawlDomain->getUrl());
    $seed->setOriginDomain($crawlDomain);
    $this->pushLinkToQueue($seed);

    // Work through the queue; pop() is only called while the process runs.
    while ($process->isRunning()) {
        $current = $this->queue->pop();
        if ($current === false) {
            break;
        }

        $this->dispatcher->dispatch(
            CrawlerEvents::onPopLinkFromQueue,
            new FilterLinkEvent($current, $process)
        );

        // Download the page; the response is stored on the link even when null.
        $pageResponse = $this->downloadPage($current, $process);
        $current->setResponse($pageResponse);

        if ($pageResponse !== null) {
            // Copy the response data onto the link.
            $current->setStatusCode($pageResponse->getStatusCode());

            $pageTitle = $this->finder->getTitle($current->getHtml());
            $current->setPageTitle($pageTitle);

            // Extract links from the response and add them to the queue.
            $this->findLinksAndAddToQueue($current, $process);
        }

        $this->dispatcher->dispatch(
            CrawlerEvents::onLinkProcessed,
            new FilterLinkEvent($current, $process)
        );

        // Pause between links so we don't DoS the server.
        sleep($this->wait_time);
    }

    $process->done();

    $this->dispatcher->dispatch(
        CrawlerEvents::onFinish,
        new FilterCrawlerProcessEvent($process)
    );
}