/**
 * Decides whether this scan driver accepts its input source.
 *
 * The URL is refused when it ends with any of the configured ignored
 * extensions. The comparison is case-insensitive on both sides, so an
 * extension configured as ".PDF" also refuses a URL ending in ".pdf".
 *
 * @return bool false when the URL ends with an ignored extension, true otherwise
 */
public function validate()
{
    // Lower-case the URL once; it does not change between iterations.
    $url = strtolower($this->url);

    foreach ($this->ignoredExtensions as $ext) {
        // Normalise the extension as well, otherwise an upper-case entry
        // could never match the lower-cased URL.
        if (Strings::endsWith($url, strtolower($ext))) {
            return false;
        }
    }

    return true;
}
/**
 * Scans one URL for links and updates the scanned and not-yet-scanned link sets.
 *
 * Every <a href> found in the page content is reduced to a site-relative
 * path. Each resulting URL that is not already in $this->scannedUrls is
 * queued in $this->unscannedUrls. Afterwards $site itself is recorded in
 * $this->scannedUrls and removed from $this->unscannedUrls.
 *
 * A site refused by its own validate() is left untouched: it is neither
 * recorded as scanned nor removed from the unscanned set.
 *
 * @param IScanDriver $site      driver for the page being scanned
 * @param string      $startPage prefix stripped from each href and prepended to the
 *                               resulting path (presumably the root URL of the crawled site)
 */
protected function scanUrl(IScanDriver $site, $startPage)
{
    if (!$site->validate()) {
        return;
    }

    $content = $site->getContent($site);
    $currentUrl = $site->getUrl();

    /** @var \simple_html_dom|false $simpleDom */
    $simpleDom = HtmlDomParser::str_get_html($content);

    // str_get_html() returns false instead of a DOM object when the content
    // is empty or too large to parse. Treat that as "page without links" so
    // the bookkeeping below still runs instead of failing on ->find().
    /** @var \simple_html_dom_node[] $links */
    $links = $simpleDom ? $simpleDom->find("a") : [];

    foreach ($links as $link) {
        $href = $link->getAttribute("href");
        if (!$href) {
            continue;
        }

        // Protocol-relative links ("//host/path") start with a slash but
        // name a host of their own; appending them to $startPage would
        // produce a bogus URL, so they are not followed.
        if (Strings::startsWith($href, '//')) {
            continue;
        }

        // Turn absolute links into the crawled site into relative paths.
        $path = str_replace($startPage, '', $href);

        // Anything that is not a site-relative path (external links,
        // mailto:, anchors, ...) is ignored.
        if (!Strings::startsWith($path, '/')) {
            continue;
        }

        $path = str_replace($currentUrl, "", $path);
        $path = rtrim($path, "/");

        $newSite = $site::fromUrl($startPage . $path);
        $newUrl = $newSite->getUrl();

        if (isset($this->scannedUrls[$newUrl])) {
            continue;
        }

        $this->unscannedUrls[$newUrl] = $newSite;
    }

    // Move the current page from the "to do" set to the "done" set.
    $this->scannedUrls[$currentUrl] = $site;
    unset($this->unscannedUrls[$currentUrl]);
}