/**
 * Crawls pages breadth-first, starting at $startPage and following every
 * link that UrlTools::isDomainLink() accepts relative to $startPage.
 *
 * For each page that downloads successfully, a Record holding the URL, the
 * number of <img> tags reported by $doc, and the elapsed time (fetch + load
 * + image count, converted by convertTimeToMiliseconds()) is added to
 * $parsedPages. Progress is echoed to standard output.
 *
 * Each distinct URL is queued at most once per call, so a page that fails
 * to download is not fetched again for every other page that links to it.
 *
 * @param mixed           $startPage   URL the crawl begins at; also used as
 *                                     the prefix for local links and as the
 *                                     reference for the domain check.
 * @param HtmlPage        $doc         Parser instance, reloaded with the HTML
 *                                     of each fetched page.
 * @param ParsedPagesList $parsedPages Collects the results; URLs it already
 *                                     contains are skipped.
 */
public function crawl($startPage, HtmlPage $doc, ParsedPagesList $parsedPages)
{
    // FIFO queue of URLs still to visit (array_shift takes from the front,
    // new links are appended at the back => breadth-first order).
    $pending = [$startPage];

    // Every URL that has ever been queued, stored as keys for O(1) lookup.
    // Prevents the queue from growing with one entry per link occurrence.
    $queued = [(string) $startPage => true];

    while ($pending !== []) {
        $url = array_shift($pending);

        // The caller may hand in a list that already contains results.
        if ($parsedPages->isUrlInList($url)) {
            continue;
        }

        printf('==> Crawling %s ...', urldecode($url));

        $startTime = microtime(true);
        $html = CurlWrapper::get($url);
        if ($html === false) {
            echo 'Failed due to curl error' . PHP_EOL;
            continue;
        }

        $doc->load($html);
        $imageCount = $doc->getImgTagCount();
        $parseTime = $this->convertTimeToMiliseconds(microtime(true) - $startTime);
        $parsedPages->add(new Record($url, $imageCount, $parseTime));

        printf("Done (%d)\n", memory_get_usage());

        foreach ($doc->getLinks() as $link) {
            $href = $doc->getHrefOfLink($link);

            if (UrlTools::isLocalLink($href)) {
                // NOTE(review): plain concatenation assumes $startPage and the
                // extracted path join cleanly (no doubled or missing "/") -
                // confirm against UrlTools::extractLocalPath().
                $href = $startPage . UrlTools::extractLocalPath($href);
            }

            if (!UrlTools::isDomainLink($startPage, $href)) {
                continue;
            }

            $key = (string) $href;
            if (isset($queued[$key])) {
                continue;
            }

            $queued[$key] = true;
            $pending[] = $href;
        }
    }
}