Exemple #1
0
 /**
  * Crawls pages breadth-first starting at $startPage and records one entry per fetched page.
  *
  * For every URL taken from the queue the page is fetched through CurlWrapper::get(),
  * loaded into $doc, and a Record holding the URL, its <img> tag count and the elapsed
  * fetch-and-parse time is added to $parsedPages. Links found on the page are then queued
  * if UrlTools::isDomainLink() accepts them relative to $startPage.
  *
  * Each distinct URL is queued at most once per call, so a URL whose fetch fails is
  * reported once and not retried, and links repeated on many pages do not inflate the queue.
  * Progress is written to standard output.
  *
  * @param string          $startPage   URL to start from; also used as the prefix for local
  *                                     links and as the reference for the domain check.
  * @param HtmlPage        $doc         Parser instance reused (re-loaded) for every page.
  * @param ParsedPagesList $parsedPages Collects results; URLs it already contains are skipped.
  */
 public function crawl($startPage, HtmlPage $doc, ParsedPagesList $parsedPages)
 {
     // Set of URLs that have already been queued during this call, keyed by URL for
     // constant-time lookup. Prevents duplicate queue entries and repeated fetch attempts.
     $queued = [$startPage => true];

     // FIFO queue of URLs still to visit; SplQueue dequeues without re-indexing.
     $pending = new \SplQueue();
     $pending->enqueue($startPage);

     while (!$pending->isEmpty()) {
         $url = $pending->dequeue();

         // The caller may pass a list that already contains some pages.
         if ($parsedPages->isUrlInList($url)) {
             continue;
         }

         // No trailing newline here: the status ("Done"/"Failed") completes the line.
         echo sprintf('==> Crawling %s ...', urldecode($url));
         $startTime = microtime(true);

         $html = CurlWrapper::get($url);
         if ($html === false) {
             echo 'Failed due to curl error' . PHP_EOL;
             continue;
         }

         $doc->load($html);
         $imageCount = $doc->getImgTagCount();

         // Elapsed time covers the fetch, the load and the image count.
         $parseTime = $this->convertTimeToMiliseconds(microtime(true) - $startTime);
         $parsedPages->add(new Record($url, $imageCount, $parseTime));
         echo sprintf("Done (%d)\n", memory_get_usage());

         foreach ($doc->getLinks() as $link) {
             $href = $doc->getHrefOfLink($link);

             // Local links are resolved by prefixing the start page URL.
             if (UrlTools::isLocalLink($href)) {
                 $href = $startPage . UrlTools::extractLocalPath($href);
             }

             // Domain check first, so $href is only used as an array key once accepted.
             if (UrlTools::isDomainLink($startPage, $href) && !isset($queued[$href])) {
                 $queued[$href] = true;
                 $pending->enqueue($href);
             }
         }
     }
 }