Example #1
0
 /**
  * Crawls pages from the URL frontier until the session's time budget is
  * used up or no crawlable URL remains.
  *
  * Each pass takes the next non-ignored URL from the front of the frontier,
  * loads it as a Page, records it as crawled and collects its new, valid
  * links. If the page date lies within [dateLimitLow, dateLimitHigh], the
  * links go to the front of the frontier and the page is stored; otherwise
  * the links go to the back and the page URL is added to the ignore list.
  * Between passes it waits so that at least WAIT_BETWEEN_CRAWLS seconds
  * separate two page retrievals.
  *
  * @param bool $isRootStep When true, the first page handled by this call is
  *                         not stored in the database; every later page is.
  *
  * @return void
  */
 function crawlStep($isRootStep = false)
 {
     // Iterate rather than recurse: one stack frame per crawled page would
     // grow without bound on long crawl sessions.
     while (true) {
         $now = new DateTime();
         $timeLeft = $this->startDate->getTimestamp() + $this->timeToLive * 60 - $now->getTimestamp();
         echo "{$timeLeft} seconds left in this crawl session\n";
         if ($timeLeft <= 0) {
             return;
         }
         echo "Frontier size: " . count($this->urlFrontier) . "\n";

         // Take URLs off the front of the frontier until one is found that
         // is not on the ignore list; stop entirely once the frontier is empty.
         $url = "";
         while ($url === "" || in_array($url, $this->ignore, true)) {
             if (count($this->urlFrontier) < 1) {
                 return;
             }
             $url = array_shift($this->urlFrontier);
         }

         echo "retrieving page at {$url}\n";
         $this->lastCrawlTime = new DateTime();
         $page = new Page($url, $this);
         $pageUrl = $page->getPageUrl();
         $this->crawled[] = $pageUrl;
         $this->crawledUrlEnds[] = Util::getUrlEnd($pageUrl);

         // Keep only valid links that are not already queued, crawled or
         // ignored, and not repeated within this same page.
         $pushToFrontier = [];
         foreach ($page->getLinks() as $link) {
             if (
                 in_array($link, $pushToFrontier, true)
                 || in_array($link, $this->urlFrontier, true)
                 || in_array($link, $this->crawled, true)
                 || in_array($link, $this->ignore, true)
             ) {
                 continue;
             }
             if (!filter_var($link, FILTER_VALIDATE_URL)) {
                 continue;
             }
             $pushToFrontier[] = $link;
         }

         $pageDate = $page->getDate();
         if ($pageDate >= $this->dateLimitLow && $pageDate <= $this->dateLimitHigh) {
             // Page is inside the date window: its links are placed at the
             // front of the frontier so they are taken before older entries.
             foreach ($pushToFrontier as $value) {
                 array_unshift($this->urlFrontier, $value);
             }
             if (!$isRootStep) {
                 echo "storing {$url}\n";
                 Database::storePage($page, $this->rootUrl);
             }
         } else {
             // Page is outside the date window: its links go to the back of
             // the frontier and the page itself is put on the ignore list.
             foreach ($pushToFrontier as $value) {
                 $this->urlFrontier[] = $value;
             }
             $this->ignore[] = $pageUrl;
         }

         // Wait out the remainder of the minimum delay between retrievals.
         $now = new DateTime();
         $nowStamp = $now->getTimestamp();
         $lastStamp = $this->lastCrawlTime->getTimestamp();
         if ($lastStamp + self::$WAIT_BETWEEN_CRAWLS > $nowStamp) {
             $seconds = $lastStamp + self::$WAIT_BETWEEN_CRAWLS - $nowStamp;
             echo "wait {$seconds} seconds\n";
             sleep($seconds);
         }
         echo "next crawlstep\n";

         // Only the first pass of a call can be the root step.
         $isRootStep = false;
     }
 }