Example #1
0
 /**
  * Inserts one page as a row of the news_retriever_page table.
  *
  * The row holds the page date (formatted as Y-m-d), its title, the given
  * site root, the page URL and the page HTML.
  *
  * @param Page   $page    Page whose date, title, URL and HTML are written.
  * @param string $rootUrl Value bound to the siteroot column.
  *
  * @return void
  *
  * @throws \RuntimeException When the INSERT statement cannot be prepared.
  */
 public static function storePage(Page $page, $rootUrl)
 {
     // Collect the values before opening the connection so a failing getter
     // cannot leave a connection open.
     // bind_param() binds by reference: every argument must be a real
     // variable, so the page URL gets its own local instead of being passed
     // as a method-call result.
     $date = $page->getDate()->format('Y-m-d');
     $title = $page->getTitle();
     $pageUrl = $page->getPageUrl();
     $content = $page->getPageAsHtml();

     $mysqli = self::connect();
     $query = "INSERT INTO news_retriever_page (date, title, siteroot, pageurl, content) VALUES(?,?,?,?,?)";
     $statement = $mysqli->prepare($query);
     if ($statement === false) {
         // Read the error text before closing; it belongs to the connection.
         $error = $mysqli->error;
         $mysqli->close();
         throw new \RuntimeException("Could not prepare page insert: {$error}");
     }

     $statement->bind_param('sssss', $date, $title, $rootUrl, $pageUrl, $content);
     if (!$statement->execute()) {
         // A single failed insert is reported but does not abort the caller.
         error_log("Could not store page {$pageUrl}: {$statement->error}");
     }
     $statement->close();
     $mysqli->close();
 }
Example #2
0
 /**
  * Runs the crawl until the session time is used up or the frontier is empty.
  *
  * Each iteration takes the first non-ignored URL off the frontier, loads it
  * as a Page, and records it as crawled. Links that are valid URLs and are
  * not yet in the frontier, crawled or ignore lists are queued. When the page
  * date lies within [dateLimitLow, dateLimitHigh] its links go to the front
  * of the frontier and the page is stored (unless this is the root step);
  * otherwise its links go to the back and the page URL is added to the
  * ignore list. Between iterations the method sleeps so that at least
  * WAIT_BETWEEN_CRAWLS seconds separate two retrievals.
  *
  * The steps run in a loop rather than by self-recursion, so call depth
  * stays constant however many pages are crawled.
  *
  * @param bool $isRootStep When true, the first page handled is not stored.
  *                         Every later page is handled as a non-root step.
  *
  * @return void
  */
 public function crawlStep($isRootStep = false)
 {
     while (true) {
         $now = new DateTime();
         $timeLeft = $this->startDate->getTimestamp() + $this->timeToLive * 60 - $now->getTimestamp();
         echo "{$timeLeft} seconds left in this crawl session\n";
         if ($timeLeft <= 0) {
             return;
         }

         echo "Frontier size: " . count($this->urlFrontier) . "\n";

         // Take URLs off the front until one is found that is neither empty
         // nor ignored; stop crawling when the frontier runs dry.
         $url = "";
         while ($url === "" || in_array($url, $this->ignore, true)) {
             if (count($this->urlFrontier) < 1) {
                 return;
             }
             $url = array_shift($this->urlFrontier);
         }

         echo "retrieving page at {$url}\n";
         $this->lastCrawlTime = new DateTime();
         $page = new Page($url, $this);
         $pageUrl = $page->getPageUrl();
         $this->crawled[] = $pageUrl;
         $this->crawledUrlEnds[] = Util::getUrlEnd($pageUrl);

         $pushToFrontier = array();
         foreach ($page->getLinks() as $link) {
             // The check against $pushToFrontier keeps a link that occurs
             // several times on this page from being queued more than once.
             $alreadyKnown = in_array($link, $this->urlFrontier, true)
                 || in_array($link, $this->crawled, true)
                 || in_array($link, $this->ignore, true)
                 || in_array($link, $pushToFrontier, true);
             if ($alreadyKnown || !filter_var($link, FILTER_VALIDATE_URL)) {
                 continue;
             }
             $pushToFrontier[] = $link;
         }

         $pageDate = $page->getDate();
         if ($pageDate >= $this->dateLimitLow && $pageDate <= $this->dateLimitHigh) {
             // Unshifting one by one puts the links at the front in reverse
             // discovery order, as before.
             foreach ($pushToFrontier as $value) {
                 array_unshift($this->urlFrontier, $value);
             }
             if (!$isRootStep) {
                 echo "storing {$url}\n";
                 Database::storePage($page, $this->rootUrl);
             }
         } else {
             foreach ($pushToFrontier as $value) {
                 $this->urlFrontier[] = $value;
             }
             $this->ignore[] = $pageUrl;
         }

         // Throttle: wait out the remainder of the minimum gap, measured
         // from the moment this retrieval started.
         $now = new DateTime();
         $nowStamp = $now->getTimestamp();
         $lastStamp = $this->lastCrawlTime->getTimestamp();
         if ($lastStamp + self::$WAIT_BETWEEN_CRAWLS > $nowStamp) {
             $seconds = $lastStamp + self::$WAIT_BETWEEN_CRAWLS - $nowStamp;
             echo "wait {$seconds} seconds\n";
             sleep($seconds);
         }

         echo "next crawlstep\n";
         // Only the first iteration can be the root step.
         $isRootStep = false;
     }
 }
Example #3
0
 /**
  * Builds a Page and checks that its getters return the ID, title, content,
  * visibility, date and position passed to the constructor.
  */
 public function testPage()
 {
     $expectedId = 123;
     $expectedTitle = 'A page';
     $expectedContent = 'Some content';
     $expectedVisibility = 2;
     $expectedPosition = 25;
     $createdAt = new \DateTime();

     $subject = new Page(
         $expectedId,
         1,
         $expectedTitle,
         $expectedContent,
         $expectedVisibility,
         $createdAt,
         0,
         $expectedPosition
     );

     $this->assertEquals($expectedId, $subject->getID());
     $this->assertEquals($expectedTitle, $subject->getTitle());
     $this->assertEquals($expectedContent, $subject->getContent());
     $this->assertEquals($expectedVisibility, $subject->getVisibility());
     $this->assertEquals($createdAt, $subject->getDate());
     $this->assertEquals($expectedPosition, $subject->getPosition());
 }