/**
 * Inserts one crawled page as a row of the `news_retriever_page` table.
 *
 * The stored columns are the page date (formatted `Y-m-d`), title, the site
 * root URL, the page URL and the page HTML. A connection is obtained from
 * self::connect() for this call and closed again before returning
 * (presumably a mysqli connection — confirm against connect()).
 *
 * @param Page   $page    Page whose date, title, URL and HTML are persisted.
 * @param string $rootUrl Root URL of the site the page belongs to.
 *
 * @throws \RuntimeException When the INSERT statement cannot be prepared.
 */
public static function storePage(Page $page, $rootUrl)
{
    $mysqli = self::connect();

    try {
        $query = "INSERT INTO news_retriever_page (date, title, siteroot, pageurl, content) VALUES(?,?,?,?,?)";
        $statement = $mysqli->prepare($query);
        if ($statement === false) {
            // Fail with a descriptive message instead of a member call on `false`.
            throw new \RuntimeException("Could not prepare page insert: " . $mysqli->error);
        }

        try {
            // bind_param() takes its value arguments by reference, so every
            // bound value has to live in a variable; a method call result
            // cannot be passed directly.
            $date = $page->getDate()->format('Y-m-d');
            $title = $page->getTitle();
            $pageUrl = $page->getPageUrl();
            $content = $page->getPageAsHtml();

            $statement->bind_param('sssss', $date, $title, $rootUrl, $pageUrl, $content);
            $statement->execute();
        } finally {
            $statement->close();
        }
    } finally {
        // Release the connection even when preparing or executing throws.
        $mysqli->close();
    }
}
/**
 * Crawls pages from the URL frontier until the session time budget is used
 * up or the frontier is empty.
 *
 * Each iteration:
 *  1. Stops if `startDate + timeToLive * 60` seconds has passed.
 *  2. Takes URLs from the front of the frontier until it finds one that is
 *     non-empty and not in the ignore list.
 *  3. Builds a Page for that URL and records it as crawled.
 *  4. Collects the page's links that are valid URLs and not yet known
 *     (frontier, crawled, ignore list, or earlier on the same page).
 *  5. If the page date lies within [dateLimitLow, dateLimitHigh], the new
 *     links go to the FRONT of the frontier and the page is stored (except
 *     on the root step). Otherwise the links go to the BACK of the frontier
 *     and the page URL is added to the ignore list.
 *  6. Sleeps so that at least WAIT_BETWEEN_CRAWLS seconds separate the
 *     starts of two consecutive fetches.
 *
 * This is a loop rather than one recursive call per page, so stack depth
 * does not grow with the number of pages crawled.
 *
 * @param bool $isRootStep When true, the first page fetched by this call is
 *                         used for link discovery only and is not stored.
 *                         Every later page is handled as a non-root step.
 */
function crawlStep($isRootStep = false)
{
    while (true) {
        $now = new DateTime();
        $timeLeft = $this->startDate->getTimestamp() + $this->timeToLive * 60 - $now->getTimestamp();
        echo "{$timeLeft} seconds left in this crawl session\n";
        if ($timeLeft <= 0) {
            return;
        }

        echo "Frontier size: " . count($this->urlFrontier) . "\n";

        // Take the next usable URL; an exhausted frontier ends the crawl.
        $url = "";
        while ($url === "" || in_array($url, $this->ignore, true)) {
            if (count($this->urlFrontier) < 1) {
                return;
            }
            $url = array_shift($this->urlFrontier);
        }

        echo "retrieving page at {$url}\n";
        $this->lastCrawlTime = new DateTime();
        $page = new Page($url, $this);

        $pageUrl = $page->getPageUrl();
        $this->crawled[] = $pageUrl;
        $this->crawledUrlEnds[] = Util::getUrlEnd($pageUrl);

        // Links worth visiting: not known anywhere yet, not already picked
        // up from this same page, and syntactically valid URLs.
        $pushToFrontier = [];
        foreach ($page->getLinks() as $link) {
            $alreadyKnown = in_array($link, $this->urlFrontier, true)
                || in_array($link, $this->crawled, true)
                || in_array($link, $this->ignore, true)
                || in_array($link, $pushToFrontier, true);
            if ($alreadyKnown || !filter_var($link, FILTER_VALIDATE_URL)) {
                continue;
            }
            $pushToFrontier[] = $link;
        }

        $pageDate = $page->getDate();
        if ($pageDate >= $this->dateLimitLow && $pageDate <= $this->dateLimitHigh) {
            // In-range pages are followed first: their links jump the queue.
            foreach ($pushToFrontier as $value) {
                array_unshift($this->urlFrontier, $value);
            }
            if (!$isRootStep) {
                echo "storing {$url}\n";
                Database::storePage($page, $this->rootUrl);
            }
        } else {
            // Out-of-range pages are kept out of future steps, and their
            // links are queued behind everything else.
            foreach ($pushToFrontier as $value) {
                $this->urlFrontier[] = $value;
            }
            $this->ignore[] = $pageUrl;
        }

        // Throttle: wait out the rest of the minimum interval, measured from
        // the moment the fetch above was started.
        $now = new DateTime();
        $nowStamp = $now->getTimestamp();
        $lastStamp = $this->lastCrawlTime->getTimestamp();
        if ($lastStamp + self::$WAIT_BETWEEN_CRAWLS > $nowStamp) {
            $seconds = $lastStamp + self::$WAIT_BETWEEN_CRAWLS - $nowStamp;
            echo "wait {$seconds} seconds\n";
            sleep($seconds);
        }

        echo "next crawlstep\n";
        // Only the first page of a call can be the root step.
        $isRootStep = false;
    }
}
/**
 * Checks that a Page hands back, through its getters, the values it was
 * constructed with: ID, title, content, visibility, date and position.
 *
 * The second and seventh constructor arguments are passed but not read back
 * by any assertion here.
 */
public function testPage()
{
    // Fixture values, named so each assertion reads against its input.
    $expectedId = 123;
    $expectedTitle = 'A page';
    $expectedContent = 'Some content';
    $expectedVisibility = 2;
    $expectedDate = new \DateTime();
    $expectedPosition = 25;

    $subject = new Page(
        $expectedId,
        1,
        $expectedTitle,
        $expectedContent,
        $expectedVisibility,
        $expectedDate,
        0,
        $expectedPosition
    );

    $this->assertEquals($expectedId, $subject->getID());
    $this->assertEquals($expectedTitle, $subject->getTitle());
    $this->assertEquals($expectedContent, $subject->getContent());
    $this->assertEquals($expectedVisibility, $subject->getVisibility());
    $this->assertEquals($expectedDate, $subject->getDate());
    $this->assertEquals($expectedPosition, $subject->getPosition());
}