/**
 * Insert a crawled page into the news_retriever_page table.
 *
 * Writes the page date (formatted as Y-m-d), title, the given site root,
 * the page URL and the page HTML through a prepared statement, then closes
 * the statement and the connection obtained from self::connect().
 *
 * @param Page   $page    Page to store; read through getDate(), getTitle(),
 *                        getPageUrl() and getPageAsHtml().
 * @param string $rootUrl Value stored in the siteroot column.
 *
 * @return void
 *
 * @throws \RuntimeException If the INSERT statement cannot be prepared.
 */
public static function storePage(Page $page, $rootUrl)
{
    $mysqli = self::connect();

    $query = "INSERT INTO news_retriever_page (date, title, siteroot, pageurl, content) VALUES(?,?,?,?,?)";
    $statement = $mysqli->prepare($query);
    if ($statement === false) {
        // Read the error before closing the connection, then fail with a clear message
        // instead of a "call to a member function on bool" error further down.
        $error = $mysqli->error;
        $mysqli->close();
        throw new \RuntimeException('Could not prepare page insert: ' . $error);
    }

    // bind_param() takes its arguments by reference, so every bound value must
    // live in a variable; a method call result cannot be passed directly.
    $date = $page->getDate()->format('Y-m-d');
    $title = $page->getTitle();
    $pageUrl = $page->getPageUrl();
    $content = $page->getPageAsHtml();
    $statement->bind_param('sssss', $date, $title, $rootUrl, $pageUrl, $content);

    if (!$statement->execute()) {
        // Report the failure without aborting the caller: one page that cannot
        // be stored should not take the whole crawl down.
        trigger_error('Failed to store page ' . $pageUrl . ': ' . $statement->error, E_USER_WARNING);
    }

    $statement->close();
    $mysqli->close();
}
/**
 * Run the crawl loop until the session time is used up or the frontier is empty.
 *
 * Each iteration takes the next non-ignored URL from the front of the frontier,
 * loads it as a Page, records it as crawled and collects its links that are valid
 * URLs and not yet in the frontier, the crawled list or the ignore list.
 * If the page date lies within [dateLimitLow, dateLimitHigh] the new links are
 * put at the front of the frontier and the page is stored; otherwise the links
 * are appended to the back and the page URL is added to the ignore list.
 * Between two page loads at least self::$WAIT_BETWEEN_CRAWLS seconds pass.
 *
 * @param bool $isRootStep When true, the first page processed by this call is
 *                         not stored. Every later page is stored normally.
 *
 * @return void
 */
function crawlStep($isRootStep = false)
{
    // Iterate instead of recursing once per page: a long crawl session would
    // otherwise keep one stack frame alive for every page it has visited.
    while (true) {
        $now = new DateTime();
        $timeLeft = $this->startDate->getTimestamp() + $this->timeToLive * 60 - $now->getTimestamp();
        echo "{$timeLeft} seconds left in this crawl session\n";
        if ($timeLeft <= 0) {
            return;
        }

        echo "Frontier size: " . count($this->urlFrontier) . "\n";

        // Pull URLs off the front of the frontier until one is found that is
        // not on the ignore list; stop the crawl when the frontier runs dry.
        $url = "";
        while ($url === "" || in_array($url, $this->ignore, true)) {
            if (count($this->urlFrontier) < 1) {
                return;
            }
            $url = array_shift($this->urlFrontier);
        }

        echo "retrieving page at {$url}\n";
        $this->lastCrawlTime = new DateTime();
        $page = new Page($url, $this);
        $pageUrl = $page->getPageUrl();

        array_push($this->crawled, $pageUrl);
        array_push($this->crawledUrlEnds, Util::getUrlEnd($pageUrl));

        // Collect links that are new to the crawler and syntactically valid.
        $newLinks = array();
        foreach ($page->getLinks() as $link) {
            $alreadyKnown = in_array($link, $this->urlFrontier, true)
                || in_array($link, $this->crawled, true)
                || in_array($link, $this->ignore, true)
                // A link repeated on the same page must only be queued once,
                // otherwise it would be crawled and stored twice.
                || in_array($link, $newLinks, true);
            if ($alreadyKnown || !filter_var($link, FILTER_VALIDATE_URL)) {
                continue;
            }
            $newLinks[] = $link;
        }

        $pageDate = $page->getDate();
        if ($pageDate >= $this->dateLimitLow && $pageDate <= $this->dateLimitHigh) {
            // Page is inside the date window: its links go to the front of the frontier.
            foreach ($newLinks as $link) {
                array_unshift($this->urlFrontier, $link);
            }
            if (!$isRootStep) {
                echo "storing {$url}\n";
                Database::storePage($page, $this->rootUrl);
            }
        } else {
            // Page is outside the date window: its links go to the back and
            // the page itself is put on the ignore list.
            foreach ($newLinks as $link) {
                array_push($this->urlFrontier, $link);
            }
            array_push($this->ignore, $pageUrl);
        }

        // Throttle: wait out the remainder of the minimum delay since the last page load.
        $now = new DateTime();
        $waitSeconds = $this->lastCrawlTime->getTimestamp() + self::$WAIT_BETWEEN_CRAWLS - $now->getTimestamp();
        if ($waitSeconds > 0) {
            echo "wait {$waitSeconds} seconds\n";
            sleep($waitSeconds);
        }

        echo "next crawlstep\n";

        // Only the first iteration can be the root step.
        $isRootStep = false;
    }
}
/**
 * Get the URL for the page feature.
 *
 * Delegates to $this->Page->getPageUrl() and returns its result unchanged.
 *
 * @param array $page Page data passed through to getPageUrl()
 * @return string
 */
function url($page)
{
    $pageUrl = $this->Page->getPageUrl($page);

    return $pageUrl;
}
<?php
// This file is only used for testing.
//
// Loads the page given by the "rootUrl" query parameter plus up to four of the
// links found on it, and prints the URL and title of each loaded page.
include 'Main.php';

$main = new Main();

echo "This file is only used for testing<br/>";
echo '<pre>';

$rootUrl = filter_input(INPUT_GET, 'rootUrl');
if (!$rootUrl) {
    // Close the tag opened above so the early exit still emits balanced markup.
    echo '</pre>';
    return;
}

$crawler = new Crawler($rootUrl, 15);
$page = new Page($rootUrl, $crawler);

// Fetch the link list once and only follow links that actually exist, so a
// page with fewer than four links no longer triggers undefined-offset errors.
$links = $page->getLinks();
$pages = array($page);
for ($i = 0; $i < 4; $i++) {
    if (isset($links[$i])) {
        $pages[] = new Page($links[$i], $crawler);
    }
}

foreach ($pages as $loadedPage) {
    // URLs and titles come from remote pages; escape them before writing HTML.
    $pageUrl = htmlspecialchars($loadedPage->getPageUrl(), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    $pageTitle = htmlspecialchars($loadedPage->getTitle(), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    echo "url: " . $pageUrl . "| title: " . $pageTitle . "\n";
}

echo '</pre>';
/**
 * Get the URL of a page.
 *
 * Builds a Page record from the provided name and category ID, passes it to
 * $this->Page->getPageUrl() and asserts that the result equals the expected value.
 *
 * @param array $name Page name
 * @param array $categoryId Page category ID
 * @param array $expected Expected value
 * @param string $message Message displayed when the test fails
 * @dataProvider getPageUrlDataProvider
 */
public function testGetPageUrl($name, $categoryId, $expected, $message = null)
{
    $pageFields = array(
        'name' => $name,
        'page_category_id' => $categoryId,
    );
    $record = array('Page' => $pageFields);

    $this->assertEquals($expected, $this->Page->getPageUrl($record), $message);
}