Example #1
0
 /**
  * Inserts one crawled page into the news_retriever_page table.
  *
  * Writes the page date (formatted as Y-m-d), title, page URL and HTML
  * content, together with the given root URL as the siteroot column.
  * The connection obtained from self::connect() is always closed before
  * this method returns or propagates an error.
  *
  * @param Page   $page    Page whose date, title, URL and HTML are stored.
  * @param string $rootUrl Value written to the siteroot column.
  * @return void
  * @throws \RuntimeException If the INSERT statement cannot be prepared.
  */
 public static function storePage(Page $page, $rootUrl)
 {
     $mysqli = self::connect();
     try {
         $query = "INSERT INTO news_retriever_page (date, title, siteroot, pageurl, content) VALUES(?,?,?,?,?)";
         $statement = $mysqli->prepare($query);
         if ($statement === false) {
             // prepare() returns false on failure; calling bind_param() on it would be a fatal error.
             throw new \RuntimeException('Could not prepare page insert: ' . $mysqli->error);
         }
         try {
             // bind_param() takes its arguments by reference, so every value
             // must live in a real variable (not a method-call result).
             $date = $page->getDate()->format('Y-m-d');
             $title = $page->getTitle();
             $pageUrl = $page->getPageUrl();
             $content = $page->getPageAsHtml();
             $statement->bind_param('sssss', $date, $title, $rootUrl, $pageUrl, $content);
             if (!$statement->execute()) {
                 // Report the failure without aborting the caller, which
                 // previously never saw execute() errors at all.
                 error_log('Could not store page ' . $pageUrl . ': ' . $statement->error);
             }
         } finally {
             $statement->close();
         }
     } finally {
         $mysqli->close();
     }
 }
Example #2
0
 /**
  * Crawls pages from the URL frontier until the session time runs out or
  * the frontier is empty.
  *
  * Each iteration takes the first non-empty, non-ignored URL off the
  * frontier, loads it as a Page, records it as crawled and collects its
  * links that are valid URLs and not already in the frontier, crawled or
  * ignore lists. If the page date lies within [dateLimitLow, dateLimitHigh]
  * the new links are put at the front of the frontier and the page is
  * stored (unless this is the root step); otherwise the links are appended
  * to the back of the frontier and the page URL is added to the ignore list.
  * Between iterations the method sleeps so that at least
  * self::$WAIT_BETWEEN_CRAWLS seconds separate two page retrievals.
  *
  * Implemented as a loop rather than by calling itself, so pages that were
  * already processed are not kept alive on an ever-growing call stack.
  *
  * @param bool $isRootStep When true, the first page processed is not
  *                         written to the database. Later pages always
  *                         behave as non-root steps.
  * @return void
  */
 public function crawlStep($isRootStep = false)
 {
     while (true) {
         $now = new DateTime();
         $timeLeft = $this->startDate->getTimestamp() + $this->timeToLive * 60 - $now->getTimestamp();
         echo "{$timeLeft} seconds left in this crawl session\n";
         if ($timeLeft <= 0) {
             return;
         }
         echo "Frontier size: " . count($this->urlFrontier) . "\n";

         // Take URLs off the front of the frontier until one is usable;
         // stop crawling altogether when the frontier runs dry.
         do {
             if (count($this->urlFrontier) < 1) {
                 return;
             }
             $url = array_shift($this->urlFrontier);
         } while ($url === "" || in_array($url, $this->ignore, true));

         echo "retrieving page at {$url}\n";
         $this->lastCrawlTime = new DateTime();
         $page = new Page($url, $this);
         $pageUrl = $page->getPageUrl();
         array_push($this->crawled, $pageUrl);
         array_push($this->crawledUrlEnds, Util::getUrlEnd($pageUrl));

         $pushToFrontier = array();
         foreach ($page->getLinks() as $link) {
             $alreadyKnown = in_array($link, $this->urlFrontier, true)
                 || in_array($link, $this->crawled, true)
                 || in_array($link, $this->ignore, true)
                 // A page may contain the same link several times; queue it once.
                 || in_array($link, $pushToFrontier, true);
             if ($alreadyKnown || !filter_var($link, FILTER_VALIDATE_URL)) {
                 continue;
             }
             array_push($pushToFrontier, $link);
         }

         $pageDate = $page->getDate();
         if ($pageDate >= $this->dateLimitLow && $pageDate <= $this->dateLimitHigh) {
             // Page is inside the date window: its links go to the front of the frontier.
             foreach ($pushToFrontier as $value) {
                 array_unshift($this->urlFrontier, $value);
             }
             if (!$isRootStep) {
                 echo "storing {$url}\n";
                 Database::storePage($page, $this->rootUrl);
             }
         } else {
             // Page is outside the date window: its links go to the back of
             // the frontier and the page itself is put on the ignore list.
             foreach ($pushToFrontier as $value) {
                 array_push($this->urlFrontier, $value);
             }
             array_push($this->ignore, $pageUrl);
         }

         // Throttle: keep at least WAIT_BETWEEN_CRAWLS seconds between retrievals.
         $now = new DateTime();
         $nowStamp = $now->getTimestamp();
         $lastStamp = $this->lastCrawlTime->getTimestamp();
         if ($lastStamp + self::$WAIT_BETWEEN_CRAWLS > $nowStamp) {
             $seconds = $lastStamp + self::$WAIT_BETWEEN_CRAWLS - $nowStamp;
             echo "wait {$seconds} seconds\n";
             sleep($seconds);
         }
         echo "next crawlstep\n";

         // Only the very first iteration can be the root step.
         $isRootStep = false;
     }
 }
Example #3
0
 /**
  * Gets the URL used for the page feature.
  *
  * Delegates to $this->Page->getPageUrl() and returns its result unchanged.
  *
  * @param array $page Page data handed straight to getPageUrl().
  * @return string
  */
 function url($page)
 {
     $pageUrl = $this->Page->getPageUrl($page);

     return $pageUrl;
 }
Example #4
0
<?php

// This file is only used for testing.
// It loads the page at the "rootUrl" query parameter plus up to four of the
// pages it links to, and prints the URL and title of each one.
include 'Main.php';
$main = new Main();
echo "This file is only used for testing<br/>";
echo '<pre>';
$rootUrl = filter_input(INPUT_GET, 'rootUrl');
if (!$rootUrl) {
    // Close the block opened above so the early exit still emits balanced markup.
    echo '</pre>';
    return;
}
$crawler = new Crawler($rootUrl, 15);
$page = new Page($rootUrl, $crawler);

// Load the root page's first four links; fewer are used when the page has
// fewer, instead of reading undefined array offsets.
$pages = array($page);
foreach (array_slice($page->getLinks(), 0, 4) as $link) {
    $pages[] = new Page($link, $crawler);
}

foreach ($pages as $current) {
    // URLs and titles come from crawled content, so escape them for HTML output.
    $shownUrl = htmlspecialchars((string) $current->getPageUrl(), ENT_QUOTES, 'UTF-8');
    $shownTitle = htmlspecialchars((string) $current->getTitle(), ENT_QUOTES, 'UTF-8');
    echo "url: " . $shownUrl . "| title: " . $shownTitle . "\n";
}
echo '</pre>';
Example #5
0
 /**
  * Checks the URL that getPageUrl() produces for a page.
  *
  * Builds page data from the given name and category ID, passes it to
  * $this->Page->getPageUrl() and compares the result with the expected value.
  * 
  * @param array $name Page name
  * @param array $categoryId Page category ID
  * @param array $expected Expected value
  * @param string $message Message shown when the test fails
  * @dataProvider getPageUrlDataProvider
  */
 public function testGetPageUrl($name, $categoryId, $expected, $message = null)
 {
     $pageFields = array(
         'name' => $name,
         'page_category_id' => $categoryId,
     );
     $actual = $this->Page->getPageUrl(array('Page' => $pageFields));

     $this->assertEquals($expected, $actual, $message);
 }