Beispiel #1
0
 /**
  * Constructs the view: converts the provided spider status into an array
  * and hands it to the parent constructor.
  * @param SpiderStatus $status Status to expose; when omitted (or null), the
  * array only contains the 'empty' flag set to true
  * @return null
  */
 public function __construct(SpiderStatus $status = null)
 {
     // no status available: only flag the payload as empty
     if (!$status) {
         parent::__construct(array('empty' => true));

         return;
     }

     $payload = array(
         'empty' => false,
         'current' => $status->getCurrent(),
         'visited' => $status->getVisited(),
         'gathered' => $status->getGathered(),
         'elapsed' => $status->getElapsedTime(),
         'finished' => $status->isFinished(),
     );

     parent::__construct($payload);
 }
 /**
  * Action which sets a status view to the response.
  *
  * The status is read from the file in PATH_DATA named after the provided id
  * followed by SUFFIX_STATUS. When that file does not exist, the view is
  * created without a status.
  * @param mixed $id Value which is prepended to SUFFIX_STATUS to obtain the
  * name of the status file
  * @return null
  */
 public function statusAction($id)
 {
     $status = null;

     $file = new File(self::PATH_DATA, $id . self::SUFFIX_STATUS);
     if ($file->exists()) {
         $status = new SpiderStatus();
         $status->read($file);
     }

     $this->response->setView(new StatusView($status));
 }
 /**
  * Checks that getElapsedTime() of a status, created with the provided start
  * and stop values, equals the expected value.
  * @dataProvider providerGetElapsedTime
  */
 public function testGetElapsedTime($expected, $start, $stop)
 {
     $status = new SpiderStatus(null, 0, 0, $start, $stop);

     $this->assertEquals($expected, $status->getElapsedTime());
 }
Beispiel #4
0
 /**
  * Starts the crawling
  *
  * Walks over the prey of the web, starting from the prey returned by
  * resetPrey(), until there is no next prey or until the cancel file is
  * detected. Ignored URLs and mailto: links are only typed, all other URLs
  * are crawled. Redirects are registered as a link to the location node.
  * External nodes are only bitten when willBiteExternalNodes is set. When
  * the crawl is not cancelled, the web is passed to every report. A final
  * status, which includes the stop time, is written in both cases.
  * @param integer $delay Delay between each page in milliseconds
  * @param zibo\library\filesystem\File $statusFile File where the status of the crawling process is written
  * @param zibo\library\filesystem\File $cancelFile File which will cancel/stop the crawling process when exists
  * @return null
  */
 public function crawl($delay = 1000, File $statusFile = null, File $cancelFile = null)
 {
     $prey = $this->web->resetPrey();
     $start = time();
     $index = 0;
     $isCancelled = false;

     while ($prey) {
         // the cancel file is consumed (deleted) when it is detected
         if ($cancelFile && $cancelFile->exists()) {
             $cancelFile->delete();
             $isCancelled = true;
             break;
         }

         // usleep() expects microseconds, $delay is provided in milliseconds
         usleep($delay * 1000);

         $index++;
         $url = $prey->getUrl();

         if ($this->shouldIgnore($url)) {
             $prey->addType(WebNode::TYPE_IGNORED);
             $prey = $this->web->getNextPrey();
             continue;
         }

         if ($statusFile) {
             $status = new SpiderStatus($url, $index, $this->web->countNodes(), $start);
             $status->write($statusFile);
         }

         if (String::startsWith($url, 'mailto:')) {
             $prey->addType(WebNode::TYPE_MAILTO);
             $prey = $this->web->getNextPrey();
             continue;
         }

         // reset for every prey: when the Crawl constructor throws, the catch
         // block must not see an undefined variable or the crawl (and thus
         // the response) of a previous prey
         $crawl = null;

         try {
             $crawl = new Crawl($url);
             $crawl->performCrawl();

             $response = $crawl->getResponse();
             $prey->setResponse($response);

             if ($response->isRedirect()) {
                 $location = $response->getHeader('Location');

                 if (!String::looksLikeUrl($location)) {
                     // a location starting with a slash is resolved against the
                     // base URL, any other location against the base path; the
                     // isset avoids reading an uninitialized offset when the
                     // location is empty
                     if (isset($location[0]) && $location[0] == '/') {
                         $base = $crawl->getBaseUrl();
                     } else {
                         $base = $crawl->getBasePath();
                     }

                     $location = rtrim($base, '/') . '/' . ltrim($location, '/');
                 }

                 if ($url == $location) {
                     throw new Exception('Redirect loop');
                 }

                 // register the redirect as a link from the prey to the location
                 $locationNode = $this->web->getNode($location);
                 $locationNode->addReference($prey);
                 $prey->addLink($locationNode);
             }

             if (!String::startsWith($url, $this->baseUrl)) {
                 $prey->addType(WebNode::TYPE_EXTERNAL);

                 if (!$this->willBiteExternalNodes) {
                     $prey = $this->web->getNextPrey();
                     continue;
                 }
             }

             $this->bite($prey, $crawl->getBaseUrl(), $crawl->getBasePath());
         } catch (Exception $exception) {
             // keep the response of this crawl, if there is one, next to the error
             if ($crawl) {
                 $response = $crawl->getResponse();
                 if ($response) {
                     $prey->setResponse($response);
                 }
             }

             $prey->setError($exception->getMessage());
         }

         $prey = $this->web->getNextPrey();
     }

     if (!$isCancelled) {
         // intermediate status with "reports" as current, written before the
         // web is passed to the reports
         if ($statusFile) {
             $status = new SpiderStatus("reports", $index, $this->web->countNodes(), $start);
             $status->write($statusFile);
         }

         foreach ($this->reports as $report) {
             $report->setWeb($this->web);
         }
     }

     // final status: no current URL and the stop time set, also when cancelled
     if ($statusFile) {
         $status = new SpiderStatus(null, $index, $this->web->countNodes(), $start, time());
         $status->write($statusFile);
     }
 }