/**
 * Constructs the view: converts the provided spider status into an array and
 * passes it to the parent constructor.
 *
 * Without a status, the array only contains the 'empty' flag set to true.
 *
 * @param SpiderStatus $status Status to expose, null when there is none
 */
public function __construct(SpiderStatus $status = null) {
    if (!$status) {
        parent::__construct(array('empty' => true));

        return;
    }

    $payload = array();
    $payload['empty'] = false;
    $payload['current'] = $status->getCurrent();
    $payload['visited'] = $status->getVisited();
    $payload['gathered'] = $status->getGathered();
    $payload['elapsed'] = $status->getElapsedTime();
    $payload['finished'] = $status->isFinished();

    parent::__construct($payload);
}
/**
 * Action which sets a StatusView on the response for the status file of the provided id.
 *
 * The status file is looked up in self::PATH_DATA under the name
 * $id . self::SUFFIX_STATUS. When it exists, a SpiderStatus is read from it;
 * otherwise the view is created without a status.
 *
 * NOTE(review): $id is used in the file name as-is; presumably it is validated
 * by the caller/router - confirm.
 *
 * @param string $id Identifier used to build the name of the status file
 * @return null
 */
public function statusAction($id) {
    $statusFile = new File(self::PATH_DATA, $id . self::SUFFIX_STATUS);

    $spiderStatus = null;
    if ($statusFile->exists()) {
        $spiderStatus = new SpiderStatus();
        $spiderStatus->read($statusFile);
    }

    $this->response->setView(new StatusView($spiderStatus));
}
/**
 * Checks that getElapsedTime() returns the expected value for a status
 * created with the provided start and stop values.
 *
 * @dataProvider providerGetElapsedTime
 */
public function testGetElapsedTime($expected, $start, $stop) {
    $spiderStatus = new SpiderStatus(null, 0, 0, $start, $stop);

    $this->assertEquals($expected, $spiderStatus->getElapsedTime());
}
/**
 * Starts the crawling
 *
 * Walks over the prey nodes of the web one by one. Each node is crawled, its
 * response is stored on the node, redirects are linked to their target node
 * and the node is passed to bite(). Errors while crawling a node are stored
 * on that node and do not stop the loop.
 *
 * @param integer $delay Delay between each page in milliseconds
 * @param zibo\library\filesystem\File $statusFile File where the status of the crawling process is written
 * @param zibo\library\filesystem\File $cancelFile File which will cancel/stop the crawling process when exists
 * @return null
 */
public function crawl($delay = 1000, File $statusFile = null, File $cancelFile = null) {
    $prey = $this->web->resetPrey();
    $start = time();
    $index = 0;
    $isCancelled = false;

    while ($prey) {
        // the cancel file is a one-shot signal: remove it and stop crawling
        if ($cancelFile && $cancelFile->exists()) {
            $cancelFile->delete();
            $isCancelled = true;

            break;
        }

        // usleep works in microseconds, the delay is provided in milliseconds
        usleep($delay * 1000);

        $index++;
        $url = $prey->getUrl();

        if ($this->shouldIgnore($url)) {
            $prey->addType(WebNode::TYPE_IGNORED);
            $prey = $this->web->getNextPrey();

            continue;
        }

        if ($statusFile) {
            $status = new SpiderStatus($url, $index, $this->web->countNodes(), $start);
            $status->write($statusFile);
        }

        if (String::startsWith($url, 'mailto:')) {
            $prey->addType(WebNode::TYPE_MAILTO);
            $prey = $this->web->getNextPrey();

            continue;
        }

        // reset for every node: when the Crawl constructor throws, the catch
        // block must not see the crawl (and response) of a previous node
        $crawl = null;

        try {
            $crawl = new Crawl($url);
            $crawl->performCrawl();

            $response = $crawl->getResponse();
            $prey->setResponse($response);

            if ($response->isRedirect()) {
                $location = $response->getHeader('Location');

                if (!String::looksLikeUrl($location)) {
                    // relative redirect: resolve it against the base URL when it
                    // starts with a slash, against the base path otherwise.
                    // substr is used instead of $location[0] so a missing or
                    // empty Location header does not trigger an offset error
                    if (substr((string) $location, 0, 1) === '/') {
                        $base = $crawl->getBaseUrl();
                    } else {
                        $base = $crawl->getBasePath();
                    }

                    $location = rtrim($base, '/') . '/' . ltrim($location, '/');
                }

                if ($url == $location) {
                    throw new Exception('Redirect loop');
                }

                $locationNode = $this->web->getNode($location);
                $locationNode->addReference($prey);
                $prey->addLink($locationNode);
            }

            if (!String::startsWith($url, $this->baseUrl)) {
                $prey->addType(WebNode::TYPE_EXTERNAL);

                if (!$this->willBiteExternalNodes) {
                    $prey = $this->web->getNextPrey();

                    continue;
                }
            }

            $this->bite($prey, $crawl->getBaseUrl(), $crawl->getBasePath());
        } catch (Exception $exception) {
            // keep whatever response the crawl of this node obtained before failing
            if ($crawl) {
                $response = $crawl->getResponse();
                if ($response) {
                    $prey->setResponse($response);
                }
            }

            $prey->setError($exception->getMessage());
        }

        $prey = $this->web->getNextPrey();
    }

    if (!$isCancelled) {
        if ($statusFile) {
            // "reports" is written as current item while the reports receive the web
            $status = new SpiderStatus("reports", $index, $this->web->countNodes(), $start);
            $status->write($statusFile);
        }

        foreach ($this->reports as $report) {
            $report->setWeb($this->web);
        }
    }

    if ($statusFile) {
        // final status: no current item and a stop time
        $status = new SpiderStatus(null, $index, $this->web->countNodes(), $start, time());
        $status->write($statusFile);
    }
}