/**
 * Performs a crawl of the provided URL and checks the resulting base URL
 * and response against the expectations of the data set.
 *
 * @dataProvider providerCrawl
 * @param mixed $error Value supplied by the data provider; not used by this test
 * @param mixed $responseCode Expected response code; a falsy value means no response is expected
 * @param mixed $redirect Expected result of isRedirect() on the response
 * @param mixed $hasContent Truthy when the response is expected to have content
 * @param mixed $baseUrl Expected base URL of the crawl
 * @param mixed $url URL to crawl
 * @return null
 */
public function testCrawl($error, $responseCode, $redirect, $hasContent, $baseUrl, $url) {
    $crawler = new Crawl($url);
    $crawler->performCrawl();

    $this->assertEquals($baseUrl, $crawler->getBaseUrl());

    $result = $crawler->getResponse();

    // Without an expected response code, the crawl should not have a response at all
    if (!$responseCode) {
        $this->assertNull($result);

        return;
    }

    $this->assertNotNull($result);
    $this->assertEquals($responseCode, $result->getResponseCode());
    $this->assertEquals($redirect, $result->isRedirect());

    $content = $result->getContent();
    if ($hasContent) {
        $this->assertNotNull($content);
    } else {
        $this->assertNull($content);
    }
}
/**
 * Starts the crawling
 *
 * Walks over the prey nodes of the web, starting from resetPrey(), until
 * there is no next prey or the cancel file shows up. Ignored and mailto
 * nodes are only flagged with their type; all other nodes are crawled,
 * their redirects are linked to the target node and the node is passed to
 * bite(). An exception while processing a node is stored as the error of
 * that node and the crawling continues with the next node.
 *
 * When the crawling is not cancelled, the web is handed to every report.
 * When a status file is provided, the progress is written to it before each
 * crawled node, before the reports and once more when everything is done.
 *
 * @param integer $delay Delay between each page in milliseconds
 * @param zibo\library\filesystem\File $statusFile File where the status of the crawling process is written
 * @param zibo\library\filesystem\File $cancelFile File which will cancel/stop the crawling process when it exists
 * @return null
 */
public function crawl($delay = 1000, File $statusFile = null, File $cancelFile = null) {
    $prey = $this->web->resetPrey();
    $start = time();
    $index = 0;
    $isCancelled = false;

    while ($prey) {
        // the cancel file is consumed so a next run is not cancelled right away
        if ($cancelFile && $cancelFile->exists()) {
            $cancelFile->delete();
            $isCancelled = true;

            break;
        }

        // usleep works in microseconds, the delay is provided in milliseconds
        usleep($delay * 1000);

        $index++;
        $url = $prey->getUrl();

        if ($this->shouldIgnore($url)) {
            $prey->addType(WebNode::TYPE_IGNORED);
            $prey = $this->web->getNextPrey();

            continue;
        }

        if ($statusFile) {
            $status = new SpiderStatus($url, $index, $this->web->countNodes(), $start);
            $status->write($statusFile);
        }

        if (String::startsWith($url, 'mailto:')) {
            $prey->addType(WebNode::TYPE_MAILTO);
            $prey = $this->web->getNextPrey();

            continue;
        }

        // Reset for every node: when the Crawl constructor throws, the catch
        // block would otherwise see the crawl of a previous node (or an
        // undefined variable) and attach a foreign response to this node.
        $crawl = null;

        try {
            $crawl = new Crawl($url);
            $crawl->performCrawl();

            $response = $crawl->getResponse();
            if (!$response) {
                // A crawl can finish without a response. Calling methods on
                // it would be a fatal error which the catch below does not
                // handle, so report it as a regular error of this node.
                throw new Exception('No response received');
            }

            $prey->setResponse($response);

            if ($response->isRedirect()) {
                $location = $response->getHeader('Location');

                if (!String::looksLikeUrl($location)) {
                    // relative redirect: absolute paths are resolved against
                    // the base URL, other paths against the base path.
                    // strncmp avoids an invalid offset on an empty location.
                    if (strncmp((string) $location, '/', 1) === 0) {
                        $base = $crawl->getBaseUrl();
                    } else {
                        $base = $crawl->getBasePath();
                    }

                    $location = rtrim($base, '/') . '/' . ltrim($location, '/');
                }

                if ($url == $location) {
                    throw new Exception('Redirect loop');
                }

                $locationNode = $this->web->getNode($location);
                $locationNode->addReference($prey);

                $prey->addLink($locationNode);
            }

            if (!String::startsWith($url, $this->baseUrl)) {
                $prey->addType(WebNode::TYPE_EXTERNAL);

                if (!$this->willBiteExternalNodes) {
                    $prey = $this->web->getNextPrey();

                    continue;
                }
            }

            $this->bite($prey, $crawl->getBaseUrl(), $crawl->getBasePath());
        } catch (Exception $exception) {
            // keep whatever response this node's own crawl obtained before failing
            if ($crawl) {
                $response = $crawl->getResponse();
                if ($response) {
                    $prey->setResponse($response);
                }
            }

            $prey->setError($exception->getMessage());
        }

        $prey = $this->web->getNextPrey();
    }

    if (!$isCancelled) {
        if ($statusFile) {
            $status = new SpiderStatus("reports", $index, $this->web->countNodes(), $start);
            $status->write($statusFile);
        }

        foreach ($this->reports as $report) {
            $report->setWeb($this->web);
        }
    }

    // final status, written with the current time as extra argument
    if ($statusFile) {
        $status = new SpiderStatus(null, $index, $this->web->countNodes(), $start, time());
        $status->write($statusFile);
    }
}