Example #1
0
 /**
  * Performs a crawl of the provided URL and compares the outcome with the
  * expectations of the data set.
  *
  * The base URL is always checked. When a response code is expected, the
  * response has to exist and match the code, the redirect flag and the
  * presence of content; when no response code is expected, there should be
  * no response at all. The $error argument is part of the data set but is not
  * used by this test.
  *
  * @dataProvider providerCrawl
  */
 public function testCrawl($error, $responseCode, $redirect, $hasContent, $baseUrl, $url)
 {
     $crawler = new Crawl($url);
     $crawler->performCrawl();

     $this->assertEquals($baseUrl, $crawler->getBaseUrl());

     $result = $crawler->getResponse();

     // no response code expected: the crawl should not have a response
     if (!$responseCode) {
         $this->assertNull($result);

         return;
     }

     $this->assertNotNull($result);
     $this->assertEquals($responseCode, $result->getResponseCode());
     $this->assertEquals($redirect, $result->isRedirect());

     $content = $result->getContent();
     if ($hasContent) {
         $this->assertNotNull($content);

         return;
     }

     $this->assertNull($content);
 }
Example #2
0
 /**
  * Starts the crawling
  *
  * Walks over the nodes of the web, starting with the node provided by
  * resetPrey(), until there is no next node or the cancel file shows up.
  * Ignored URLs and mailto: links are only tagged with their type, all other
  * URLs are requested. A redirect is registered as a link to the node of its
  * target and URLs which do not start with the base URL are tagged as
  * external. An exception while handling a node is stored as error on that
  * node, it does not stop the process.
  *
  * @param integer $delay Delay before each page request in milliseconds
  * @param zibo\library\filesystem\File $statusFile File where the status of the crawling process is written
  * @param zibo\library\filesystem\File $cancelFile File which will cancel/stop the crawling process when exists
  * @return null
  */
 public function crawl($delay = 1000, File $statusFile = null, File $cancelFile = null)
 {
     $prey = $this->web->resetPrey();
     $start = time();
     $index = 0;
     $isCancelled = false;

     while ($prey) {
         if ($cancelFile && $cancelFile->exists()) {
             // cancel requested: remove the file and stop the process
             $cancelFile->delete();
             $isCancelled = true;
             break;
         }

         $index++;
         $url = $prey->getUrl();

         if ($this->shouldIgnore($url)) {
             $prey->addType(WebNode::TYPE_IGNORED);
             $prey = $this->web->getNextPrey();
             continue;
         }

         if ($statusFile) {
             $status = new SpiderStatus($url, $index, $this->web->countNodes(), $start);
             $status->write($statusFile);
         }

         if (String::startsWith($url, 'mailto:')) {
             $prey->addType(WebNode::TYPE_MAILTO);
             $prey = $this->web->getNextPrey();
             continue;
         }

         // only wait when a crawl is actually performed; no Crawl is created
         // for ignored and mailto nodes, so delaying them is wasted time
         if ($delay > 0) {
             usleep($delay * 1000);
         }

         // reset the crawl, otherwise the catch block could pick up the crawl
         // of a previous node when the constructor below throws
         $crawl = null;

         try {
             $crawl = new Crawl($url);
             $crawl->performCrawl();

             $response = $crawl->getResponse();
             if (!$response) {
                 // a crawl can end without a response; record it as an error
                 // on the node instead of calling methods on null
                 throw new Exception('No response received');
             }

             $prey->setResponse($response);

             if ($response->isRedirect()) {
                 $location = $response->getHeader('Location');

                 if (!String::looksLikeUrl($location)) {
                     // relative redirect: resolve it against the base URL when
                     // it starts with a slash, against the base path otherwise;
                     // isset() guards against an empty or missing header
                     if (isset($location[0]) && $location[0] == '/') {
                         $base = $crawl->getBaseUrl();
                     } else {
                         $base = $crawl->getBasePath();
                     }

                     $location = rtrim($base, '/') . '/' . ltrim($location, '/');
                 }

                 if ($url == $location) {
                     throw new Exception('Redirect loop');
                 }

                 $locationNode = $this->web->getNode($location);
                 $locationNode->addReference($prey);
                 $prey->addLink($locationNode);
             }

             if (!String::startsWith($url, $this->baseUrl)) {
                 $prey->addType(WebNode::TYPE_EXTERNAL);

                 if (!$this->willBiteExternalNodes) {
                     $prey = $this->web->getNextPrey();
                     continue;
                 }
             }

             $this->bite($prey, $crawl->getBaseUrl(), $crawl->getBasePath());
         } catch (Exception $exception) {
             // keep the response of the failed crawl, if there is one
             if ($crawl) {
                 $response = $crawl->getResponse();
                 if ($response) {
                     $prey->setResponse($response);
                 }
             }

             $prey->setError($exception->getMessage());
         }

         $prey = $this->web->getNextPrey();
     }

     if (!$isCancelled) {
         if ($statusFile) {
             $status = new SpiderStatus("reports", $index, $this->web->countNodes(), $start);
             $status->write($statusFile);
         }

         foreach ($this->reports as $report) {
             $report->setWeb($this->web);
         }
     }

     if ($statusFile) {
         // final status: no current URL and a stop time
         $status = new SpiderStatus(null, $index, $this->web->countNodes(), $start, time());
         $status->write($statusFile);
     }
 }