Пример #1
0
 /**
  * Checks whether a document is new ("fresh") for a subscription.
  *
  * Looks up an entry for the link's id on the subscription. When one exists
  * and its difference from $body, as reported by SpiderText::diffPercentage(),
  * is below $requirementDiff, the document is considered already known.
  *
  * This method is static, so it cannot read instance state: the body to
  * compare comes from $body and the threshold from $requirementDiff. Callers
  * holding a configured 'requirement_diff' value should pass it explicitly.
  *
  * @todo implement get Raw
  *
  * @param mixed                 $body            Body of the freshly fetched document.
  * @param InterfaceLink         $link            Link whose id is looked up on the subscription.
  * @param InterfaceSubscription $subscription    Subscription queried through getLink().
  * @param int|float             $requirementDiff Minimum difference an existing entry must show
  *                                               for the document to count as fresh (default 40).
  *
  * @return bool False when an existing entry differs by less than $requirementDiff, true otherwise.
  */
 public static function isFresh($body, InterfaceLink $link, InterfaceSubscription $subscription, $requirementDiff = 40)
 {
     $existent = $subscription->getLink($link->getId());
     if (!$existent) {
         // Nothing stored for this link yet: the document is new by definition.
         return true;
     }
     if (SpiderText::diffPercentage($existent, $body) < $requirementDiff) {
         return false;
     }
     return true;
 }
Пример #2
0
 /**
  * Fetches the target's href with a crawler and stores the result in the pool.
  *
  * Steps, in order: validate the URI, skip non-subscription targets that are
  * already done, request the page, attach the document to the target when it
  * is fresh, optionally collect hyperlinks, then save the target in the pool.
  *
  * @param InterfaceLink $target    Link (or subscription) to collect.
  * @param bool          $withLinks When true, hyperlinks found on the page are collected
  *                                 and stored on the target under 'hyperlinks'.
  *
  * @return bool True when the target was saved; false when the URI is invalid, the URI
  *              was already processed, no crawler could be obtained, or the HTTP client
  *              adapter raised a runtime exception.
  *
  * @throws \Exception When the request fails while $this->requests is 0, or when a
  *                    Zend InvalidArgumentException is raised; the caught exception is
  *                    attached as "previous" in both cases.
  */
 protected function collect(InterfaceLink $target, $withLinks = false)
 {
     $URI = $target->getHref();
     // Default source type; subscriptions provide their own.
     $type = 'html';
     if ($target instanceof InterfaceSubscription) {
         $this->logger("Subscription Type: " . $target->getSourceType());
         $type = $target->getSourceType();
     }
     try {
         if (!SpiderAsserts::isDocumentHref($URI)) {
             $this->logger('URI wrong:[' . $URI . ']', 'err', 3);
             $this->pool->errLink($target, 'invalid URL');
             return false;
         }
         // Subscriptions are always fetched again; plain links only once.
         if (!$target instanceof InterfaceSubscription && $this->isDone($URI)) {
             $this->logger('URI is Done:[' . $URI . ']', 'info', 1);
             return false;
         }
         try {
             $crawler = $this->getCrawler($URI, 'GET', $type);
         } catch (\Exception $e) {
             $this->logger('Collect Exception', 'err', 3);
             $this->logger($e->getMessage(), 'err', 3);
             if ($this->requests === 0) {
                 $this->errors++;
                 // Keep the original exception as "previous" so its trace is not lost.
                 throw new \Exception('Error in the first request:' . $e->getMessage(), 0, $e);
             }
             // Otherwise the failure is tolerated here and reported by the
             // missing-crawler check just below.
         }
         if (!isset($crawler)) {
             $this->logger('Crawler broken', 'err');
             $this->pool->errLink($target, 'impossible crawler');
             return false;
         }
         if (!$target instanceof InterfaceSubscription) {
             if (DocumentManager::isFresh($this->getBody(), $target, $this->getSubscription())) {
                 // Clone the crawler so the document keeps its own copy.
                 $target->setDocument($this->getCurrentUri(), clone $crawler, $this->getSubscription(), $this->transferDependency());
                 $this->logger('document IS fresh', 'info', 5);
             } else {
                 $this->logger('document isnt fresh');
             }
         }
         // NOTE(review): status 1 presumably means "done" - confirm against the link status codes.
         $target->setStatus(1);
         if ($withLinks) {
             $this->logger('go to the scan more links!', 'info', 5);
             try {
                 $target->set('hyperlinks', $this->collectLinks($crawler, $type));
             } catch (\Exception $e) {
                 // Link collection is best-effort: log, count the error, still save the target.
                 $this->logger($e->getMessage(), 'err');
                 $this->errors++;
             }
         }
         $this->logger('saving object on cache, with id:' . $target->getId('string'), 'info', 5);
         $this->pool->save($target);
         $this->success++;
         return true;
     } catch (\Zend\Http\Exception\InvalidArgumentException $e) {
         $this->logger('Invalid argument on [' . $URI . ']', 'err');
         $this->pool->errLink($target, 'invalid argument on HTTP request');
         $this->errors++;
         // Keep the original exception as "previous" so its trace is not lost.
         throw new \Exception('Invalid argument', 0, $e);
     } catch (\Zend\Http\Client\Adapter\Exception\RuntimeException $e) {
         $this->logger('Http Client Runtime error on  [' . $URI . ']', 'err');
         $this->pool->errLink($target, 'Runtime error on Http Client Adaper');
         $this->errors++;
         return false;
     }
 }
Пример #3
0
 /**
  * Flags a link as failed, persists it and records the failure.
  *
  * Sets the link status to 3, saves the link through _save(), logs the link's
  * href together with the cause, and increments this object's error counter.
  *
  * @param InterfaceLink $link  Link to mark with an error.
  * @param string        $cause Reason written to the log; defaults to 'undefined'.
  *
  * @return void
  */
 public function errLink(InterfaceLink $link, $cause = 'undefined')
 {
     $link->setStatus(3);
     $this->_save($link);

     $message = $link->get('href') . "\n marked with error.Cause: " . $cause . "\n";
     $this->logger($message);

     $this->errors++;
 }
Пример #4
0
 /**
  * Applies the isDocumentHref() check to a link's href.
  *
  * @param InterfaceLink $link Link whose href is checked.
  *
  * @return mixed Whatever self::isDocumentHref() returns for the link's href.
  */
 public static function isDocumentLink(InterfaceLink $link)
 {
     $href = $link->getHref();

     return self::isDocumentHref($href);
 }