コード例 #1
0
ファイル: Document.php プロジェクト: gpupo/camelspider
 /**
  * Queries the document according to the parameters defined in the
  * subscription and sets its relevancy.
  *
  * The relevancy is incremented once for each condition that holds:
  *  1) The document possibly has content ($this->bigger is truthy).
  *  2) It contains one or more keywords wanted by the subscription,
  *     or the subscription has no 'contain' filter.
  *  3) It does not contain unwanted keywords, or the subscription has
  *     no 'notContain' filter.
  *
  * @return false|null false when the content is too short to be scored;
  *                    no explicit value otherwise.
  **/
 protected function setRelevancy()
 {
     if (!$this->bigger) {
         $this->logger('Content too short', 'info', 5);
         return false;
     }

     // +1 because text exists.
     $this->addRelevancy();

     $txt = $this->getTitle() . "\n" . $this->getText();
     $this->logger("Text to be verified:\n" . $txt . "\n", 'info', 5);

     // Desirable keywords filter.
     $wanted = $this->subscription->getFilter('contain');
     if (is_null($wanted)) {
         // No filter configured: the condition counts as satisfied.
         $this->addRelevancy();
         $this->logger('ignore keywords filter', 'info', 5);
     } else {
         // Normalize once so a scalar filter works for both implode() and the matcher.
         $wanted = (array) $wanted;
         $this->logger('Check for keywords[' . implode(',', $wanted) . ']', 'info', 4);
         if (SpiderAsserts::containKeywords($txt, $wanted, true)) {
             $this->addRelevancy();
         } else {
             $this->logger('Document not contain keywords', 'info', 5);
         }
     }

     // Bad words filter.
     $unwanted = $this->subscription->getFilter('notContain');
     if (is_null($unwanted)) {
         // No filter configured: the condition counts as satisfied.
         $this->addRelevancy();
         $this->logger('ignore Bad keywords filter', 'info', 5);
     } else {
         // Same normalization as the 'contain' branch, for consistency.
         $unwanted = (array) $unwanted;
         $this->logger('Check for BAD keywords[' . implode(',', $unwanted) . ']', 'info', 5);
         if (!SpiderAsserts::containKeywords($txt, $unwanted, false)) {
             $this->addRelevancy();
         } else {
             $this->logger('Document contain BAD keywords', 'info', 5);
         }
     }
 }
コード例 #2
0
ファイル: Indexer.php プロジェクト: gpupo/camelspider
 /**
  * Fetches the target URI, optionally attaches a document and hyperlinks
  * to the target, and saves the target in the pool.
  *
  * @param InterfaceLink $target    Link (or subscription) to be collected.
  * @param bool          $withLinks When true, also collects the links found
  *                                 in the response and stores them on the
  *                                 target under 'hyperlinks'.
  *
  * @return bool true when the target was processed and saved; false when
  *              the URI is invalid, already done, could not be crawled, or
  *              the HTTP client raised a runtime error.
  *
  * @throws \Exception When the very first request fails, or when an invalid
  *                    argument is reported by the HTTP layer. The original
  *                    exception is attached as the previous exception.
  */
 protected function collect(InterfaceLink $target, $withLinks = false)
 {
     $URI = $target->getHref();
     $type = 'html';
     if ($target instanceof InterfaceSubscription) {
         // Subscriptions declare their own source type; plain links default to 'html'.
         $this->logger("Subscription Type: " . $target->getSourceType());
         $type = $target->getSourceType();
     }
     try {
         if (!SpiderAsserts::isDocumentHref($URI)) {
             $this->logger('URI wrong:[' . $URI . ']', 'err', 3);
             $this->pool->errLink($target, 'invalid URL');
             return false;
         }
         // Verify that this has not been processed already (subscriptions are always re-collected).
         if (!$target instanceof InterfaceSubscription && $this->isDone($URI)) {
             $this->logger('URI is Done:[' . $URI . ']', 'info', 1);
             return false;
         }
         try {
             $crawler = $this->getCrawler($URI, 'GET', $type);
         } catch (\Exception $e) {
             $this->logger('Collect Exception', 'err', 3);
             $this->logger($e->getMessage(), 'err', 3);
             if ($this->requests === 0) {
                 $this->errors++;
                 // Keep the original exception as the cause so the stack trace is not lost.
                 throw new \Exception('Error in the first request:' . $e->getMessage(), 0, $e);
             }
             // Otherwise the failure is only logged; $crawler stays unset and is handled below.
         }
         if (!isset($crawler)) {
             $this->logger('Crawler broken', 'err');
             $this->pool->errLink($target, 'impossible crawler');
             return false;
         }
         if (!$target instanceof InterfaceSubscription) {
             if (DocumentManager::isFresh($this->getBody(), $target, $this->getSubscription())) {
                 // The crawler is cloned so the document keeps its own copy.
                 $target->setDocument($this->getCurrentUri(), clone $crawler, $this->getSubscription(), $this->transferDependency());
                 $this->logger('document IS fresh', 'info', 5);
             } else {
                 $this->logger('document isnt fresh');
             }
         }
         // Done!
         $target->setStatus(1);
         if ($withLinks) {
             $this->logger('go to the scan more links!', 'info', 5);
             try {
                 $target->set('hyperlinks', $this->collectLinks($crawler, $type));
             } catch (\Exception $e) {
                 // Link collection is best-effort: count the error and still save the target.
                 $this->logger($e->getMessage(), 'err');
                 $this->errors++;
             }
         }
         $this->logger('saving object on cache, with id:' . $target->getId('string'), 'info', 5);
         $this->pool->save($target);
         $this->success++;
         return true;
     } catch (\Zend\Http\Exception\InvalidArgumentException $e) {
         $this->logger('Invalid argument on [' . $URI . ']', 'err');
         $this->pool->errLink($target, 'invalid argument on HTTP request');
         $this->errors++;
         // Chain the HTTP exception so callers can inspect the root cause.
         throw new \Exception('Invalid argument', 0, $e);
     } catch (\Zend\Http\Client\Adapter\Exception\RuntimeException $e) {
         $this->logger('Http Client Runtime error on  [' . $URI . ']', 'err');
         $this->pool->errLink($target, 'Runtime error on Http Client Adaper');
         $this->errors++;
         return false;
     }
 }