/**
 * Queries the document according to the filters defined on the
 * subscription and raises the relevancy accordingly. The resulting
 * relevancy can be:
 *  1) Possibly contains content
 *  2) Contains content and either contains one or more keywords wanted
 *     by the subscription or does not contain unwanted keywords
 *  3) Contains content, contains wanted keywords and does not contain
 *     unwanted keywords
 *
 * addRelevancy() is called once for each satisfied criterion; a filter
 * that is not configured (null) counts as satisfied.
 *
 * @return false|null false when $this->bigger is falsy (content too
 *                    short); otherwise no value is returned.
 */
protected function setRelevancy()
{
    if (!$this->bigger) {
        $this->logger('Content too short', 'info', 5);

        return false;
    }

    $this->addRelevancy(); // +1 because text exists

    $txt = $this->getTitle() . "\n" . $this->getText();
    $this->logger("Text to be verified:\n" . $txt . "\n", 'info', 5);

    // Desirable keywords filter
    $wanted = $this->subscription->getFilter('contain');
    if ($wanted === null) {
        $this->addRelevancy();
        $this->logger('ignore keywords filter', 'info', 5);
    } else {
        // Normalise once so that implode() and containKeywords() both
        // receive an array even when the filter is a single scalar.
        $wanted = (array) $wanted;
        $this->logger('Check for keywords[' . implode(',', $wanted) . ']', 'info', 4);
        if (SpiderAsserts::containKeywords($txt, $wanted, true)) {
            $this->addRelevancy();
        } else {
            $this->logger('Document not contain keywords', 'info', 5);
        }
    }

    // Bad words filter
    $unwanted = $this->subscription->getFilter('notContain');
    if ($unwanted === null) {
        $this->addRelevancy();
        $this->logger('ignore Bad keywords filter', 'info', 5);
    } else {
        // Same normalisation as the "contain" filter above.
        $unwanted = (array) $unwanted;
        $this->logger('Check for BAD keywords[' . implode(',', $unwanted) . ']', 'info', 5);
        if (!SpiderAsserts::containKeywords($txt, $unwanted, false)) {
            $this->addRelevancy();
        } else {
            $this->logger('Document contain BAD keywords', 'info', 5);
        }
    }
}
/**
 * Requests the resource behind a link, attaches the resulting document
 * to the link when applicable and saves the link in the pool.
 *
 * Steps, in order:
 *  - rejects the URI when SpiderAsserts::isDocumentHref() fails;
 *  - skips non-subscription targets whose URI isDone();
 *  - requests the URI through getCrawler();
 *  - for non-subscription targets, sets the document on the target when
 *    DocumentManager::isFresh() returns true;
 *  - marks the target with status 1 and, when $withLinks is true, stores
 *    the result of collectLinks() under 'hyperlinks';
 *  - saves the target in the pool and increments the success counter.
 *
 * @param InterfaceLink $target    Link to collect; when it is also an
 *                                 InterfaceSubscription its source type is
 *                                 used as the request type instead of 'html'.
 * @param bool          $withLinks Whether to also collect the hyperlinks
 *                                 found in the response.
 *
 * @return bool true when the target was processed and saved; false when
 *              the URI is invalid, was already processed, no crawler
 *              could be obtained, or the HTTP client adapter raised a
 *              runtime exception.
 *
 * @throws \Exception When the request fails while $this->requests is 0,
 *                    or when the HTTP layer reports an invalid argument.
 *                    The original exception is attached as "previous".
 */
protected function collect(InterfaceLink $target, $withLinks = false)
{
    $URI = $target->getHref();
    $type = 'html';
    if ($target instanceof InterfaceSubscription) {
        $this->logger("Subscription Type: " . $target->getSourceType());
        $type = $target->getSourceType();
    }

    try {
        if (!SpiderAsserts::isDocumentHref($URI)) {
            $this->logger('URI wrong:[' . $URI . ']', 'err', 3);
            $this->pool->errLink($target, 'invalid URL');

            return false;
        }

        // Verify whether this URI has already been processed
        // (subscriptions are always collected again).
        if (!($target instanceof InterfaceSubscription) && $this->isDone($URI)) {
            $this->logger('URI is Done:[' . $URI . ']', 'info', 1);

            return false;
        }

        try {
            $crawler = $this->getCrawler($URI, 'GET', $type);
        } catch (\Exception $e) {
            $this->logger('Collect Exception', 'err', 3);
            $this->logger($e->getMessage(), 'err', 3);
            if ($this->requests === 0) {
                $this->errors++;
                // Keep the original exception as "previous" so the root
                // cause and its stack trace are not lost.
                throw new \Exception('Error in the first request:' . $e->getMessage(), 0, $e);
            }
            // Otherwise the failure is tolerated: $crawler stays unset and
            // is handled by the check right below.
        }

        if (!isset($crawler)) {
            $this->logger('Crawler broken', 'err');
            $this->pool->errLink($target, 'impossible crawler');

            return false;
        }

        if (!($target instanceof InterfaceSubscription)) {
            if (DocumentManager::isFresh($this->getBody(), $target, $this->getSubscription())) {
                $target->setDocument(
                    $this->getCurrentUri(),
                    clone $crawler,
                    $this->getSubscription(),
                    $this->transferDependency()
                );
                $this->logger('document IS fresh', 'info', 5);
            } else {
                $this->logger('document isnt fresh');
            }
        }

        $target->setStatus(1); // done!

        if ($withLinks) {
            $this->logger('go to the scan more links!', 'info', 5);
            try {
                $target->set('hyperlinks', $this->collectLinks($crawler, $type));
            } catch (\Exception $e) {
                // A failure while collecting links is counted as an error
                // but does not prevent the target from being saved.
                $this->logger($e->getMessage(), 'err');
                $this->errors++;
            }
        }

        $this->logger('saving object on cache, with id:' . $target->getId('string'), 'info', 5);
        $this->pool->save($target);
        $this->success++;

        return true;
    } catch (\Zend\Http\Exception\InvalidArgumentException $e) {
        $this->logger('Invalid argument on [' . $URI . ']', 'err');
        $this->pool->errLink($target, 'invalid argument on HTTP request');
        $this->errors++;
        // Chain the original exception so callers can inspect the cause.
        throw new \Exception('Invalid argument', 0, $e);
    } catch (\Zend\Http\Client\Adapter\Exception\RuntimeException $e) {
        $this->logger('Http Client Runtime error on [' . $URI . ']', 'err');
        $this->pool->errLink($target, 'Runtime error on Http Client Adaper');
        $this->errors++;

        return false;
    }
}