/**
 * Checks whether the document is new (fresh) for the given subscription.
 *
 * The document is reported as not fresh only when the subscription already
 * holds a truthy entry for the link's id AND the value returned by
 * SpiderText::diffPercentage() for that entry and $body is below
 * $requiredDiff. In every other case it is reported as fresh.
 *
 * This method is static, so no instance state is available: the body to
 * compare comes from $body and the threshold from $requiredDiff.
 *
 * @todo implement get Raw
 *
 * @param mixed                 $body         Body of the document that was just fetched.
 * @param InterfaceLink         $link         Link whose id is looked up in the subscription.
 * @param InterfaceSubscription $subscription Subscription queried for a previously stored entry.
 * @param int|float             $requiredDiff Threshold the diff percentage must reach for the
 *                                            document to count as fresh; defaults to 40.
 *
 * @return bool False when a stored entry exists and differs from $body by
 *              less than $requiredDiff; true otherwise.
 */
public static function isFresh($body, InterfaceLink $link, InterfaceSubscription $subscription, $requiredDiff = 40)
{
    $existent = $subscription->getLink($link->getId());

    // Only an already-known entry that barely changed makes the document stale.
    if ($existent && SpiderText::diffPercentage($existent, $body) < $requiredDiff) {
        return false;
    }

    return true;
}
/**
 * Fetches the target link, attaches its document when it is fresh, and
 * saves the link in the pool.
 *
 * Steps, in order:
 *  1. Resolve the source type ('html' by default, or the subscription's
 *     source type when the target is a subscription).
 *  2. Reject hrefs that SpiderAsserts::isDocumentHref() does not accept.
 *  3. Skip non-subscription targets that isDone() reports as processed.
 *  4. Request the URI through getCrawler().
 *  5. For non-subscription targets, attach the document when
 *     DocumentManager::isFresh() says it is fresh.
 *  6. Mark the target with status 1, optionally collect its hyperlinks,
 *     and save it in the pool.
 *
 * @param InterfaceLink $target    Link (or subscription) to fetch.
 * @param bool          $withLinks When truthy, hyperlinks found in the fetched
 *                                 content are stored on the target.
 *
 * @return bool True when the target was fetched and saved; false when the
 *              href is invalid, the URI was already processed, no crawler
 *              could be obtained, or the HTTP client adapter raised a
 *              runtime error.
 *
 * @throws \Exception When the request fails while $this->requests is 0, or
 *                    when a Zend HTTP InvalidArgumentException is caught.
 *                    The original exception is attached as "previous".
 */
protected function collect(InterfaceLink $target, $withLinks = false)
{
    $URI = $target->getHref();
    $type = 'html';
    if ($target instanceof InterfaceSubscription) {
        $this->logger("Subscription Type: " . $target->getSourceType());
        $type = $target->getSourceType();
    }

    try {
        if (!SpiderAsserts::isDocumentHref($URI)) {
            $this->logger('URI wrong:[' . $URI . ']', 'err', 3);
            $this->pool->errLink($target, 'invalid URL');
            return false;
        }

        // Verify whether this URI has already been processed
        // (subscriptions are always fetched again).
        if (!$target instanceof InterfaceSubscription && $this->isDone($URI)) {
            $this->logger('URI is Done:[' . $URI . ']', 'info', 1);
            return false;
        }

        try {
            $crawler = $this->getCrawler($URI, 'GET', $type);
        } catch (\Exception $e) {
            $this->logger('Collect Exception', 'err', 3);
            $this->logger($e->getMessage(), 'err', 3);
            // A failure while the request counter is still 0 is escalated;
            // otherwise the failure is handled by the isset() check below.
            if ($this->requests === 0) {
                $this->errors++;
                // Chain the original exception so its trace is not lost.
                throw new \Exception('Error in the first request:' . $e->getMessage(), 0, $e);
            }
        }

        // $crawler is unset when the request threw, and null-valued results
        // are treated the same way.
        if (!isset($crawler)) {
            $this->logger('Crawler broken', 'err');
            $this->pool->errLink($target, 'impossible crawler');
            return false;
        }

        if (!$target instanceof InterfaceSubscription) {
            if (DocumentManager::isFresh($this->getBody(), $target, $this->getSubscription())) {
                // The crawler is cloned so the document keeps its own copy.
                $target->setDocument(
                    $this->getCurrentUri(),
                    clone $crawler,
                    $this->getSubscription(),
                    $this->transferDependency()
                );
                $this->logger('document IS fresh', 'info', 5);
            } else {
                $this->logger('document isnt fresh');
            }
        }

        $target->setStatus(1); // done!

        if ($withLinks) {
            $this->logger('go to the scan more links!', 'info', 5);
            try {
                $target->set('hyperlinks', $this->collectLinks($crawler, $type));
            } catch (\Exception $e) {
                // Link collection is best-effort: log, count, and still save the target.
                $this->logger($e->getMessage(), 'err');
                $this->errors++;
            }
        }

        $this->logger('saving object on cache, with id:' . $target->getId('string'), 'info', 5);
        $this->pool->save($target);
        $this->success++;

        return true;
    } catch (\Zend\Http\Exception\InvalidArgumentException $e) {
        $this->logger('Invalid argument on [' . $URI . ']', 'err');
        $this->pool->errLink($target, 'invalid argument on HTTP request');
        $this->errors++;
        // Chain the original exception so callers can inspect the root cause.
        throw new \Exception('Invalid argument', 0, $e);
    } catch (\Zend\Http\Client\Adapter\Exception\RuntimeException $e) {
        $this->logger('Http Client Runtime error on [' . $URI . ']', 'err');
        $this->pool->errLink($target, 'Runtime error on Http Client Adaper');
        $this->errors++;
        return false;
    }
}
/**
 * Marks a link as failed, saves it, logs the cause and counts the error.
 *
 * @param InterfaceLink $link  Link to mark; its status is set to 3.
 * @param string        $cause Reason appended to the log message.
 *
 * @return void
 */
public function errLink(InterfaceLink $link, $cause = 'undefined')
{
    // Status 3 is the value this method uses for a link marked with error.
    $link->setStatus(3);
    $this->_save($link);

    // The href is read after saving, then the log line is built piece by piece.
    $href = $link->get('href');
    $message = $href . "\n" . ' marked with error.';
    $message .= 'Cause: ' . $cause . "\n";
    $this->logger($message);

    ++$this->errors;
}
/**
 * Applies the isDocumentHref() check to the href of the given link.
 *
 * @param InterfaceLink $link Link whose href is checked.
 *
 * @return mixed Whatever self::isDocumentHref() returns for the link's href.
 */
public static function isDocumentLink(InterfaceLink $link)
{
    $href = $link->getHref();

    return self::isDocumentHref($href);
}