/**
 * Scrape an HTML document and forward every extracted "kumo" entry.
 *
 * Documents that are not Spizer_Document_Html instances are ignored.
 * When the scraper throws, the exception is rethrown only if the
 * 'throwIfNotfound' config key is set; otherwise the document is skipped.
 *
 * @param Spizer_Document $doc
 * @return void
 * @throws Diggin_Scraper_Exception when scraping fails and 'throwIfNotfound' is set
 */
public function handle(Spizer_Document $doc)
{
    if (!($doc instanceof Spizer_Document_Html)) {
        return;
    }

    // The response is already decoded, so the encoding-related headers are
    // removed before the response object is rebuilt for the scraper.
    $responseHeaders = $doc->getAllHeaders();
    unset(
        $responseHeaders['transfer-encoding'],
        $responseHeaders['content-encoding']
    );

    try {
        $response = new Zend_Http_Response(
            $doc->getStatus(),
            $responseHeaders,
            $doc->getBody()
        );
        $results = $this->scraper->scrape($response, $doc->getUrl());
    } catch (Diggin_Scraper_Exception $dse) {
        // Presence of the key (not its value) decides whether to rethrow.
        if (isset($this->_config['throwIfNotfound'])) {
            throw $dse;
        }

        return;
    }

    // Nothing usable came back from the scraper.
    if (!isset($results)) {
        return;
    }

    if ($this->_config['debug']) {
        echo 'count scrape results ', count($results['kumo']), PHP_EOL;
    }

    foreach ($results['kumo'] as $entry) {
        $this->send($entry);
    }
}
/**
 * Save the document body into the configured directory.
 *
 * The file is written to $this->_config['save_dir'] under a name made of the
 * raw-URL-encoded document URL. No content-type check is performed here:
 * every document handed to this method is written.
 *
 * The URL is registered through addHaveFiles() only when the write
 * succeeded, so a failed write is never recorded as a saved file.
 *
 * @param Spizer_Document $document
 * @return void
 */
public function handle(Spizer_Document $document)
{
    $url      = $document->getUrl();
    $filepath = $this->_config['save_dir'] . DIRECTORY_SEPARATOR . rawurlencode($url);

    // file_put_contents() returns false on failure (and emits its own
    // warning); do not register a file that was never written.
    if (false === file_put_contents($filepath, $document->getBody())) {
        return;
    }

    $this->addHaveFiles($url);
}
/**
 * Log a 'RegexMatch' entry when the converted document body matches the
 * configured regular expression.
 *
 * The body is first passed through Diggin_Http_Response_Encoding::encode()
 * together with the content-type header (intended to yield UTF-8 text).
 * The log entry carries the pattern, the matched text and its offset.
 *
 * @param Spizer_Document $document
 * @return void
 */
public function handle(Spizer_Document $document)
{
    $convertedBody = Diggin_Http_Response_Encoding::encode(
        $document->getBody(),
        $document->getHeader('content-type')
    );
    $pattern = $this->_config['match'];

    $matches = array();
    if (!preg_match($pattern, $convertedBody, $matches, PREG_OFFSET_CAPTURE)) {
        return;
    }

    // With PREG_OFFSET_CAPTURE each match is a (text, offset) pair.
    list($matchedText, $matchOffset) = $matches[0];

    $this->engine->log('RegexMatch', array(
        'message' => 'Document body matched lookup expression',
        'regex'   => $pattern,
        'match'   => $matchedText,
        'offset'  => $matchOffset,
    ));
}
/**
 * Look for the configured string in the document body and log a hit.
 *
 * The search is case-sensitive when the 'matchcase' config value is truthy
 * and case-insensitive otherwise. Only the first occurrence is reported.
 *
 * @param Spizer_Document $document
 * @return void
 * @see Spizer_Handler_Abstract::handle()
 */
public function handle(Spizer_Document $document)
{
    $caseSensitive = (bool) $this->_config['matchcase'];
    $haystack      = $document->getBody();
    $needle        = $this->_config['match'];

    if ($caseSensitive) {
        $offset = strpos($haystack, $needle);
    } else {
        $offset = stripos($haystack, $needle);
    }

    // Offset 0 is a valid hit, so only a strict false means "not found".
    if (false === $offset) {
        return;
    }

    $this->_log(array(
        'message' => 'Document body matched lookup string',
        'needle'  => $needle,
        'offset'  => $offset,
    ));
}
/**
 * Save the document body to disk when the document is an image.
 *
 * A document counts as an image when its content-type header value starts
 * with "image/" (case-insensitive, leading whitespace ignored). Documents
 * with a missing or non-string content-type header are skipped.
 *
 * The file is written to $this->_config['save_dir'] under a name made of the
 * raw-URL-encoded document URL. The URL is registered through
 * addHaveFiles() only when the write succeeded.
 *
 * @param Spizer_Document $document
 * @return void
 */
public function handle(Spizer_Document $document)
{
    $contentType = $document->getHeader('content-type');

    // Anchor the pattern: the media type must begin with "image/", not merely
    // contain it somewhere (e.g. inside a parameter of another type).
    if (!is_string($contentType) || !preg_match('#^\s*image/#i', $contentType)) {
        return;
    }

    $url      = $document->getUrl();
    $filepath = $this->_config['save_dir'] . DIRECTORY_SEPARATOR . rawurlencode($url);

    // file_put_contents() returns false on failure (and emits its own
    // warning); do not register a file that was never written.
    if (false === file_put_contents($filepath, $document->getBody(), FILE_BINARY)) {
        return;
    }

    $this->addHaveFiles($url);
}
/**
 * Scrape an HTML document and queue its "kumo" results, optionally only once.
 *
 * Call-once guard: when $this->_callonce is strictly true it is flipped to 1
 * and processing continues; when it is already (loosely) equal to 1 the
 * method returns immediately. Any falsy value disables the guard. The flag
 * is flipped before the HTML check, so it is consumed even when the first
 * document turns out not to be a Spizer_Document_Html.
 *
 * @param Spizer_Document $doc
 * @return void
 */
public function handle(Spizer_Document $doc)
{
    if (true === $this->_callonce) {
        // First invocation: remember that the handler has run.
        $this->_callonce = 1;
    } elseif ($this->_callonce && 1 == $this->_callonce) {
        // Already ran once.
        return;
    }

    if (!($doc instanceof Spizer_Document_Html)) {
        return;
    }

    // The response is already decoded, so the encoding-related headers are
    // removed before the response object is rebuilt for the scraper.
    $responseHeaders = $doc->getAllHeaders();
    unset(
        $responseHeaders['transfer-encoding'],
        $responseHeaders['content-encoding']
    );

    $response = new Zend_Http_Response(
        $doc->getStatus(),
        $responseHeaders,
        $doc->getBody()
    );
    $scraped = $this->scraper->scrape($response, $doc->getUrl());

    $this->_addQueue($scraped['kumo']);
}
/**
 * Log one line per document: URL, HTTP status and body length in bytes.
 *
 * The log level is chosen from the status class (hundreds digit):
 * 1xx/2xx => INFO, 3xx => NOTICE, 4xx => WARN, 5xx => ERR. Any status
 * outside those classes (for example 0 or a missing status) is logged at
 * WARN so that the logger is never called with an undefined level.
 *
 * @param Spizer_Document $document
 * @return void
 */
public function handle(Spizer_Document $document)
{
    $status = $document->getStatus();
    $string = "{$document->getUrl()} {$status} " . strlen($document->getBody());

    // Truncate instead of rounding: rounding would push e.g. 451 or 499
    // into the 5xx class and log a client error as a server error.
    $statusClass = (int) floor(((int) $status) / 100);

    // Decide log level according to status class
    switch ($statusClass) {
        case 1:
        case 2:
            $level = Zend_Log::INFO;
            break;

        case 3:
            $level = Zend_Log::NOTICE;
            break;

        case 4:
            $level = Zend_Log::WARN;
            break;

        case 5:
            $level = Zend_Log::ERR;
            break;

        default:
            // Unexpected status code: still log it rather than fail.
            $level = Zend_Log::WARN;
            break;
    }

    $this->_logger->log($string, $level);
}
/**
 * Scrape an HTML document and send one Kumo_Request per unique, filtered
 * "kumo" result.
 *
 * Each request gets the source document URL as its referrer, and a
 * 'Referer' header built by toRefererUrl(). The header is set
 * unconditionally; no config flag is consulted.
 *
 * @param Spizer_Document $doc
 * @return void
 */
public function handle(Spizer_Document $doc)
{
    if (!($doc instanceof Spizer_Document_Html)) {
        return;
    }

    // The response is already decoded, so the encoding-related headers are
    // removed before the response object is rebuilt for the scraper.
    $responseHeaders = $doc->getAllHeaders();
    unset(
        $responseHeaders['transfer-encoding'],
        $responseHeaders['content-encoding']
    );

    $response = new Zend_Http_Response(
        $doc->getStatus(),
        $responseHeaders,
        $doc->getBody()
    );
    $scraped = $this->scraper->scrape($response, $doc->getUrl());

    // De-duplicate first, then let the handler's own filter pick targets.
    $targets = $this->filter(array_unique($scraped['kumo']));

    foreach ($targets as $link) {
        $crawlRequest = new Kumo_Request($link);
        $crawlRequest->setReferrer($doc->getUrl());
        $crawlRequest->setHeader('Referer', $this->toRefererUrl($doc->getUrl()));

        $this->send($crawlRequest);
    }
}
/**
 * Test the document body against the configured regular expression and
 * log the first match.
 *
 * The log entry carries the pattern, the matched text and the offset of
 * the match within the body.
 *
 * @param Spizer_Document $document
 * @return void
 * @see Spizer_Handler_Abstract::handle()
 */
public function handle(Spizer_Document $document)
{
    $pattern = $this->_config['match'];

    $matches = array();
    if (!preg_match($pattern, $document->getBody(), $matches, PREG_OFFSET_CAPTURE)) {
        return;
    }

    // With PREG_OFFSET_CAPTURE each match is a (text, offset) pair.
    list($matchedText, $matchOffset) = $matches[0];

    $this->_log(array(
        'message' => 'Document body matched lookup expression',
        'regex'   => $pattern,
        'match'   => $matchedText,
        'offset'  => $matchOffset,
    ));
}