/**
 * Scrape an HTML document and pass every entry found under the 'kumo'
 * result key to send().
 *
 * Non-HTML documents are ignored. When scraping fails the document is
 * skipped, unless the 'throwIfNotfound' option is enabled, in which case
 * the scraper exception is rethrown.
 *
 * @param  Spizer_Document $doc
 * @throws Diggin_Scraper_Exception if scraping fails and 'throwIfNotfound' is enabled
 */
public function handle(Spizer_Document $doc)
 {
     if (!$doc instanceof Spizer_Document_Html) {
         return;
     }
     $headers = $doc->getAllHeaders();
     // The body is already decoded, so these headers no longer describe it.
     unset($headers['transfer-encoding'], $headers['content-encoding']);
     try {
         $results = $this->scraper->scrape(new Zend_Http_Response($doc->getStatus(), $headers, $doc->getBody()), $doc->getUrl());
     } catch (Diggin_Scraper_Exception $dse) {
         // Check the option's value, not just its presence, so that an
         // explicit "false" really disables rethrowing.
         if (!empty($this->_config['throwIfNotfound'])) {
             throw $dse;
         }
         return;
     }
     if ($results === null) {
         return;
     }
     // A result set without a 'kumo' entry is treated as "nothing found".
     $sources = isset($results['kumo']) ? $results['kumo'] : array();
     if (!empty($this->_config['debug'])) {
         echo 'count scrape results ' . count($sources) . PHP_EOL;
     }
     foreach ($sources as $src) {
         $this->send($src);
     }
 }
Exemple #2
0
 /**
  * Store the document body in a file below the configured 'save_dir'.
  *
  * The file name is the raw-URL-encoded document URL. Note that this
  * variant does not inspect the content type: every document it receives
  * is written to disk.
  *
  * @param Spizer_Document $document
  */
 public function handle(Spizer_Document $document)
 {
     $directory = $this->_config['save_dir'];
     $fileName = rawurlencode($document->getUrl());
     $target = $directory . DIRECTORY_SEPARATOR . $fileName;

     file_put_contents($target, $document->getBody());

     // Record the URL after writing its body
     $this->addHaveFiles($document->getUrl());
 }
Exemple #3
0
 /**
  * Run the configured 'match' regular expression against the document
  * body and log the first match through the engine.
  *
  * The body is first passed through Diggin_Http_Response_Encoding::encode()
  * together with the content-type header (presumably converting it to
  * UTF-8 - confirm against that helper).
  *
  * @param Spizer_Document $document
  */
 public function handle(Spizer_Document $document)
 {
     $contentType = $document->getHeader('content-type');
     $encodedBody = Diggin_Http_Response_Encoding::encode($document->getBody(), $contentType);
     $pattern = $this->_config['match'];

     if (!preg_match($pattern, $encodedBody, $hit, PREG_OFFSET_CAPTURE)) {
         return;
     }

     // With PREG_OFFSET_CAPTURE each entry is array(matched text, byte offset)
     list($matchedText, $offset) = $hit[0];
     $entry = array(
         'message' => 'Document body matched lookup expression',
         'regex'   => $pattern,
         'match'   => $matchedText,
         'offset'  => $offset,
     );
     $this->engine->log('RegexMatch', $entry);
 }
 /**
  * Look for the configured 'match' string in the document body and log
  * the position of its first occurrence.
  *
  * The search is case sensitive when the 'matchcase' option is truthy,
  * and case insensitive otherwise.
  *
  * @param Spizer_Document $document
  * @see   Spizer_Handler_Abstract::handle()
  */
 public function handle(Spizer_Document $document)
 {
     $caseSensitive = $this->_config['matchcase'];
     $haystack = $document->getBody();
     $needle = $this->_config['match'];

     if ($caseSensitive) {
         $offset = strpos($haystack, $needle);
     } else {
         $offset = stripos($haystack, $needle);
     }

     // Offset 0 is a valid hit, so only a strict false means "not found"
     if ($offset === false) {
         return;
     }

     $this->_log(array(
         'message' => 'Document body matched lookup string',
         'needle'  => $needle,
         'offset'  => $offset,
     ));
 }
Exemple #5
0
 /**
  * Save image documents to a file below the configured 'save_dir'.
  *
  * Documents whose content-type header does not contain "image/" (or that
  * have no usable content-type header at all) are skipped. The file name
  * is the raw-URL-encoded document URL.
  *
  * @param Spizer_Document $document
  */
 public function handle(Spizer_Document $document)
 {
     // Only image responses are stored. A missing or non-string header is
     // treated as "not an image" instead of being handed to preg_match().
     $content_type = $document->getHeader('content-type');
     if (!is_string($content_type) || !preg_match('#image/.*#i', $content_type)) {
         return;
     }
     $filepath = $this->_config['save_dir'] . DIRECTORY_SEPARATOR . rawurlencode($document->getUrl());
     // FILE_BINARY never had any effect and is deprecated as of PHP 8.1,
     // so the flag is simply dropped.
     file_put_contents($filepath, $document->getBody());
     $this->addHaveFiles($document->getUrl());
 }
Exemple #6
0
 /**
  * Report every image of an HTML document whose file name contains an
  * underscore.
  *
  * Documents that are not Spizer_Document_Html instances are ignored.
  *
  * @param Spizer_Document $document
  * @see   Spizer_Handler_Abstract::handle()
  */
 public function handle(Spizer_Document $document)
 {
     if (!$document instanceof Spizer_Document_Html) {
         // Nothing to inspect in non-HTML documents
         return;
     }

     foreach ($document->getImages() as $imageSrc) {
         // Only the file name part of the image source is checked
         $fileName = basename($imageSrc);
         if (strpos($fileName, '_') === false) {
             continue;
         }
         $this->engine->log('ZendImages', array('message' => 'Image contains underscore in it\'s file name', 'src' => $fileName));
     }
 }
 /**
  * Decide whether this handler applies to a document and, if it does,
  * pass the document on to ::handle().
  *
  * The 'status' and 'content-type' options each act as a filter: a falsy
  * option accepts everything, an array accepts any listed value, and any
  * other value must loosely equal the document's status / content-type
  * header. Both filters have to accept the document.
  *
  * @param Spizer_Document $document
  */
 public function call(Spizer_Document $document)
 {
     $status = $document->getStatus();
     $type = $document->getHeader('content-type');

     $allowedStatus = $this->_config['status'];
     $statusAccepted = !$allowedStatus
         || (is_array($allowedStatus) ? in_array($status, $allowedStatus) : $allowedStatus == $status);

     $allowedType = $this->_config['content-type'];
     $typeAccepted = !$allowedType
         || (is_array($allowedType) ? in_array($type, $allowedType) : $allowedType == $type);

     if ($statusAccepted && $typeAccepted) {
         $this->handle($document);
     }
 }
 /**
  * Log an entry when the configured XPath 'query' selects no nodes in an
  * XML document.
  *
  * Non-XML documents are ignored. The log entry contains the query and,
  * when configured, the 'message' option.
  *
  * @param Spizer_Document $document
  * @see   Spizer_Handler_Abstract::handle()
  */
 public function handle(Spizer_Document $document)
 {
     // Silently ignore non-XML documents
     if (!$document instanceof Spizer_Document_Xml) {
         return;
     }
     $query = $this->_config['query'];
     $tags = $document->getXpath()->query($query);
     // query() may return false (e.g. for a malformed expression) instead of
     // a node list. That still counts as "no match", as before, but without
     // reading ->length on a non-object.
     if ($tags instanceof DOMNodeList && $tags->length != 0) {
         return;
     }
     $data = array('query' => $query);
     if (isset($this->_config['message'])) {
         $data['message'] = $this->_config['message'];
     }
     $this->_log($data);
 }
Exemple #9
0
 /**
  * Create the document and parse its body into a DOMDocument.
  *
  * All arguments are forwarded unchanged to the parent constructor. An
  * empty body leaves the DOMDocument empty instead of being parsed.
  *
  * @param mixed  $url
  * @param mixed  $status
  * @param array  $headers
  * @param string $body
  */
 protected function __construct($url, $status, array $headers, $body)
 {
     parent::__construct($url, $status, $headers, $body);
     $this->_domDocument = new DOMDocument();
     $this->_domDocument->preserveWhiteSpace = true;
     // loadHTML() rejects an empty source: a warning on older PHP versions
     // and a ValueError on PHP 8, which the @ operator cannot suppress.
     if ((string) $body === '') {
         return;
     }
     // We have to silence this out because invalid documents
     // tend to throw a lot of warnings
     @$this->_domDocument->loadHTML($body);
 }
Exemple #10
0
 /**
  * Queue the "next page" link of an HTML document, if it has one.
  *
  * The document URL is remembered in $this->targets. The next link is
  * looked up with Diggin_Scraper_Helper_Simplexml_Pagerize and queued
  * while the 'max_follow' option is falsy or the page counter has not
  * exceeded it.
  *
  * @param Spizer_Document $doc
  */
 public function handle(Spizer_Document $doc)
 {
     if (!$doc instanceof Spizer_Document_Html) {
         // Only HTML documents can be paginated
         return;
     }

     // Remember this page as visited
     $baseUrl = $doc->getUrl();
     if (!in_array($baseUrl, $this->targets)) {
         $this->targets[] = $baseUrl;
     }

     $xml = simplexml_import_dom($doc->getDomDocument());
     $options = array('baseUrl' => $this->toUrl($doc->getUrl()));
     $pagerize = new Diggin_Scraper_Helper_Simplexml_Pagerize($xml, $options);

     $nextLink = $pagerize->getNextLink();
     if (!$nextLink) {
         return;
     }

     // A falsy 'max_follow' means there is no limit
     $max_follow = $this->_config['max_follow'];
     if ($max_follow && $this->page_count > $max_follow) {
         return;
     }

     $this->addToQueue($nextLink, $baseUrl);
     ++$this->page_count;
 }
 /**
  * Scrape an HTML document and queue the entries found under the 'kumo'
  * result key.
  *
  * When $this->_callonce is true, the first call switches it to 1 and
  * proceeds; any later call (flag loosely equal to 1) returns immediately.
  *
  * @param Spizer_Document $doc
  */
 public function handle(Spizer_Document $doc)
 {
     $onceFlag = $this->_callonce;
     if ($onceFlag === true) {
         // First invocation: mark the handler as used and carry on
         $this->_callonce = 1;
     } elseif ($onceFlag && 1 == $onceFlag) {
         // Already ran once
         return;
     }

     if (!$doc instanceof Spizer_Document_Html) {
         return;
     }

     // The body is already decoded, so drop the headers describing its
     // transfer/content encoding before rebuilding the response.
     $headers = $doc->getAllHeaders();
     foreach (array('transfer-encoding', 'content-encoding') as $stale) {
         unset($headers[$stale]);
     }

     $response = new Zend_Http_Response($doc->getStatus(), $headers, $doc->getBody());
     $results = $this->scraper->scrape($response, $doc->getUrl());
     $this->_addQueue($results['kumo']);
 }
 /**
  * Handle document: write "<url> <status> <body length>" to the logger,
  * using a log level derived from the HTTP status class.
  *
  * 1xx/2xx are logged as INFO, 3xx as NOTICE, 4xx as WARN and 5xx as ERR.
  * Any status outside 1xx-5xx is logged as WARN.
  *
  * @param Spizer_Document $document
  */
 public function handle(Spizer_Document $document)
 {
     $string = "{$document->getUrl()} {$document->getStatus()} " . strlen($document->getBody());
     // Decide log level according to status code. The status class is the
     // hundreds digit, so the quotient must be truncated: rounding would
     // push e.g. 451 into the 5xx class.
     switch ((int) floor($document->getStatus() / 100)) {
         case 1:
         case 2:
             $level = Zend_Log::INFO;
             break;
         case 3:
             $level = Zend_Log::NOTICE;
             break;
         case 4:
             $level = Zend_Log::WARN;
             break;
         case 5:
             $level = Zend_Log::ERR;
             break;
         default:
             // Unexpected status class: still log it rather than passing
             // an undefined level to the logger.
             $level = Zend_Log::WARN;
             break;
     }
     $this->_logger->log($string, $level);
 }
Exemple #13
0
 /**
  * Log one entry for every node that the configured XPath 'query'
  * selects in an XML document.
  *
  * Each entry holds the query, the optional 'message' option and, when
  * the 'captureValue' option is set, the string result of evaluating that
  * expression relative to the matched node (only if it is truthy).
  *
  * @param Spizer_Document $document
  * @see   Spizer_Handler_Abstract::handle()
  */
 public function handle(Spizer_Document $document)
 {
     if (!$document instanceof Spizer_Document_Xml) {
         // Not an XML document - nothing to do
         return;
     }

     $query = $this->_config['query'];
     $matches = $document->getXpath()->query($query);
     if (!$matches instanceof DOMNodeList) {
         return;
     }

     foreach ($matches as $node) {
         $entry = array('query' => $query);
         if (isset($this->_config['message'])) {
             $entry['message'] = $this->_config['message'];
         }
         if (isset($this->_config['captureValue'])) {
             // Evaluate the capture expression with the matched node as context
             $captured = $document->getXpath()->evaluate($this->_config['captureValue'], $node);
             if ($captured) {
                 $entry['captureValue'] = (string) $captured;
             }
         }
         $this->_log($entry);
     }
 }
 /**
  * Scrape an HTML document and send a Kumo_Request for every distinct,
  * filtered entry found under the 'kumo' result key.
  *
  * Each request carries the document URL as referrer and a 'Referer'
  * header built by toRefererUrl().
  *
  * @param Spizer_Document $doc
  */
 public function handle(Spizer_Document $doc)
 {
     if (!$doc instanceof Spizer_Document_Html) {
         return;
     }

     // The body is already decoded, so the encoding headers must not be
     // passed on to the rebuilt response.
     $headers = $doc->getAllHeaders();
     unset($headers['transfer-encoding'], $headers['content-encoding']);

     $response = new Zend_Http_Response($doc->getStatus(), $headers, $doc->getBody());
     $results = $this->scraper->scrape($response, $doc->getUrl());

     $uniqueSources = array_unique($results['kumo']);
     foreach ($this->filter($uniqueSources) as $src) {
         $request = new Kumo_Request($src);
         $request->setReferrer($doc->getUrl());
         $request->setHeader('Referer', $this->toRefererUrl($doc->getUrl()));
         $this->send($request);
     }
 }
 /**
  * Handle document - fetch links out of the document and add them to the
  * queue
  *
  * The document URL is always recorded in the list of visited targets;
  * links are only extracted from HTML documents. Which link kinds are
  * followed is controlled by the 'followhref', 'followimg', 'followlink',
  * 'followscript' and 'followframes' options. Links with a host are queued
  * when the host ends with the configured 'domain'; links without a host
  * are resolved against the document URL.
  *
  * @param Spizer_Document $doc
  */
 public function handle(Spizer_Document $doc)
 {
     // If need, set the match domain according to the first URL
     if (!isset($this->_config['domain']) && $this->_config['samedomain']) {
         $this->_config['domain'] = $this->_engine->getBaseUri()->getHost();
     }
     // Add document URL to the list of visited pages
     $baseUrl = (string) $doc->getUrl();
     if (!in_array($baseUrl, $this->_targets)) {
         $this->_targets[] = $baseUrl;
     }
     // Silently skip all non-HTML documents
     if (!$doc instanceof Spizer_Document_Html) {
         return;
     }
     // Fetch links out of the document
     $links = array();
     if ($this->_config['followhref']) {
         $links = array_merge($links, $doc->getLinks());
     }
     if ($this->_config['followimg']) {
         $links = array_merge($links, $doc->getImages());
     }
     if ($this->_config['followlink']) {
         $links = array_merge($links, $doc->getHeaderLinks());
     }
     if ($this->_config['followscript']) {
         $links = array_merge($links, $doc->getScriptLinks());
     }
     if ($this->_config['followframes']) {
         $links = array_merge($links, $doc->getFrameLinks());
     }
     // Host suffix pattern. Without a configured domain the pattern is
     // empty and therefore accepts every host (same as before, but without
     // reading an undefined config key).
     $domain = isset($this->_config['domain']) ? $this->_config['domain'] : '';
     $hostPattern = '/' . preg_quote($domain, '/') . '$/';
     // Iterate over all document links
     foreach ($links as $link) {
         // Try to parse URL - if we fail, skip this link (should not happen normally)
         if (!($parts = @parse_url($link))) {
             continue;
         }
         // Skip non-http schemes
         if (isset($parts['scheme']) && ($parts['scheme'] != 'http' && $parts['scheme'] != 'https')) {
             continue;
         }
         // Full URI
         if (isset($parts['host'])) {
             if (preg_match($hostPattern, $parts['host'])) {
                 $this->_addToQueue($link, $baseUrl);
             }
             // Partial URI
         } elseif (isset($parts['path'])) {
             try {
                 $linkUri = clone $doc->getUrl();
                 $linkUri->setQuery(isset($parts['query']) ? $parts['query'] : null);
                 // Replace the base document's fragment with the link's own
                 // (the getter was called here by mistake, which left the
                 // base fragment in place).
                 $linkUri->setFragment(isset($parts['fragment']) ? $parts['fragment'] : null);
                 // Full absolute path
                 if (substr_compare($parts['path'], '/', 0, 1) == 0) {
                     $linkUri->setPath($parts['path']);
                     // Relative path
                 } else {
                     // Resolve against the directory part of the base path
                     $basePath = $doc->getUrl()->getPath();
                     $pos = strrpos($basePath, '/');
                     if ($pos === false) {
                         $linkUri->setPath('/' . $parts['path']);
                     } else {
                         $linkUri->setPath(substr($basePath, 0, $pos + 1) . $parts['path']);
                     }
                 }
                 $this->_addToQueue($linkUri, $baseUrl);
                 // If any of the URL parts is invalid, an exception will be caught here
             } catch (Zend_Uri_Exception $e) {
                 $this->_log(array('link' => $link, 'message' => 'Unable to parse link URL: ' . $e->getMessage()));
             }
         }
     }
 }
 /**
  * Run the configured 'match' regular expression against the raw
  * document body and log the first match.
  *
  * @param Spizer_Document $document
  * @see   Spizer_Handler_Abstract::handle()
  */
 public function handle(Spizer_Document $document)
 {
     $pattern = $this->_config['match'];
     if (!preg_match($pattern, $document->getBody(), $hit, PREG_OFFSET_CAPTURE)) {
         return;
     }

     // With PREG_OFFSET_CAPTURE each entry is array(matched text, byte offset)
     list($matchedText, $offset) = $hit[0];
     $this->_log(array(
         'message' => 'Document body matched lookup expression',
         'regex'   => $pattern,
         'match'   => $matchedText,
         'offset'  => $offset,
     ));
 }
Exemple #17
0
 /**
  * Print the document URL followed by a line break.
  *
  * @param Spizer_Document $doc
  */
 public function handle(Spizer_Document $doc)
 {
     $url = (string) $doc->getUrl();
     echo $url . PHP_EOL;
 }
Exemple #18
0
 /**
  * Build a document from a request/response pair and offer it to every
  * registered handler.
  *
  * Each handler's call() is invoked in the iteration order of
  * $this->_handlers with the same document instance.
  *
  * @param Spizer_Request  $request
  * @param Spizer_Response $response
  */
 protected function callHandlers(Spizer_Request $request, Spizer_Response $response)
 {
     $doc = Spizer_Document::factory($request, $response);

     foreach ($this->_handlers as $registered) {
         $registered->call($doc);
     }
 }