Пример #1
0
 /**
  * Handle document - fetch links out of the document and add them to the
  * queue
  *
  * The document URL is first recorded in the list of visited targets. If the
  * document is not an HTML document nothing else happens. Otherwise the link
  * types enabled in the configuration (followhref, followimg, followlink,
  * followscript, followframes) are collected, and every http(s) link is
  * either queued as-is (full URI whose host ends with the configured domain)
  * or resolved against the document URL (partial URI) and then queued.
  *
  * @param  Spizer_Document $doc Document to handle; links are only extracted
  *                              when it is a Spizer_Document_Html
  * @return void
  */
 public function handle(Spizer_Document $doc)
 {
     // If needed, set the match domain from the host of the engine's base URI
     if (!isset($this->_config['domain']) && $this->_config['samedomain']) {
         $this->_config['domain'] = $this->_engine->getBaseUri()->getHost();
     }

     // Add document URL to the list of visited pages
     $baseUrl = (string) $doc->getUrl();
     if (!in_array($baseUrl, $this->_targets)) {
         $this->_targets[] = $baseUrl;
     }

     // Silently skip all non-HTML documents
     if (!$doc instanceof Spizer_Document_Html) {
         return;
     }

     // Fetch links out of the document
     $links = array();
     if ($this->_config['followhref']) {
         $links = array_merge($links, $doc->getLinks());
     }
     if ($this->_config['followimg']) {
         $links = array_merge($links, $doc->getImages());
     }
     if ($this->_config['followlink']) {
         $links = array_merge($links, $doc->getHeaderLinks());
     }
     if ($this->_config['followscript']) {
         $links = array_merge($links, $doc->getScriptLinks());
     }
     if ($this->_config['followframes']) {
         $links = array_merge($links, $doc->getFrameLinks());
     }

     // Build the host-suffix pattern once, outside the loop. The delimiter is
     // passed to preg_quote() so a '/' in the configured value cannot break
     // the expression. When no domain is configured the pattern is '/$/',
     // which matches every host (i.e. no domain restriction).
     $domain        = isset($this->_config['domain']) ? $this->_config['domain'] : '';
     $domainPattern = '/' . preg_quote($domain, '/') . '$/';

     // Iterate over all document links
     foreach ($links as $link) {
         // Try to parse URL - if we fail, skip this link (should not happen normally)
         if (!($parts = @parse_url($link))) {
             continue;
         }

         // Skip non-http schemes
         if (isset($parts['scheme']) && ($parts['scheme'] != 'http' && $parts['scheme'] != 'https')) {
             continue;
         }

         // Full URI
         if (isset($parts['host'])) {
             if (preg_match($domainPattern, $parts['host'])) {
                 $this->_addToQueue($link, $baseUrl);
             }

         // Partial URI
         } elseif (isset($parts['path'])) {
             try {
                 // Start from a copy of the document URL and replace the
                 // query and fragment with those of the link (or clear them)
                 $linkUri = clone $doc->getUrl();
                 $linkUri->setQuery(isset($parts['query']) ? $parts['query'] : null);
                 $linkUri->setFragment(isset($parts['fragment']) ? $parts['fragment'] : null);

                 // Full absolute path. strncmp() is used so an empty path is
                 // consistently treated as relative, without warnings.
                 if (strncmp($parts['path'], '/', 1) === 0) {
                     $linkUri->setPath($parts['path']);

                 // Relative path - resolve against the directory of the document path
                 } else {
                     $basePath = $doc->getUrl()->getPath();
                     $pos = strrpos($basePath, '/');
                     if ($pos === false) {
                         $linkUri->setPath('/' . $parts['path']);
                     } else {
                         $linkUri->setPath(substr($basePath, 0, $pos + 1) . $parts['path']);
                     }
                 }

                 $this->_addToQueue($linkUri, $baseUrl);

             // If any of the URL parts is invalid, an exception will be caught here
             } catch (Zend_Uri_Exception $e) {
                 $this->_log(array('link' => $link, 'message' => 'Unable to parse link URL: ' . $e->getMessage()));
             }
         }
     }
 }