/**
 * Handle document - fetch links out of the document and add them to the
 * queue
 *
 * The document URL is always recorded in the list of visited targets.
 * Only HTML documents are scanned for links; any other document type is
 * silently skipped after being recorded.
 *
 * Which link sources are followed is controlled by the 'followhref',
 * 'followimg', 'followlink', 'followscript' and 'followframes'
 * configuration flags. Links carrying a host are queued only when the host
 * ends with the configured 'domain' (any host is accepted when no domain
 * is configured). Links without a host are resolved against the URL of the
 * current document before being queued.
 *
 * @param Spizer_Document $doc Document to handle
 */
public function handle(Spizer_Document $doc)
{
    // If needed, set the match domain according to the first URL
    if (!isset($this->_config['domain']) && $this->_config['samedomain']) {
        $this->_config['domain'] = $this->_engine->getBaseUri()->getHost();
    }

    // Add document URL to the list of visited pages
    $baseUrl = (string) $doc->getUrl();
    if (!in_array($baseUrl, $this->_targets)) {
        $this->_targets[] = $baseUrl;
    }

    // Silently skip all non-HTML documents
    if (!$doc instanceof Spizer_Document_Html) {
        return;
    }

    // Fetch links out of the document
    $links = array();
    if ($this->_config['followhref']) {
        $links = array_merge($links, $doc->getLinks());
    }
    if ($this->_config['followimg']) {
        $links = array_merge($links, $doc->getImages());
    }
    if ($this->_config['followlink']) {
        $links = array_merge($links, $doc->getHeaderLinks());
    }
    if ($this->_config['followscript']) {
        $links = array_merge($links, $doc->getScriptLinks());
    }
    if ($this->_config['followframes']) {
        $links = array_merge($links, $doc->getFrameLinks());
    }

    // Build the host-matching pattern once, outside the loop. With no
    // domain configured the pattern degenerates to '/$/' which matches any
    // host - the same outcome as before, but without reading an undefined
    // array key on every link.
    $domain      = isset($this->_config['domain']) ? $this->_config['domain'] : '';
    $hostPattern = '/' . preg_quote($domain, '/') . '$/';

    // Iterate over all document links
    foreach ($links as $link) {
        // Try to parse URL - if we fail, skip this link (should not happen normally)
        if (!($parts = @parse_url($link))) {
            continue;
        }

        // Skip non-http schemes
        if (isset($parts['scheme']) && $parts['scheme'] !== 'http' && $parts['scheme'] !== 'https') {
            continue;
        }

        // Full URI
        if (isset($parts['host'])) {
            if (preg_match($hostPattern, $parts['host'])) {
                $this->_addToQueue($link, $baseUrl);
            }

        // Partial URI
        } elseif (isset($parts['path'])) {
            try {
                // Start from a copy of the document URL, then replace the
                // query and fragment with those of the link so that nothing
                // is inherited from the referring document.
                $linkUri = clone $doc->getUrl();
                $linkUri->setQuery(isset($parts['query']) ? $parts['query'] : null);
                $linkUri->setFragment(isset($parts['fragment']) ? $parts['fragment'] : null);

                // Full absolute path. Strict comparison: substr_compare()
                // may return false on error, which must not count as a match.
                if (substr_compare($parts['path'], '/', 0, 1) === 0) {
                    $linkUri->setPath($parts['path']);

                // Relative path - resolve against the directory of the
                // current document
                } else {
                    $basePath = $doc->getUrl()->getPath();
                    $pos = strrpos($basePath, '/');
                    if ($pos === false) {
                        $linkUri->setPath('/' . $parts['path']);
                    } else {
                        $linkUri->setPath(substr($basePath, 0, $pos + 1) . $parts['path']);
                    }
                }

                $this->_addToQueue($linkUri, $baseUrl);

            // If any of the URL parts is invalid, an exception will be caught here
            } catch (Zend_Uri_Exception $e) {
                $this->_log(array(
                    'link'    => $link,
                    'message' => 'Unable to parse link URL: ' . $e->getMessage()
                ));
            }
        }
    }
}