/**
 * Checks that getLinks() honours the static "exclude nofollow links" switch:
 * with the switch off both links are returned, with it on the link carrying
 * rel="nofollow" is dropped.
 *
 * The switch is static state on Zend_Search_Lucene_Document_Html, so the value
 * found on entry is put back before the test returns or fails; otherwise the
 * setting would leak into whatever runs afterwards.
 */
public function testHtmlNoFollowLinks()
{
    $html = '<HTML>'
          . '<HEAD><TITLE>Page title</TITLE></HEAD>'
          . '<BODY>'
          . 'Document body.'
          . '<a href="link1.html">Link 1</a>.'
          . '<a href="link2.html" rel="nofollow">Link 1</a>.'
          . '</BODY>'
          . '</HTML>';

    $oldNoFollowValue = Zend_Search_Lucene_Document_Html::getExcludeNoFollowLinks();

    try {
        // Switch off: nofollow links are reported like any other link.
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(false);
        $doc1 = Zend_Search_Lucene_Document_Html::loadHTML($html);
        $this->assertTrue($doc1 instanceof Zend_Search_Lucene_Document_Html);
        $this->assertEquals(array('link1.html', 'link2.html'), array_values($doc1->getLinks()));

        // Switch on: only the link without rel="nofollow" remains.
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
        $doc2 = Zend_Search_Lucene_Document_Html::loadHTML($html);
        $this->assertTrue($doc2 instanceof Zend_Search_Lucene_Document_Html);
        $this->assertEquals(array('link1.html'), array_values($doc2->getLinks()));
    } catch (Exception $e) {
        // catch + rethrow instead of "finally" to stay runnable on PHP < 5.5.
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
        throw $e;
    }

    Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
}
/**
 * Handles one fetched HTML page: follows a differing canonical link instead of
 * the page itself, otherwise collects the page's links, pushes the page onto
 * the indexer stack unless its robots meta says "noindex", and hands every
 * collected link to processFoundLink().
 *
 * @param string $link URL of the page being parsed
 * @param Zend_Http_Response $response
 * @param string $host
 * @param string $protocol
 * @param Zend_Http_CookieJar $cookieJar
 * @param integer $depth
 * @return boolean always true
 */
protected function parseHtml($link, $response, $host, $protocol, $cookieJar, $depth)
{
    $html = $response->getBody();

    // A canonical link pointing elsewhere replaces this page entirely.
    $canonicalLink = $this->checkForCanonical($html);
    if ($canonicalLink && $canonicalLink != $link) {
        $this->processFoundLink($canonicalLink, $protocol, $host, $link, $depth, $cookieJar);
        logger::debug(get_class($this) . ": Stopping to parse html at [ {$link} ], processing canonical link [ {$canonicalLink} ] instead");
        return true;
    }

    //TODO: robots.txt

    // Links are taken from the full document, before it is cut down to the
    // search content area below.
    Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
    $doc = Zend_Search_Lucene_Document_Html::loadHTML($html, false, "utf-8");
    $links = $doc->getLinks();

    $robotsMeta = $this->getRobotsMetaInfo($html);
    if (in_array("nofollow", $robotsMeta)) {
        //no links to follow
        $links = array();
        logger::debug(get_class($this) . ": not following links on [ {$link} ] because it has robots nofollow");
    }

    if (!in_array("noindex", $robotsMeta)) {
        // Always defined, even when no start indicator is configured
        // (previously an undefined variable in that case).
        $documentHasDelimiter = !empty($this->searchStartIndicator)
            && strpos($html, $this->searchStartIndicator) !== FALSE;

        //now limit to search content area if indicators are set and found in this document
        if ($documentHasDelimiter && !empty($this->searchEndIndicator)) {
            //get part before html head starts
            $top = explode("<head>", $html);

            //get html head
            $htmlHead = array();
            preg_match_all('@(<head[^>]*?>.*?</head>)@si', $html, $htmlHead);

            // Fall back to an empty head unless a head element was really
            // matched; $htmlHead[0] is an array even when it has no entries.
            $head = $top[0] . "<head></head>";
            if (!empty($htmlHead[0])) {
                $head = $top[0] . $htmlHead[0][0];
            }

            //get snippets within allowed content areas
            $htmlSnippets = array();
            $minified = str_replace(array("\r\n", "\r", "\n"), "", $html);
            $minified = preg_replace('@[ \\t\\n\\r\\f]+@', " ", $minified);

            // NOTE(review): the indicators are used as raw regex fragments here
            // while the check above treats the start indicator as a literal.
            preg_match_all('%' . $this->searchStartIndicator . '(.*?)' . $this->searchEndIndicator . '%si', $minified, $htmlSnippets);

            $html = $head;
            if (!empty($htmlSnippets[0])) {
                // Index 0 holds the full matches, i.e. snippets including
                // their start and end indicators.
                foreach ($htmlSnippets[0] as $snippet) {
                    $html .= " " . $snippet;
                }
            }

            //close html tag
            $html .= "</html>";
        }

        $this->addHtmlToIndex($html, $link, $this->getLanguageFromResponse($response), $this->getEncodingFromResponse($response), $host);
        logger::info(get_class($this) . ": Added to indexer stack [ {$link} ]");
    } else {
        $this->addNoIndexPage($link);
        logger::debug(get_class($this) . ": not indexing [ {$link} ] because it has robots noindex");
    }

    if (count($links) > 0) {
        foreach ($links as $foundLink) {
            $this->processFoundLink($foundLink, $protocol, $host, $link, $depth, $cookieJar);
        }
    } else {
        logger::debug(get_class($this) . ": No links found on page at [ {$link} ] ");
    }

    //TODO: for now we always return true - as success ... are there any unsuccessful states?
    return true;
}
/**
 * Turns on exclusion of nofollow links for Zend_Search_Lucene_Document_Html
 * (a static, class-wide setting) and creates the HTTP client stored in
 * $this->_client, configured with a timeout of 10 and keepalive enabled.
 */
public function __construct()
{
    Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);

    $clientConfig = array(
        'timeout'   => 10,
        'keepalive' => true,
    );

    $httpClient = new Zend_Http_Client();
    $this->_client = $httpClient;
    $this->_client->setConfig($clientConfig);
}
/**
 * Extracts meta data and indexable content from a crawled page and passes it
 * to addHtmlToIndex().
 *
 * Pages carrying a canonical link element or a meta element whose content is
 * exactly "nofollow" are skipped. If a search start indicator is configured
 * and present in the page, only the text between start and end indicators is
 * indexed, minus anything between the exclude indicators.
 *
 * @param string $link URL of the page, used for logging and indexing
 * @param object $response wrapper providing getResponse() and getCrawler()
 * @param string $host
 * @return bool FALSE if the page was skipped, TRUE once it was handed to the indexer
 */
private function parseHtml($link, $response, $host)
{
    $resource = $response->getResponse();
    $crawler = $response->getCrawler();
    $html = $resource->getBody();

    $language = $this->getLanguageFromResponse($resource, $html);
    $encoding = $this->getEncodingFromResponse($resource, $html);

    //page has canonical link: do not track!
    if ($crawler->filterXpath('//link[@rel="canonical"]')->count() > 0) {
        \Pimcore\Logger::debug('LuceneSearch: not indexing [ ' . $link . ' ] because it has canonical links');
        return FALSE;
    }

    //page has no follow: do not track!
    // NOTE(review): exact match only; content="noindex, nofollow" is not caught.
    if ($crawler->filterXpath('//meta[@content="nofollow"]')->count() > 0) {
        // Message now names the directive actually tested (it said "noindex").
        \Pimcore\Logger::debug('LuceneSearch: not indexing [ ' . $link . ' ] because it has robots nofollow');
        return FALSE;
    }

    // Each node list is queried once and reused for both the existence check
    // and the value lookup.
    $countryNodes = $crawler->filterXpath('//meta[@name="country"]');
    $titleNodes = $crawler->filterXpath('//title');
    $descriptionNodes = $crawler->filterXpath('//meta[@name="description"]');
    $restrictionNodes = $crawler->filterXpath('//meta[@name="m:groups"]');

    $country = $countryNodes->count() > 0 ? $countryNodes->attr('content') : FALSE;
    $title = $titleNodes->count() > 0 ? $titleNodes->text() : '';
    $description = $descriptionNodes->count() > 0 ? $descriptionNodes->attr('content') : '';
    $restrictions = $restrictionNodes->count() > 0 ? $restrictionNodes->attr('content') : FALSE;

    \Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);

    //now limit to search content area if indicators are set and found in this document
    $documentHasDelimiter = !empty($this->searchStartIndicator)
        && strpos($html, $this->searchStartIndicator) !== FALSE;

    //remove content between exclude indicators
    $documentHasExcludeDelimiter = !empty($this->searchExcludeStartIndicator)
        && strpos($html, $this->searchExcludeStartIndicator) !== FALSE;

    if ($documentHasDelimiter && !empty($this->searchEndIndicator)) {
        // NOTE(review): start/end indicators are used as raw regex fragments
        // here, unlike the exclude indicators, which are quoted.
        $htmlSnippets = array();
        preg_match_all('%' . $this->searchStartIndicator . '(.*?)' . $this->searchEndIndicator . '%si', $html, $htmlSnippets);

        // Built once, outside the loop. preg_quote() gets the '#' delimiter so
        // an indicator containing '#' cannot terminate the pattern early.
        $excludePattern = null;
        if ($documentHasExcludeDelimiter && !empty($this->searchExcludeEndIndicator)) {
            $excludePattern = '#(' . preg_quote($this->searchExcludeStartIndicator, '#')
                . ')(.*?)('
                . preg_quote($this->searchExcludeEndIndicator, '#') . ')#si';
        }

        $html = '';

        // Index 1 holds the captured text between the indicators; guard
        // against it being absent if the match itself failed.
        if (!empty($htmlSnippets[1])) {
            foreach ($htmlSnippets[1] as $snippet) {
                if ($excludePattern !== null) {
                    $snippet = preg_replace($excludePattern, ' ', $snippet);
                }

                $html .= ' ' . $snippet;
            }
        }
    }

    $this->addHtmlToIndex($html, $title, $description, $link, $language, $country, $restrictions, $encoding, $host);

    \Pimcore\Logger::debug('LuceneSearch: Added to indexer stack [ ' . $link . ' ]');

    return TRUE;
}