Exemplo n.º 1
0
 /**
  * Verifies that the static "exclude nofollow links" switch of
  * Zend_Search_Lucene_Document_Html controls whether links marked with
  * rel="nofollow" are reported by getLinks().
  *
  * The switch is static (shared) state, so the value found on entry is put
  * back before any assertion runs; a failing assertion therefore cannot leak
  * a changed setting into other tests.
  */
 public function testHtmlNoFollowLinks()
 {
     $html = '<HTML>'
           . '<HEAD><TITLE>Page title</TITLE></HEAD>'
           . '<BODY>'
           . 'Document body.'
           . '<a href="link1.html">Link 1</a>.'
           . '<a href="link2.html" rel="nofollow">Link 1</a>.'
           . '</BODY>'
           . '</HTML>';

     $oldNoFollowValue = Zend_Search_Lucene_Document_Html::getExcludeNoFollowLinks();

     // Load the same markup once with each setting and capture the results
     // while the respective setting is still active.
     Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(false);
     $doc1 = Zend_Search_Lucene_Document_Html::loadHTML($html);
     $linksWithNoFollow = array_values($doc1->getLinks());

     Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
     $doc2 = Zend_Search_Lucene_Document_Html::loadHTML($html);
     $linksWithoutNoFollow = array_values($doc2->getLinks());

     // Restore the shared setting before asserting anything.
     Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);

     $this->assertTrue($doc1 instanceof Zend_Search_Lucene_Document_Html);
     $this->assertEquals(array('link1.html', 'link2.html'), $linksWithNoFollow);

     $this->assertTrue($doc2 instanceof Zend_Search_Lucene_Document_Html);
     $this->assertEquals(array('link1.html'), $linksWithoutNoFollow);
 }
Exemplo n.º 2
0
 /**
  * Processes a fetched HTML page: hands its links to processFoundLink() and
  * queues the page for indexing.
  *
  * If checkForCanonical() yields a link different from $link, only that
  * canonical link is processed and the page itself is skipped. Otherwise the
  * links reported by the Lucene HTML document are processed (none when the
  * robots meta info contains "nofollow"), and the page is added to the index
  * unless the robots meta info contains "noindex". When both search indicators
  * are configured and the start indicator occurs in the page, only the part up
  * to and including the <head> block plus the snippets enclosed by the
  * indicators are indexed.
  *
  * @param  string $link
  * @param  Zend_Http_Response $response
  * @param  string $host
  * @param  string $protocol
  * @param  Zend_Http_CookieJar $cookieJar
  * @param  integer $depth
  * @return boolean currently always true
  */
 protected function parseHtml($link, $response, $host, $protocol, $cookieJar, $depth)
 {
     $html = $response->getBody();
     $canonicalLink = $this->checkForCanonical($html);
     if ($canonicalLink && $canonicalLink != $link) {
         $this->processFoundLink($canonicalLink, $protocol, $host, $link, $depth, $cookieJar);
         logger::debug(get_class($this) . ": Stopping to parse html at [ {$link} ], processing canonical link [ {$canonicalLink} ] instead");
         return true;
     }
     //TODO: robots.txt
     Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
     $doc = Zend_Search_Lucene_Document_Html::loadHTML($html, false, "utf-8");
     $links = $doc->getLinks();
     $robotsMeta = $this->getRobotsMetaInfo($html);
     if (in_array("nofollow", $robotsMeta)) {
         //no links to follow
         $links = array();
         logger::debug(get_class($this) . ": not following links on [ {$link} ] because it has robots nofollow");
     }
     if (!in_array("noindex", $robotsMeta)) {
         //limit to the search content area only if both indicators are set and the
         //start indicator is found in this document; always defined, so the check
         //below never reads an unset variable
         $documentHasDelimiter = !empty($this->searchStartIndicator)
             && !empty($this->searchEndIndicator)
             && strpos($html, $this->searchStartIndicator) !== false;
         if ($documentHasDelimiter) {
             //keep everything up to and including the first <head>...</head> block;
             //matching case-insensitively and with attributes avoids copying the
             //whole document when the tag is not written as a literal "<head>"
             $htmlHead = array();
             if (preg_match('@<head[^>]*?>.*?</head>@si', $html, $htmlHead, PREG_OFFSET_CAPTURE)) {
                 $head = substr($html, 0, $htmlHead[0][1]) . $htmlHead[0][0];
             } else {
                 //no complete head block: fall back to an empty one
                 $top = explode("<head>", $html);
                 $head = $top[0] . "<head></head>";
             }
             //get snippets within allowed content areas (whitespace collapsed first)
             $htmlSnippets = array();
             $minified = str_replace(array("\r\n", "\r", "\n"), "", $html);
             $minified = preg_replace('@[ \\t\\n\\r\\f]+@', " ", $minified);
             //the indicators are literal markers (see strpos above), so escape them
             $snippetPattern = '%' . preg_quote($this->searchStartIndicator, '%')
                 . '(.*?)'
                 . preg_quote($this->searchEndIndicator, '%') . '%si';
             preg_match_all($snippetPattern, $minified, $htmlSnippets);
             $html = $head;
             if (!empty($htmlSnippets[0])) {
                 foreach ($htmlSnippets[0] as $snippet) {
                     $html .= " " . $snippet;
                 }
             }
             //close html tag
             $html .= "</html>";
         }
         $this->addHtmlToIndex($html, $link, $this->getLanguageFromResponse($response), $this->getEncodingFromResponse($response), $host);
         logger::info(get_class($this) . ": Added to indexer stack [ {$link} ]");
     } else {
         $this->addNoIndexPage($link);
         logger::debug(get_class($this) . ": not indexing [ {$link} ] because it has robots noindex");
     }
     if (count($links) > 0) {
         foreach ($links as $foundLink) {
             $this->processFoundLink($foundLink, $protocol, $host, $link, $depth, $cookieJar);
         }
     } else {
         logger::debug(get_class($this) . ": No links found on page at [ {$link} ] ");
     }
     //TODO: for now we always return true - as success ... are there any unsuccessful states?
     return true;
 }
Exemplo n.º 3
0
 /**
  * Switches Zend_Search_Lucene_Document_Html to exclude nofollow links and
  * creates the HTTP client with a timeout of 10 and keep-alive enabled.
  */
 public function __construct()
 {
     Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);

     $httpClient = new Zend_Http_Client();
     $httpClient->setConfig(array(
         'timeout'   => 10,
         'keepalive' => true,
     ));
     $this->_client = $httpClient;
 }
 /**
  * Extracts meta data and indexable markup from a crawled page and passes
  * them to addHtmlToIndex().
  *
  * Pages with a canonical link element or a meta element whose content is
  * exactly "nofollow" are skipped. When the start/end indicators are
  * configured and the start indicator occurs in the page, only the content
  * between them is indexed; content between the exclude indicators is
  * replaced by a space inside those snippets.
  *
  * @param  string $link     link of the page being parsed
  * @param  mixed  $response object exposing getResponse() and getCrawler()
  * @param  string $host
  * @return boolean FALSE when the page was skipped, TRUE when it was queued
  */
 private function parseHtml($link, $response, $host)
 {
     $resource = $response->getResponse();
     $crawler = $response->getCrawler();
     $html = $resource->getBody();
     $language = $this->getLanguageFromResponse($resource, $html);
     $encoding = $this->getEncodingFromResponse($resource, $html);

     //page has canonical link: do not track!
     if ($crawler->filterXpath('//link[@rel="canonical"]')->count() > 0) {
         \Pimcore\Logger::debug('LuceneSearch: not indexing [ ' . $link . ' ] because it has canonical links');
         return FALSE;
     }

     //page has no follow: do not track!
     //NOTE(review): only matches content="nofollow" exactly, not e.g. "noindex, nofollow" - confirm intent
     if ($crawler->filterXpath('//meta[@content="nofollow"]')->count() > 0) {
         \Pimcore\Logger::debug('LuceneSearch: not indexing [ ' . $link . ' ] because it has robots nofollow');
         return FALSE;
     }

     //query each node list once and reuse it for both the existence check and the value
     $countryNodes = $crawler->filterXpath('//meta[@name="country"]');
     $titleNodes = $crawler->filterXpath('//title');
     $descriptionNodes = $crawler->filterXpath('//meta[@name="description"]');
     $restrictionNodes = $crawler->filterXpath('//meta[@name="m:groups"]');

     $country = $countryNodes->count() > 0 ? $countryNodes->attr('content') : FALSE;
     $title = $titleNodes->count() > 0 ? $titleNodes->text() : '';
     $description = $descriptionNodes->count() > 0 ? $descriptionNodes->attr('content') : '';
     $restrictions = $restrictionNodes->count() > 0 ? $restrictionNodes->attr('content') : FALSE;

     \Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);

     //limit to search content area if both indicators are set and the start one is found
     $documentHasDelimiter = !empty($this->searchStartIndicator)
         && !empty($this->searchEndIndicator)
         && strpos($html, $this->searchStartIndicator) !== FALSE;

     //remove content between exclude indicators if both are set and the start one is found
     $documentHasExcludeDelimiter = !empty($this->searchExcludeStartIndicator)
         && !empty($this->searchExcludeEndIndicator)
         && strpos($html, $this->searchExcludeStartIndicator) !== FALSE;

     if ($documentHasDelimiter) {
         //indicators are literal markers (see strpos above): escape them, including the delimiter
         $includePattern = '%' . preg_quote($this->searchStartIndicator, '%')
             . '(.*?)'
             . preg_quote($this->searchEndIndicator, '%') . '%si';
         $htmlSnippets = array();
         preg_match_all($includePattern, $html, $htmlSnippets);

         $excludePattern = NULL;
         if ($documentHasExcludeDelimiter) {
             $excludePattern = '#(' . preg_quote($this->searchExcludeStartIndicator, '#')
                 . ')(.*?)('
                 . preg_quote($this->searchExcludeEndIndicator, '#') . ')#si';
         }

         //only the captured content between the indicators is kept
         $html = '';
         if (!empty($htmlSnippets[1])) {
             foreach ($htmlSnippets[1] as $snippet) {
                 if ($excludePattern !== NULL) {
                     $snippet = preg_replace($excludePattern, ' ', $snippet);
                 }
                 $html .= ' ' . $snippet;
             }
         }
     }

     $this->addHtmlToIndex($html, $title, $description, $link, $language, $country, $restrictions, $encoding, $host);
     \Pimcore\Logger::debug('LuceneSearch: Added to indexer stack [ ' . $link . ' ]');
     return TRUE;
 }