/**
 * Exclude some HTML parts by class inside content wrapped with TYPO3SEARCH_begin and TYPO3SEARCH_end
 * markers.
 *
 * Every element carrying one of the configured class names (matched as a whole,
 * whitespace-separated class token) is removed together with its subtree.
 *
 * Note: when nothing has to be removed, the input is returned after html_entity_decode().
 * When elements are removed, the markup is returned as serialized by DOMDocument::saveHTML(),
 * with the XML preamble, doctype, newlines and the html/body wrapper tags stripped.
 *
 * @param string $indexableContent HTML markup
 * @return string HTML
 */
public function excludeContentByClass($indexableContent)
{
    if (empty(trim($indexableContent))) {
        return html_entity_decode($indexableContent);
    }

    $excludeClasses = $this->getConfiguration()->getIndexQueuePagesExcludeContentByClassArray();
    if (count($excludeClasses) === 0) {
        return html_entity_decode($indexableContent);
    }

    // Skip the DOM round trip when Util reports that none of the configured strings occur in the content.
    if (!Util::containsOneOfTheStrings($indexableContent, $excludeClasses)) {
        return html_entity_decode($indexableContent);
    }

    // Collect libxml warnings internally (real-world HTML is rarely valid) and make sure the
    // previous global setting is restored afterwards, so other code is not affected.
    $previousErrorHandling = libxml_use_internal_errors(true);
    try {
        $doc = new \DOMDocument('1.0', 'UTF-8');
        // The XML preamble makes libxml treat the markup as UTF-8.
        $doc->loadHTML('<?xml version="1.0" encoding="UTF-8"?>' . PHP_EOL . $indexableContent);
        $xpath = new \DOMXPath($doc);

        foreach ($excludeClasses as $excludePart) {
            $excludePart = trim((string)$excludePart);
            if ($excludePart === '') {
                // contains(x, '') is always true in XPath and would remove every element.
                continue;
            }

            // Match whole class tokens only: "foo" must not remove elements with class "foobar".
            $token = ' ' . $excludePart . ' ';

            // Build a valid XPath 1.0 string literal, whatever quotes the class name contains.
            if (strpos($token, "'") === false) {
                $literal = "'" . $token . "'";
            } elseif (strpos($token, '"') === false) {
                $literal = '"' . $token . '"';
            } else {
                $literal = "concat('" . str_replace("'", "', \"'\", '", $token) . "')";
            }

            $elements = $xpath->query(
                "//*[contains(concat(' ', normalize-space(@class), ' '), " . $literal . ')]'
            );
            // query() returns false for an expression it cannot evaluate.
            if ($elements === false || $elements->length === 0) {
                continue;
            }

            foreach ($elements as $element) {
                if ($element->parentNode !== null) {
                    $element->parentNode->removeChild($element);
                }
            }
        }

        $html = $doc->saveHTML($doc->documentElement->parentNode);
    } finally {
        // Free the buffered parser errors and restore the caller's libxml error handling.
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorHandling);
    }

    // remove XML-Preamble, newlines and doctype
    $cleaned = preg_replace('/(<\\?xml[^>]+\\?>|\\r?\\n|<!DOCTYPE.+?>)/imS', '', $html);
    if ($cleaned !== null) {
        // preg_replace() yields null on a PCRE error; keep the uncleaned markup in that case
        // instead of silently dropping the whole content.
        $html = $cleaned;
    }

    return str_replace(['<html>', '</html>', '<body>', '</body>'], '', $html);
}