/**
 * Method to parse HTML input and extract the plain text.
 *
 * Because this method is called from both inside and outside the indexer,
 * it avoids heavy regular expressions and relies on plain string functions
 * to strip markup. The cleaned text is then handed to the parent parser
 * (the 2KB chunked parsing mentioned historically is presumably performed
 * by parent::parse() - it is not visible in this method).
 *
 * Processing steps, in order:
 *  1. Drop invalid UTF-8 byte sequences.
 *  2. Rewrite <style>, <noscript> and <head> tags to <script> tags so a
 *     single removeBlocks() pass can discard all of them.
 *  3. Decode HTML entities and normalise non-breaking spaces.
 *  4. Insert a space after every '>' and strip the remaining tags.
 *
 * @param   string  $input  The input to parse.
 *
 * @return  string  The plain text input.
 *
 * @since   2.5
 */
public function parse($input)
{
    // Strip invalid UTF-8 characters. iconv() may return false (even with
    // //IGNORE on some iconv implementations); never let that false replace
    // the input, otherwise the whole document would be silently discarded.
    $clean = iconv('utf-8', 'utf-8//IGNORE', $input);

    if ($clean === false && function_exists('mb_convert_encoding'))
    {
        // Fall back to mbstring, dropping invalid sequences instead of
        // substituting them, and restore the previous setting afterwards.
        $previous = mb_substitute_character();
        mb_substitute_character('none');
        $clean = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
        mb_substitute_character($previous);
    }

    if (is_string($clean))
    {
        $input = $clean;
    }

    // Convert <style>, <noscript> and <head> tags to <script> tags so we can
    // remove them efficiently. strtr() tries the longest key first and never
    // rescans replaced text, so the identity pairs for <header>/</header>
    // shield the HTML5 header element from the shorter '<head' prefix.
    $tagMap = array(
        '<header'    => '<header',
        '</header'   => '</header',
        '<style'     => '<script',
        '</style'    => '</script',
        '<noscript'  => '<script',
        '</noscript' => '</script',
        '<head'      => '<script',
        '</head'     => '</script',
    );
    $input = strtr($input, $tagMap);

    // Strip all script blocks.
    $input = $this->removeBlocks($input, '<script', '</script>');

    // Decode HTML entities.
    $input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');

    // Convert non-breaking spaces to regular spaces. After decoding,
    // &nbsp; and &#160; are both the UTF-8 sequence C2 A0 (U+00A0); the
    // bytes are spelled out so the match does not depend on invisible
    // characters in this source file.
    $input = str_replace("\xC2\xA0", ' ', $input);

    // This fixes issues such as '<h1>Title</h1><p>Paragraph</p>'
    // being transformed into 'TitleParagraph' with no space.
    $input = str_replace('>', '> ', $input);

    // Strip HTML tags.
    $input = strip_tags($input);

    return parent::parse($input);
}