/**
 * Method to parse HTML input and extract the plain text.
 *
 * The input is cleaned in a fixed order: invalid UTF-8 is dropped, <style>,
 * <noscript> and <head> blocks are removed together with <script> blocks,
 * HTML entities are decoded, the remaining tags are stripped, and the result
 * is handed to the parent parser.
 *
 * @param   string  $input  The input to parse.
 *
 * @return  string  The plain text input.
 *
 * @since   2.5
 */
public function parse($input)
{
	// Strip invalid UTF-8 characters. iconv() returns false on failure, so
	// keep the untouched input in that case rather than parsing `false`.
	$cleaned = iconv("utf-8", "utf-8//IGNORE", $input);

	if ($cleaned !== false)
	{
		$input = $cleaned;
	}

	// Convert <style>, <noscript> and <head> tags to <script> tags so we can
	// remove them all in one pass. The negative lookahead requires the tag
	// name to end right there; a bare prefix match would also rewrite
	// <header> into <scripter>, whose closing tag no longer matches
	// '</script>' in the removal step below.
	$converted = preg_replace('#<(/?)(?:style|noscript|head)(?![\w:-])#', '<${1}script', $input);

	// preg_replace() returns null on a PCRE error; fall back to the input.
	if ($converted !== null)
	{
		$input = $converted;
	}

	// Strip all script blocks.
	$input = $this->removeBlocks($input, '<script', '</script>');

	// Decode HTML entities.
	$input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');

	// Convert non-breaking spaces to actual spaces. After decoding, both
	// &nbsp; and &#160; are the UTF-8 sequence for U+00A0, written here as
	// an escape so the literal cannot be confused with a plain space.
	$input = str_replace("\xC2\xA0", ' ', $input);

	// This fixes issues such as '<h1>Title</h1><p>Paragraph</p>'
	// being transformed into 'TitleParagraph' with no space.
	$input = str_replace('>', '> ', $input);

	// Strip HTML tags.
	$input = strip_tags($input);

	return parent::parse($input);
}
/**
 * Convert raw input of a given format into plain text.
 *
 * Looks up the parser registered for the format and delegates the work to it.
 *
 * @param   string  $input   The raw input.
 * @param   string  $format  The format of the input. [optional]
 *
 * @return  string  The parsed input.
 *
 * @since   2.5
 * @throws  Exception on invalid parser.
 */
public static function parse($input, $format = 'html')
{
	// Resolve the parser that handles the requested format.
	$parser = FinderIndexerParser::getInstance($format);

	// Let that parser do the actual conversion.
	return $parser->parse($input);
}
/**
 * Checks that requesting a parser for an unknown format raises an exception.
 *
 * The expectation is declared through the annotation below, so the test
 * body only needs to trigger the lookup.
 *
 * @return  void
 *
 * @since   3.0
 *
 * @expectedException  Exception
 */
public function testGetInstance_noParser()
{
	// A format name for which no parser exists.
	$unknownFormat = 'noway';

	FinderIndexerParser::getInstance($unknownFormat);
}