示例#1
0
 /**
  * Method to parse input and extract the plain text. Because this method is
  * called from both inside and outside the indexer, it needs to be able to
  * batch out its parsing functionality to deal with the inefficiencies of
  * regular expressions. We will parse recursively in 2KB chunks.
  *
  * @param   string  $input  The input to parse.
  *
  * @return  string  The plain text input.
  *
  * @since   2.5
  */
 public function parse($input)
 {
     // Reduces an HTML fragment to plain text: drop invalid bytes, remove
     // non-content blocks, decode entities, strip the remaining tags and
     // hand the result to the parent parser.

     // Strip invalid UTF-8 characters. iconv() returns false when it cannot
     // convert the string; keep the original input in that case instead of
     // letting false collapse the whole document into an empty string.
     $clean = iconv("utf-8", "utf-8//IGNORE", $input);

     if ($clean !== false)
     {
         $input = $clean;
     }

     // Convert <style>, <noscript> and <head> tags to <script> tags so we
     // can remove them efficiently. The lookahead requires the tag name to
     // end right there, so <header> is not mistaken for <head>, and the
     // match is case-insensitive because HTML tag names are.
     $converted = preg_replace('#<(/?)(?:style|noscript|head)(?=[\s/>])#i', '<${1}script', $input);

     if ($converted !== null)
     {
         $input = $converted;
     }

     // Strip all script blocks.
     $input = $this->removeBlocks($input, '<script', '</script>');

     // Decode HTML entities.
     $input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');

     // Convert non-breaking spaces to actual spaces. Decoding has already
     // turned &nbsp; and &#160; into the UTF-8 bytes C2 A0, so that sequence
     // must be replaced too; the entity forms are kept for input that was
     // double-encoded (e.g. &amp;nbsp;).
     $input = str_replace(array('&nbsp;', '&#160;', "\xC2\xA0"), ' ', $input);

     // Follow every '>' with a space so '<h1>Title</h1><p>Paragraph</p>'
     // does not collapse into 'TitleParagraph' once the tags are gone.
     $input = str_replace('>', '> ', $input);

     // Strip HTML tags.
     $input = strip_tags($input);

     return parent::parse($input);
 }
 /**
  * Method to parse input into plain text.
  *
  * @param   string  $input   The raw input.
  * @param   string  $format  The format of the input. [optional]
  *
  * @return  string  The parsed input.
  *
  * @since   2.5
  * @throws  Exception on invalid parser.
  */
 public static function parse($input, $format = 'html')
 {
     // Look up the parser instance for the requested format first...
     $parser = FinderIndexerParser::getInstance($format);

     // ...then let it do the work and hand its result straight back.
     return $parser->parse($input);
 }
 /**
  * Tests the getInstance method with a non-existing parser
  *
  * @return  void
  *
  * @since   3.0
  *
  * @expectedException  Exception
  */
 public function testGetInstance_noParser()
 {
     // Request a parser for a format name that is not meant to exist; the
     // docblock annotation above declares the Exception this should raise.
     $unknownFormat = 'noway';

     FinderIndexerParser::getInstance($unknownFormat);
 }