Example #1
0
 /**
  *    Turns HTML into the plain text a text mode browser would
  *    show. Images are replaced by their alt text (double quoted,
  *    single quoted or a bare word), every other tag is suppressed,
  *    entities are decoded by SimpleSaxParser::decodeHtml() and
  *    runs of whitespace collapse to a single space.
  *
  *    Tag matching is case insensitive, may span line breaks and
  *    never reaches past the closing ">" of the tag being matched.
  *    @param string $html        HTML to convert.
  *    @return string             Plain text.
  *    @access public
  *    @static
  */
 function normalise($html)
 {
     // [^>]* keeps every match inside a single tag, so an <img>
     // without alt text cannot borrow the alt attribute of a later
     // tag and swallow the content in between. Unlike ".", the
     // negated classes also match line breaks inside a tag.
     $text = preg_replace('|<img[^>]*?alt\\s*=\\s*"([^"]*)"[^>]*>|i', ' \\1 ', $html);
     $text = preg_replace('|<img[^>]*?alt\\s*=\\s*\'([^\']*)\'[^>]*>|i', ' \\1 ', $text);
     $text = preg_replace('|<img[^>]*?alt\\s*=\\s*([a-zA-Z_]+)[^>]*>|i', ' \\1 ', $text);
     // Whatever tags remain (including images with no alt) vanish.
     $text = preg_replace('|<[^>]*>|', '', $text);
     $text = SimpleSaxParser::decodeHtml($text);
     // Collapse whitespace last, as dropping tags can leave gaps.
     $text = preg_replace('|\\s+|', ' ', $text);
     return trim($text);
 }
Example #2
0
 /**
  *    Builds a mock parser whose token callbacks all return
  *    true and a lexer created from that mock.
  */
 function setUp()
 {
     $this->_handler =& new MockSimpleSaxParser($this);
     // Stub each callback in turn, so the registration order
     // stays the same as listing the calls one by one.
     $callbacks = array(
             "acceptStartToken",
             "acceptEndToken",
             "acceptAttributeToken",
             "acceptEntityToken",
             "acceptTextToken",
             "ignore");
     foreach ($callbacks as $callback) {
         $this->_handler->setReturnValue($callback, true);
     }
     $this->_lexer =& SimpleSaxParser::createLexer($this->_handler);
 }
Example #3
0
 /**
  *    Hands the raw content to the supplied parser, which
  *    sends the events that build the page, and passes the
  *    parser's own result back to the caller.
  *    @param string $raw                 Unparsed text.
  *    @param SimpleSaxParser $parser     Event generator.
  *    @return mixed                      Whatever $parser->parse()
  *                                       returned.
  *    @access public
  */
 function parse($raw, &$parser)
 {
     $outcome = $parser->parse($raw);
     return $outcome;
 }
Example #4
0
 /**
  *    Registers the lexer patterns used while inside a tag,
  *    including the attributes and their quoting. The calls
  *    are made in a fixed order: handler mapping, whitespace,
  *    attribute patterns, then the closing ">".
  *    @param SimpleLexer $lexer    Lexer to add patterns to.
  *    @access private
  *    @static
  */
 function _addInTagTokens(&$lexer) {
     // Content lexed in the 'tag' mode goes to acceptStartToken.
     $lexer->mapHandler('tag', 'acceptStartToken');
     // Whitespace inside the tag is handed to 'ignore'.
     $lexer->addSpecialPattern('\s+', 'tag', 'ignore');
     // Attribute patterns are registered by a sibling helper.
     SimpleSaxParser::_addAttributeTokens($lexer);
     // NOTE(review): presumably ">" ends the 'tag' mode; confirm
     // against SimpleLexer::addExitPattern().
     $lexer->addExitPattern('>', 'tag');
 }
Example #5
0
 /**
  *    Accessor for the plain text of the page as a text
  *    browser would see it. The normalised text is kept in
  *    $this->_text and reused on later calls. A falsy result
  *    (such as an empty string) is not treated as cached and
  *    is worked out again on the next call.
  *    @return string        Plain text of page.
  *    @access public
  */
 function getText()
 {
     if ($this->_text) {
         return $this->_text;
     }
     $this->_text = SimpleSaxParser::normalise($this->_raw);
     return $this->_text;
 }
Example #6
0
 /**
  *    Accessor for the content as plain text. The stored
  *    content is passed through SimpleSaxParser::normalise()
  *    on every call; nothing is cached here.
  *    @return string       Content as plain text.
  *    @access public
  */
 function getText()
 {
     $visible = SimpleSaxParser::normalise($this->_content);
     return $visible;
 }
Example #7
0
 /**
  *    The entities for < > " and & should come back from
  *    normalise() as the literal characters.
  */
 function testHtmlEntityTranslation()
 {
     $entities = '&lt;&gt;&quot;&amp;';
     $this->assertEqual(SimpleSaxParser::normalise($entities), '<>"&');
 }