/**
 * Turns HTML into the text a text-mode browser would display.
 * Images are replaced by their alt text, all other tags are
 * suppressed, entities are converted to their visible
 * representation and runs of whitespace collapse to one space.
 * @param string $html        HTML to convert.
 * @return string             Plain text.
 * @access public
 * @static
 */
function normalise($html) {
    // Each image pattern is confined to a single tag with [^>]* so that a
    // match can never run past the closing '>' into later markup (with
    // '.*?' an <img> followed by any other quoted alt attribute was
    // swallowed as one match and the text in between was lost). Negated
    // classes also match newlines, so tags wrapped over several lines
    // are handled, and the 'i' flag accepts <IMG ALT=...>.
    $text = preg_replace('|<img[^>]*?alt\\s*=\\s*"([^"]*)"[^>]*>|i', ' \\1 ', $html);
    $text = preg_replace('|<img[^>]*?alt\\s*=\\s*\'([^\']*)\'[^>]*>|i', ' \\1 ', $text);
    // Unquoted alt values: letters and underscores only.
    $text = preg_replace('|<img[^>]*?alt\\s*=\\s*([a-zA-Z_]+)[^>]*>|i', ' \\1 ', $text);
    // Drop every remaining tag, including ones spanning several lines.
    $text = preg_replace('|<[^>]*>|', '', $text);
    $text = SimpleSaxParser::decodeHtml($text);
    // Collapse whitespace last so padding added around alt text is tidied.
    $text = preg_replace('|\\s+|', ' ', $text);
    return trim($text);
}
/**
 * Prepares the fixture: a mock parser primed to return true
 * from each of its token callbacks, and a lexer created for
 * that mock by SimpleSaxParser::createLexer().
 */
function setUp() {
    // NOTE(review): "=& new" is PHP 4 era syntax; it is kept here so the
    // behaviour of this block is unchanged.
    $this->_handler =& new MockSimpleSaxParser($this);
    $callbacks = array(
            "acceptStartToken",
            "acceptEndToken",
            "acceptAttributeToken",
            "acceptEntityToken",
            "acceptTextToken",
            "ignore");
    foreach ($callbacks as $callback) {
        $this->_handler->setReturnValue($callback, true);
    }
    $this->_lexer =& SimpleSaxParser::createLexer($this->_handler);
}
/**
 * Hands the raw content to the parser, which sends events
 * into the page to be built.
 * @param string $raw                 Unparsed text.
 * @param SimpleSaxParser $parser     Event generator.
 * @return mixed                      Whatever $parser->parse() returns.
 * @access public
 */
function parse($raw, &$parser) {
    $outcome = $parser->parse($raw);
    return $outcome;
}
/**
 * Registers the patterns used while inside a tag, including
 * the attributes and their quoting.
 * @param SimpleLexer $lexer     Lexer to add patterns to.
 * @access private
 * @static
 */
function _addInTagTokens(&$lexer) {
    $mode = 'tag';
    // Content seen in the tag mode is handled by acceptStartToken.
    $lexer->mapHandler($mode, 'acceptStartToken');
    // Whitespace between attributes goes to the 'ignore' handler.
    $lexer->addSpecialPattern('\s+', $mode, 'ignore');
    SimpleSaxParser::_addAttributeTokens($lexer);
    // The closing bracket leaves the tag mode.
    $lexer->addExitPattern('>', $mode);
}
/**
 * Returns the page as a text browser would show it. The
 * normalised text is built from the raw content on demand
 * and kept in $this->_text for later calls.
 * @return string        Plain text of page.
 * @access public
 */
function getText() {
    // Reuse the stored text whenever it holds a truthy value.
    if ($this->_text) {
        return $this->_text;
    }
    $this->_text = SimpleSaxParser::normalise($this->_raw);
    return $this->_text;
}
/**
 * Returns the content reduced to its visible text, as a
 * text mode browser would render it: whitespace is
 * normalised and images become their alt text.
 * @return string        Content as plain text.
 * @access public
 */
function getText() {
    $visible = SimpleSaxParser::normalise($this->_content);
    return $visible;
}
/**
 * Checks that entity references in the input come out of
 * normalise() as the characters they stand for.
 */
function testHtmlEntityTranslation() {
    // The input has to be entity-encoded. A literal '<>' is removed by
    // normalise() as an empty tag, so raw characters can never come
    // back unchanged and would not exercise entity decoding at all.
    $this->assertEqual(
            SimpleSaxParser::normalise('&lt;&gt;&quot;&amp;'),
            '<>"&');
}