/**
 *    Builds a page from a fetched response. A fresh page is
 *    created from the response, the response content is run
 *    through a SimpleHtmlSaxParser constructed around this
 *    object, the page end is signalled, and the builder is
 *    freed before the finished page is handed back.
 *    @param $response SimpleHttpResponse  Fetched response.
 *    @return SimplePage                   Newly parsed page.
 *    @access public
 */
function parse($response) {
    // Start from a clean slate for this response.
    $this->tags = array();
    $this->page = $this->createPage($response);

    $sax = new SimpleHtmlSaxParser($this);
    $sax->parse($response->getContent());
    $this->acceptPageEnd();

    // Keep hold of the page before free() is called on the builder.
    $built = $this->page;
    $this->free();
    return $built;
}
/**
 *    Checks that normalise() turns HTML entities into the
 *    characters they stand for.
 */
function testHtmlEntityTranslation() {
    // The input must be the encoded entities: raw "<>" would be
    // removed as a tag by normalise(), and an unescaped single
    // quote would terminate the string literal early.
    $this->assertEqual(
            SimpleHtmlSaxParser::normalise('&lt;&gt;&quot;&amp;&#039;'),
            '<>"&\'');
}
/**
 *    Accessor for the plain text of the page, as a text
 *    browser would show it. The text is derived from the raw
 *    content with SimpleHtmlSaxParser::normalise() and kept
 *    in $this->_text for later calls.
 *    @return string        Plain text of page.
 *    @access public
 */
function getText() {
    // Reuse the stored text when there is a non-empty one.
    if ($this->_text) {
        return $this->_text;
    }
    $this->_text = SimpleHtmlSaxParser::normalise($this->_raw);
    return $this->_text;
}
/**
 *    Accessor for the starting value. The content is passed
 *    through SimpleHtmlSaxParser::decodeHtml() and the result
 *    is handed to _wrap().
 *    @return string        Parsed value.
 *    @access public
 */
function getDefault() {
    $content = $this->getContent();
    $decoded = SimpleHtmlSaxParser::decodeHtml($content);
    return $this->_wrap($decoded);
}
/**
 *    Parses a frameset holding one frame and a noframes
 *    section. Expects start events for the frameset and the
 *    frame only, the noframes markup delivered as content,
 *    and a single end event for the frameset.
 */
function testNestedFrameInFrameset() {
    $html = '<frameset><frame src="frame.html"><noframes>Hello</noframes></frameset>';

    $mock = $this->createListener();
    $mock->expectAt(0, 'startElement', array('frameset', array()));
    $mock->expectAt(1, 'startElement', array('frame', array('src' => 'frame.html')));
    $mock->expectCallCount('startElement', 2);
    $mock->expectOnce('addContent', array('<noframes>Hello</noframes>'));
    $mock->expectOnce('endElement', array('frameset'));

    $sax = new SimpleHtmlSaxParser($mock);
    $parsed = $sax->parse($html);
    $this->assertTrue($parsed);
}
/**
 *    Turns HTML into text browser visible text. Comments and
 *    script elements are removed, images are converted to
 *    their alt text and all other tags are suppressed.
 *    Entities are converted to their visible representation
 *    and runs of whitespace collapse to a single space.
 *    @param string $html        HTML to convert.
 *    @return string             Plain text.
 *    @access public
 */
static function normalise($html) {
    $text = preg_replace('|<!--.*?-->|', '', $html);
    $text = preg_replace('|<script[^>]*>.*?</script>|', '', $text);
    // Alt text may be double quoted, single quoted or a bare word.
    $text = preg_replace('|<img[^>]*alt\\s*=\\s*"([^"]*)"[^>]*>|', ' \\1 ', $text);
    $text = preg_replace('|<img[^>]*alt\\s*=\\s*\'([^\']*)\'[^>]*>|', ' \\1 ', $text);
    $text = preg_replace('|<img[^>]*alt\\s*=\\s*([a-zA-Z_]+)[^>]*>|', ' \\1 ', $text);
    $text = preg_replace('|<[^>]*>|', '', $text);
    $text = SimpleHtmlSaxParser::decodeHtml($text);
    $text = preg_replace('|\\s+|', ' ', $text);
    // Strip ordinary whitespace first, then leading/trailing 0xA0 bytes.
    // The byte is written as an escape so it does not depend on the
    // encoding this file is saved in. 0xA0 is the non-breaking space
    // (&nbsp;) in ISO-8859-1.
    // NOTE(review): confirm the charset decodeHtml() produces; in UTF-8
    // a non-breaking space is the two bytes 0xC2 0xA0.
    // TODO: Add a test for this.
    return trim(trim($text), "\xA0");
}
/**
 *    Turns HTML into text browser visible text. Comments are
 *    removed, images are converted to their alt text and all
 *    other tags are suppressed. Entities are converted to
 *    their visible representation and runs of whitespace
 *    collapse to a single space.
 *    @param string $html        HTML to convert.
 *    @return string             Plain text.
 *    @access public
 *    @static
 */
function normalise($html) {
    $text = preg_replace('|<!--.*?-->|', '', $html);
    // Each image pattern is confined to a single tag with [^>]* and the
    // alt value stops at its closing quote, so a match cannot run on
    // from an <img> into the attributes of a later tag.
    $text = preg_replace('|<img[^>]*alt\\s*=\\s*"([^"]*)"[^>]*>|', ' \\1 ', $text);
    $text = preg_replace('|<img[^>]*alt\\s*=\\s*\'([^\']*)\'[^>]*>|', ' \\1 ', $text);
    $text = preg_replace('|<img[^>]*alt\\s*=\\s*([a-zA-Z_]+)[^>]*>|', ' \\1 ', $text);
    // [^>]* also matches newlines, so tags split across lines are removed.
    $text = preg_replace('|<[^>]*>|', '', $text);
    $text = SimpleHtmlSaxParser::decodeHtml($text);
    $text = preg_replace('|\\s+|', ' ', $text);
    return trim($text);
}
/**
 *    Turns HTML into text browser visible text. Comments and
 *    script elements are removed, images are converted to
 *    their alt text and all other tags are suppressed.
 *    Entities are converted to their visible representation
 *    and runs of whitespace collapse to a single space.
 *    @param string $html        HTML to convert.
 *    @return string             Plain text.
 *    @access public
 *    @static
 */
function normalise($html) {
    $text = preg_replace('|<!--.*?-->|', '', $html);
    $text = preg_replace('|<script[^>]*>.*?</script>|', '', $text);
    // The quoted alt value is matched up to its own closing quote, so
    // attributes that follow alt in the same tag are not captured.
    $text = preg_replace('|<img[^>]*alt\\s*=\\s*"([^"]*)"[^>]*>|', ' \\1 ', $text);
    $text = preg_replace('|<img[^>]*alt\\s*=\\s*\'([^\']*)\'[^>]*>|', ' \\1 ', $text);
    $text = preg_replace('|<img[^>]*alt\\s*=\\s*([a-zA-Z_]+)[^>]*>|', ' \\1 ', $text);
    $text = preg_replace('|<[^>]*>|', '', $text);
    $text = SimpleHtmlSaxParser::decodeHtml($text);
    $text = preg_replace('|\\s+|', ' ', $text);
    return trim($text);
}