/**
  *    Parses the raw content of a fetched response, sending
  *    the resulting parser events into a newly created page.
  *    @param $response SimpleHttpResponse  Fetched response.
  *    @return SimplePage                   Newly parsed page.
  *    @access public
  */
 function parse($response)
 {
     // Begin with an empty tag list and a page created for this response.
     $this->tags = array();
     $this->page = $this->createPage($response);

     // This object is handed to the SAX parser, which is then
     // run over the response content.
     $sax = new SimpleHtmlSaxParser($this);
     $content = $response->getContent();
     $sax->parse($content);
     $this->acceptPageEnd();

     // Take a handle on the page before free() is called, then return it.
     $built = $this->page;
     $this->free();
     return $built;
 }
Example #2
0
 /**
  *    Checks that normalise() turns the entities for
  *    <, >, ", & and ' into their literal characters.
  */
 function testHtmlEntityTranslation()
 {
     $encoded = '&lt;&gt;&quot;&amp;&#039;';
     $expected = '<>"&\'';
     $this->assertEqual(SimpleHtmlSaxParser::normalise($encoded), $expected);
 }
Example #3
0
 /**
  *    Returns the plain text of the page as a text browser
  *    would see it. The normalised form of the raw content is
  *    stored in $this->_text and reused while that value is truthy.
  *    @return string        Plain text of page.
  *    @access public
  */
 function getText()
 {
     // A previously stored, non-empty result is returned as is.
     if ($this->_text) {
         return $this->_text;
     }
     $this->_text = SimpleHtmlSaxParser::normalise($this->_raw);
     return $this->_text;
 }
Example #4
0
 /**
  *    Accessor for the starting value: the content is run
  *    through decodeHtml() and the result passed to _wrap().
  *    @return string        Parsed value.
  *    @access public
  */
 function getDefault()
 {
     $decoded = SimpleHtmlSaxParser::decodeHtml($this->getContent());
     return $this->_wrap($decoded);
 }
Example #5
0
 /**
  *    Parses a frameset holding one frame and a noframes
  *    section. Expects start events for the frameset and the
  *    frame only, the noframes markup delivered once through
  *    addContent, a single end event for the frameset, and
  *    parse() returning true.
  */
 function testNestedFrameInFrameset()
 {
     $html = '<frameset><frame src="frame.html"><noframes>Hello</noframes></frameset>';
     $framesetStart = array('frameset', array());
     $frameStart = array('frame', array('src' => 'frame.html'));

     $listener = $this->createListener();
     $listener->expectAt(0, 'startElement', $framesetStart);
     $listener->expectAt(1, 'startElement', $frameStart);
     $listener->expectCallCount('startElement', 2);
     $listener->expectOnce('addContent', array('<noframes>Hello</noframes>'));
     $listener->expectOnce('endElement', array('frameset'));

     $sax = new SimpleHtmlSaxParser($listener);
     $parsed = $sax->parse($html);
     $this->assertTrue($parsed);
 }
Example #6
0
 /**
  *    Turns HTML into text browser visible text. Comments
  *    and script elements are removed, images are converted
  *    to their alt text, remaining tags are suppressed,
  *    entities are decoded with decodeHtml() and runs of
  *    whitespace are collapsed to a single space.
  *    @param string $html        HTML to convert.
  *    @return string             Plain text.
  *    @access public
  */
 static function normalise($html)
 {
     // The "s" modifier lets "." cross newlines, so comments and
     // scripts spanning several lines are removed as a whole;
     // "i" accepts upper or mixed case tag names.
     $text = preg_replace('|<!--.*?-->|s', '', $html);
     $text = preg_replace('|<script[^>]*>.*?</script>|si', '', $text);

     // Replace each image with its alt text: double quoted,
     // single quoted, then unquoted attribute values.
     $text = preg_replace('|<img[^>]*alt\\s*=\\s*"([^"]*)"[^>]*>|i', ' \\1 ', $text);
     $text = preg_replace('|<img[^>]*alt\\s*=\\s*\'([^\']*)\'[^>]*>|i', ' \\1 ', $text);
     $text = preg_replace('|<img[^>]*alt\\s*=\\s*([a-zA-Z_]+)[^>]*>|i', ' \\1 ', $text);

     // Drop every remaining tag, then decode entities and squeeze whitespace.
     $text = preg_replace('|<[^>]*>|', '', $text);
     $text = SimpleHtmlSaxParser::decodeHtml($text);
     $text = preg_replace('|\\s+|', ' ', $text);

     // Strip the default trim() characters plus \xA0, the single byte
     // non-breaking space (&nbsp;), in one pass so that a mix of the
     // two at either end is fully removed.
     // NOTE(review): this is a byte-wise trim; it assumes decodeHtml()
     // yields a single byte encoding such as ISO-8859-1 - confirm.
     // TODO: Add a test for the \xA0 trimming.
     return trim($text, " \t\n\r\0\x0B\xA0");
 }
Example #7
0
 /**
  *    Turns HTML into text browser visible text. Comments
  *    and script elements are removed, images are converted
  *    to their alt text, remaining tags are suppressed,
  *    entities are decoded with decodeHtml() and runs of
  *    whitespace are collapsed to a single space.
  *    @param string $html        HTML to convert.
  *    @return string             Plain text.
  *    @access public
  *    @static
  */
 function normalise($html)
 {
     // The "s" modifier lets "." cross newlines, so comments and
     // scripts spanning several lines are removed as a whole;
     // "i" accepts upper or mixed case tag names.
     $text = preg_replace('|<!--.*?-->|s', '', $html);
     $text = preg_replace('|<script[^>]*>.*?</script>|si', '', $text);

     // Replace each image with its alt text. [^>]* keeps a match
     // inside one tag, so an <img> without alt cannot run on to an
     // alt attribute of a later element and swallow the text between.
     $text = preg_replace('|<img[^>]*alt\\s*=\\s*"([^"]*)"[^>]*>|i', ' \\1 ', $text);
     $text = preg_replace('|<img[^>]*alt\\s*=\\s*\'([^\']*)\'[^>]*>|i', ' \\1 ', $text);
     $text = preg_replace('|<img[^>]*alt\\s*=\\s*([a-zA-Z_]+)[^>]*>|i', ' \\1 ', $text);

     // [^>]* also matches newlines, so tags whose attributes are
     // spread over several lines are removed too.
     $text = preg_replace('|<[^>]*>|', '', $text);
     $text = SimpleHtmlSaxParser::decodeHtml($text);
     $text = preg_replace('|\\s+|', ' ', $text);
     return trim($text);
 }
Example #8
0
 /**
  *    Turns HTML into text browser visible text. Comments
  *    and script elements are removed, images are converted
  *    to their alt text, remaining tags are suppressed,
  *    entities are decoded with decodeHtml() and runs of
  *    whitespace are collapsed to a single space.
  *    @param string $html        HTML to convert.
  *    @return string             Plain text.
  *    @access public
  *    @static
  */
 function normalise($html)
 {
     // The "s" modifier lets "." cross newlines, so comments and
     // scripts spanning several lines are removed as a whole;
     // "i" accepts upper or mixed case tag names.
     $text = preg_replace('|<!--.*?-->|s', '', $html);
     $text = preg_replace('|<script[^>]*>.*?</script>|si', '', $text);

     // Replace each image with its alt text. The quoted captures
     // stop at the closing quote, so attributes that follow alt
     // (e.g. title="...") are not pulled into the text.
     $text = preg_replace('|<img[^>]*alt\\s*=\\s*"([^"]*)"[^>]*>|i', ' \\1 ', $text);
     $text = preg_replace('|<img[^>]*alt\\s*=\\s*\'([^\']*)\'[^>]*>|i', ' \\1 ', $text);
     $text = preg_replace('|<img[^>]*alt\\s*=\\s*([a-zA-Z_]+)[^>]*>|i', ' \\1 ', $text);

     // Drop every remaining tag, then decode entities and squeeze whitespace.
     $text = preg_replace('|<[^>]*>|', '', $text);
     $text = SimpleHtmlSaxParser::decodeHtml($text);
     $text = preg_replace('|\\s+|', ' ', $text);
     return trim($text);
 }