/**
 * Check Filter::stripHeadTags() against one inline snippet and two
 * fixture pairs loaded from tests/fixtures.
 */
public function testStripHeadTag()
{
    // Inline sample: the expected output is the input minus its <head> element.
    $this->assertEquals(
        '<html><body><h1>boo</h1></body>',
        Filter::stripHeadTags('<html><head><title>test</title></head><body><h1>boo</h1></body>')
    );

    // Source page => the same page with the head section already removed.
    $fixture_pairs = array(
        'tests/fixtures/html4_page.html' => 'tests/fixtures/html4_head_stripped_page.html',
        'tests/fixtures/html_page.html' => 'tests/fixtures/html_head_stripped_page.html',
    );

    foreach ($fixture_pairs as $page_file => $stripped_file) {
        $page = file_get_contents($page_file);
        $stripped = file_get_contents($stripped_file);

        $this->assertEquals($stripped, Filter::stripHeadTags($page));
    }
}
/**
 * Normalize encoding and strip head tag.
 *
 * The charset reported by XmlParser::getEncodingFromMetaTag() is handed to
 * Encoding::convert(); when that value is falsy, $this->encoding is used
 * instead. The converted markup then goes through Filter::stripHeadTags()
 * and both encodings are written to the log.
 */
public function prepareHtml()
{
    $meta_charset = XmlParser::getEncodingFromMetaTag($this->html);

    // Prefer the charset declared inside the document itself.
    $source_charset = $meta_charset ?: $this->encoding;

    $this->html = Encoding::convert($this->html, $source_charset);
    $this->html = Filter::stripHeadTags($this->html);

    $log_prefix = get_called_class();

    Logger::setMessage(
        $log_prefix
        . ': HTTP Encoding "' . $this->encoding
        . '" ; HTML Encoding "' . $meta_charset . '"'
    );
}
/**
 * Parse the HTML content
 *
 * Does nothing when processing is flagged as skipped. Otherwise the stored
 * HTML (if any) is re-encoded, has its head section stripped, and is handed
 * either to the rule-based parser (when getRules() returns an array) or to
 * the candidate-based parser.
 *
 * @access public
 * @return bool True when processing is skipped or $this->content is not an empty string
 */
public function parse()
{
    if ($this->skip_processing) {
        return true;
    }

    // Every log entry is prefixed with the late-static-bound class name.
    $caller = get_called_class();

    if (! $this->html) {
        Logger::setMessage($caller . ': No content fetched');
    } else {
        $meta_charset = XmlParser::getEncodingFromMetaTag($this->html);

        // Log both encodings before the markup is touched.
        Logger::setMessage(
            $caller
            . ': HTTP Encoding "' . $this->encoding
            . '" ; HTML Encoding "' . $meta_charset . '"'
        );

        // Re-encode the document (to UTF-8, per the original author's note);
        // the meta tag charset takes precedence over $this->encoding.
        $source_charset = $meta_charset ?: $this->encoding;
        $this->html = Encoding::convert($this->html, $source_charset);
        $this->html = Filter::stripHeadTags($this->html);

        Logger::setMessage($caller . ': Content length: ' . strlen($this->html) . ' bytes');

        $rules = $this->getRules();
        $has_rules = is_array($rules);

        if (! $has_rules) {
            Logger::setMessage($caller . ': Parse content with candidates');
            $this->parseContentWithCandidates();
        } else {
            Logger::setMessage($caller . ': Parse content with rules');
            $this->parseContentWithRules($rules);
        }
    }

    Logger::setMessage($caller . ': Content length: ' . strlen($this->content) . ' bytes');
    Logger::setMessage($caller . ': Grabber done');

    $has_content = $this->content !== '';

    return $has_content;
}