/**
 * Run the grabber over the fetched HTML.
 *
 * When processing is flagged as skipped, nothing is done and true is returned.
 * Otherwise the HTML (if any) is re-encoded, passed through
 * Filter::stripHeadTags(), and handed either to the rule-based parser (when
 * getRules() yields an array) or to the candidate-based parser.
 *
 * @access public
 * @return bool True if processing was skipped or if the content is not an empty string
 */
public function parse()
{
    if ($this->skip_processing) {
        return true;
    }

    // Every log line is prefixed with the late-static-bound class name.
    $prefix = get_called_class();

    if (! $this->html) {
        Logger::setMessage($prefix . ': No content fetched');
    } else {
        $meta_charset = XmlParser::getEncodingFromMetaTag($this->html);

        Logger::setMessage($prefix . ': HTTP Encoding "' . $this->encoding . '" ; HTML Encoding "' . $meta_charset . '"');

        // Encode everything in UTF-8; the charset declared in the markup
        // takes precedence over the HTTP one when it is non-empty.
        $source_charset = $meta_charset ? $meta_charset : $this->encoding;
        $this->html = Encoding::convert($this->html, $source_charset);
        $this->html = Filter::stripHeadTags($this->html);

        Logger::setMessage($prefix . ': Content length: ' . strlen($this->html) . ' bytes');

        $rule_set = $this->getRules();

        if (! is_array($rule_set)) {
            Logger::setMessage($prefix . ': Parse content with candidates');
            $this->parseContentWithCandidates();
        } else {
            Logger::setMessage($prefix . ': Parse content with rules');
            $this->parseContentWithRules($rule_set);
        }
    }

    Logger::setMessage($prefix . ': Content length: ' . strlen($this->content) . ' bytes');
    Logger::setMessage($prefix . ': Grabber done');

    return $this->content !== '';
}
/**
 * Re-encode the fetched HTML and strip its head tags.
 *
 * The source charset is the one found in the document's meta tag; when that
 * lookup yields an empty value, the HTTP encoding stored on the instance is
 * used instead. Both encodings are logged once the HTML has been rewritten.
 */
public function prepareHtml()
{
    $meta_charset = XmlParser::getEncodingFromMetaTag($this->html);

    // Fall back to the HTTP encoding only when the markup declares nothing usable.
    $source_charset = $this->encoding;
    if ($meta_charset) {
        $source_charset = $meta_charset;
    }

    $this->html = Encoding::convert($this->html, $source_charset);
    $this->html = Filter::stripHeadTags($this->html);

    $log_line = get_called_class() . ': HTTP Encoding "' . $this->encoding . '" ; HTML Encoding "' . $meta_charset . '"';
    Logger::setMessage($log_line);
}
/**
 * XmlParser::getEncodingFromMetaTag() must return the lower-cased charset
 * declared in a <meta> tag, whatever the quoting, spacing, attribute order
 * or letter case used in the markup.
 */
public function testGetEncodingFromMetaTag()
{
    $latin1 = 'iso-8859-1';
    $cp1251 = 'windows-1251';
    $utf8 = 'utf-8';

    // http-equiv form, self-closing tag: double-quoted, single-quoted, unquoted
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1"/>'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1" />'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv=\'Content-Type\' content=\'text/html;charset=iso-8859-1\'/>'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv=\'Content-Type\' content=\'text/html;charset=iso-8859-1\' />'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv=Content-Type content=text/html;charset=iso-8859-1/>'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv=Content-Type content=text/html;charset=iso-8859-1 />'));

    // http-equiv form, non self-closing tag
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv="Content-Type" content="text/html;charset=ISO-8859-1">'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1" >'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv=\'Content-Type\' content=\'text/html;charset=iso-8859-1\'>'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv=\'Content-Type\' content=\'text/html;charset=iso-8859-1\' >'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv=Content-Type content=text/html;charset=iso-8859-1>'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv=Content-Type content=text/html;charset=iso-8859-1 >'));

    // Quotes nested inside the content attribute value
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv="Content-Type" content="text/html;charset=\'iso-8859-1\'">'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv="Content-Type" content="\'text/html;charset=iso-8859-1\'">'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv="Content-Type" content="\'text/html\';charset=\'iso-8859-1\'">'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv=\'Content-Type\' content=\'text/html;charset="iso-8859-1"\'>'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv=\'Content-Type\' content=\'"text/html;charset=iso-8859-1"\'>'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv=\'Content-Type\' content=\'"text/html";charset="iso-8859-1"\'>'));

    // Repeated semicolons before the charset parameter
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv="Content-Type" content="text/html;;;charset=iso-8859-1">'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv="Content-Type" content="text/html;;;charset=\'iso-8859-1\'">'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv="Content-Type" content="\'text/html;;;charset=iso-8859-1\'">'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv="Content-Type" content="\'text/html\';;;charset=\'iso-8859-1\'">'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv=\'Content-Type\' content=\'text/html;;;charset=iso-8859-1\'>'));
    $this->assertEquals($cp1251, XmlParser::getEncodingFromMetaTag('<meta http-equiv=\'Content-Type\' content=\'text/html;;;charset="windows-1251"\'>'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv=\'Content-Type\' content=\'"text/html;;;charset=iso-8859-1"\'>'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv=\'Content-Type\' content=\'"text/html";;;charset="iso-8859-1"\'>'));

    // Whitespace around "=" and ";", with both attribute orders
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv = Content-Type content = text/html;charset=iso-8859-1 >'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta content = text/html;charset=iso-8859-1 http-equiv = Content-Type >'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv = Content-Type content = text/html ; charset = iso-8859-1 >'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta content = text/html ; charset = iso-8859-1 http-equiv = Content-Type >'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv = Content-Type content = text/html ;;; charset = iso-8859-1 >'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta content = text/html ;;; charset = iso-8859-1 http-equiv = Content-Type >'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv = Content-Type content = text/html ; ; ; charset = iso-8859-1 >'));
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta content = text/html ; ; ; charset = iso-8859-1 http-equiv = Content-Type >'));

    // Short <meta charset> form, self-closing tag
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset="uTf-8"/>'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset="utf-8" />'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset=\'Utf-8\'/>'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset=\'utf-8\' />'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset=utf-8/>'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset=utf-8 />'));

    // Short <meta charset> form, non self-closing tag
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset="utf-8">'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset="utf-8" >'));
    // The line break inside the next literal is part of the fixture; keep it as is.
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta 
charset=\'utf-8\'>'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset=\'utf-8\' >'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset=utf-8>'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset=utf-8 >'));

    // Padded values and mismatched or missing quotes
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset = " utf-8 " >'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset = \' utf-8 \' >'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset = " utf-8 \' >'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset = \' utf-8 " >'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset = " utf-8 >'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset = \' utf-8 >'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset = utf-8 \' >'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset = utf-8 " >'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset = utf-8 >'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta charset = utf-8 />'));

    // "charset=" text inside unrelated attributes is still expected to be picked up
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta name="title" value="charset=utf-8 — is it really useful (yep)?">'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta value="charset=utf-8 — is it really useful (yep)?" name="title">'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta name="title" content="charset=utf-8 — is it really useful (yep)?">'));
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta name="charset=utf-8" content="charset=utf-8 — is it really useful (yep)?">'));
    // The line break inside the next literal is part of the fixture; keep it as is.
    $this->assertEquals($utf8, XmlParser::getEncodingFromMetaTag('<meta content="charset=utf-8 — is it really useful (nope, not here, but gotta admit pretty robust otherwise)?" 
name="title">'));

    // With several declarations, the first one is the expected result
    $this->assertEquals($latin1, XmlParser::getEncodingFromMetaTag('<meta http-equiv="Content-Type" content="text/html;charset=iSo-8859-1"/><meta charset="invalid" />'));
}