Exemple #1
0
 /**
  * Run the grabber over the fetched HTML and report whether content was found.
  *
  * When processing is flagged as skipped the method returns true immediately.
  * Otherwise, if HTML is available, it is re-encoded, stripped of its head
  * tags and parsed either with the rules returned by getRules() (when that
  * is an array) or with the candidates strategy. Progress is reported
  * through Logger::setMessage() at each step.
  *
  * @access public
  * @return bool True when processing is skipped or when $this->content is not
  *              the empty string, false otherwise
  */
 public function parse()
 {
     if ($this->skip_processing) {
         return true;
     }

     // Every log line is prefixed with the late-static-bound class name.
     $class = get_called_class();

     if (! $this->html) {
         Logger::setMessage($class . ': No content fetched');
     } else {
         $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);

         // Prefer the encoding declared in the document; fall back to the
         // one stored on the instance when the meta tag yields nothing.
         $source_encoding = $html_encoding ? $html_encoding : $this->encoding;

         Logger::setMessage($class . ': HTTP Encoding "' . $this->encoding . '" ; HTML Encoding "' . $html_encoding . '"');

         // Re-encode the document (the original note here read
         // "Encode everything in UTF-8"), then drop the head tags.
         $this->html = Encoding::convert($this->html, $source_encoding);
         $this->html = Filter::stripHeadTags($this->html);

         Logger::setMessage($class . ': Content length: ' . strlen($this->html) . ' bytes');

         $rules = $this->getRules();

         if (! is_array($rules)) {
             Logger::setMessage($class . ': Parse content with candidates');
             $this->parseContentWithCandidates();
         } else {
             Logger::setMessage($class . ': Parse content with rules');
             $this->parseContentWithRules($rules);
         }
     }

     Logger::setMessage($class . ': Content length: ' . strlen($this->content) . ' bytes');
     Logger::setMessage($class . ': Grabber done');

     return $this->content !== '';
 }
Exemple #2
0
 /**
  * Re-encode the HTML and strip its head tags.
  *
  * The encoding read from the document's meta tag is used for the
  * conversion; when that value is falsy (e.g. empty), $this->encoding is
  * used instead. Both encodings are logged once the HTML has been updated.
  */
 public function prepareHtml()
 {
     $meta_encoding = XmlParser::getEncodingFromMetaTag($this->html);

     // Fall back to the instance encoding when the meta tag gives nothing.
     $source_encoding = $meta_encoding;
     if (! $source_encoding) {
         $source_encoding = $this->encoding;
     }

     $this->html = Encoding::convert($this->html, $source_encoding);
     $this->html = Filter::stripHeadTags($this->html);

     Logger::setMessage(
         get_called_class() . ': HTTP Encoding "' . $this->encoding . '" ; HTML Encoding "' . $meta_encoding . '"'
     );
 }
 /**
  * Check XmlParser::getEncodingFromMetaTag() against many <meta> tag spellings.
  *
  * Each assertion feeds one markup fragment to the parser and compares the
  * result with the expected lowercase encoding name.
  */
 public function testGetEncodingFromMetaTag()
 {
     // Local shorthand for the call under test; keeps each case on one line.
     $detect = function ($markup) {
         return XmlParser::getEncodingFromMetaTag($markup);
     };

     // http-equiv Content-Type, self-closing tag: double, single and no quotes.
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1"/>'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1" />'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv=\'Content-Type\' content=\'text/html;charset=iso-8859-1\'/>'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv=\'Content-Type\' content=\'text/html;charset=iso-8859-1\' />'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv=Content-Type content=text/html;charset=iso-8859-1/>'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv=Content-Type content=text/html;charset=iso-8859-1 />'));

     // Same variants without the self-closing slash (first one uses an upper-case charset).
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv="Content-Type" content="text/html;charset=ISO-8859-1">'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1" >'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv=\'Content-Type\' content=\'text/html;charset=iso-8859-1\'>'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv=\'Content-Type\' content=\'text/html;charset=iso-8859-1\' >'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv=Content-Type content=text/html;charset=iso-8859-1>'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv=Content-Type content=text/html;charset=iso-8859-1 >'));

     // Quotes nested inside the content attribute value.
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv="Content-Type" content="text/html;charset=\'iso-8859-1\'">'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv="Content-Type" content="\'text/html;charset=iso-8859-1\'">'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv="Content-Type" content="\'text/html\';charset=\'iso-8859-1\'">'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv=\'Content-Type\' content=\'text/html;charset="iso-8859-1"\'>'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv=\'Content-Type\' content=\'"text/html;charset=iso-8859-1"\'>'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv=\'Content-Type\' content=\'"text/html";charset="iso-8859-1"\'>'));

     // Repeated semicolons before the charset parameter.
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv="Content-Type" content="text/html;;;charset=iso-8859-1">'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv="Content-Type" content="text/html;;;charset=\'iso-8859-1\'">'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv="Content-Type" content="\'text/html;;;charset=iso-8859-1\'">'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv="Content-Type" content="\'text/html\';;;charset=\'iso-8859-1\'">'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv=\'Content-Type\' content=\'text/html;;;charset=iso-8859-1\'>'));
     $this->assertEquals('windows-1251', $detect('<meta http-equiv=\'Content-Type\' content=\'text/html;;;charset="windows-1251"\'>'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv=\'Content-Type\' content=\'"text/html;;;charset=iso-8859-1"\'>'));
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv=\'Content-Type\' content=\'"text/html";;;charset="iso-8859-1"\'>'));

     // Extra whitespace around names, "=" and ";", with both attribute orders.
     $this->assertEquals('iso-8859-1', $detect('<meta  http-equiv  =  Content-Type  content  =  text/html;charset=iso-8859-1  >'));
     $this->assertEquals('iso-8859-1', $detect('<meta  content  =  text/html;charset=iso-8859-1  http-equiv  =  Content-Type  >'));
     $this->assertEquals('iso-8859-1', $detect('<meta  http-equiv  =  Content-Type  content  =  text/html  ;  charset  =  iso-8859-1  >'));
     $this->assertEquals('iso-8859-1', $detect('<meta  content  =  text/html  ;  charset  =  iso-8859-1  http-equiv  =  Content-Type  >'));
     $this->assertEquals('iso-8859-1', $detect('<meta  http-equiv  =  Content-Type  content  =  text/html  ;;;  charset  =  iso-8859-1  >'));
     $this->assertEquals('iso-8859-1', $detect('<meta  content  =  text/html  ;;;  charset  =  iso-8859-1  http-equiv  =  Content-Type  >'));
     $this->assertEquals('iso-8859-1', $detect('<meta  http-equiv  =  Content-Type  content  =  text/html  ;  ;  ;  charset  =  iso-8859-1  >'));
     $this->assertEquals('iso-8859-1', $detect('<meta  content  =  text/html  ;  ;  ;  charset  =  iso-8859-1  http-equiv  =  Content-Type  >'));

     // Short <meta charset=...> form: quoting variants and mixed-case values.
     $this->assertEquals('utf-8', $detect('<meta charset="uTf-8"/>'));
     $this->assertEquals('utf-8', $detect('<meta charset="utf-8" />'));
     $this->assertEquals('utf-8', $detect('<meta charset=\'Utf-8\'/>'));
     $this->assertEquals('utf-8', $detect('<meta charset=\'utf-8\' />'));
     $this->assertEquals('utf-8', $detect('<meta charset=utf-8/>'));
     $this->assertEquals('utf-8', $detect('<meta charset=utf-8 />'));
     $this->assertEquals('utf-8', $detect('<meta charset="utf-8">'));
     $this->assertEquals('utf-8', $detect('<meta charset="utf-8" >'));
     $this->assertEquals('utf-8', $detect('<meta charset=\'utf-8\'>'));
     $this->assertEquals('utf-8', $detect('<meta charset=\'utf-8\' >'));
     $this->assertEquals('utf-8', $detect('<meta charset=utf-8>'));
     $this->assertEquals('utf-8', $detect('<meta charset=utf-8 >'));

     // Short form with padding and with mismatched or unbalanced quotes.
     $this->assertEquals('utf-8', $detect('<meta  charset  =  "  utf-8  "  >'));
     $this->assertEquals('utf-8', $detect('<meta  charset  =  \'  utf-8  \'  >'));
     $this->assertEquals('utf-8', $detect('<meta  charset  =  "  utf-8  \'  >'));
     $this->assertEquals('utf-8', $detect('<meta  charset  =  \'  utf-8  "  >'));
     $this->assertEquals('utf-8', $detect('<meta  charset  =  "  utf-8     >'));
     $this->assertEquals('utf-8', $detect('<meta  charset  =  \'  utf-8     >'));
     $this->assertEquals('utf-8', $detect('<meta  charset  =     utf-8  \'  >'));
     $this->assertEquals('utf-8', $detect('<meta  charset  =     utf-8  "  >'));
     $this->assertEquals('utf-8', $detect('<meta  charset  =     utf-8     >'));
     $this->assertEquals('utf-8', $detect('<meta  charset  =     utf-8    />'));

     // "charset=" text inside name/value/content attributes is still expected to yield utf-8.
     $this->assertEquals('utf-8', $detect('<meta name="title" value="charset=utf-8 — is it really useful (yep)?">'));
     $this->assertEquals('utf-8', $detect('<meta value="charset=utf-8 — is it really useful (yep)?" name="title">'));
     $this->assertEquals('utf-8', $detect('<meta name="title" content="charset=utf-8 — is it really useful (yep)?">'));
     $this->assertEquals('utf-8', $detect('<meta name="charset=utf-8" content="charset=utf-8 — is it really useful (yep)?">'));
     $this->assertEquals('utf-8', $detect('<meta content="charset=utf-8 — is it really useful (nope, not here, but gotta admit pretty robust otherwise)?" name="title">'));

     // Two meta tags: the http-equiv declaration is expected, not the later charset="invalid".
     $this->assertEquals('iso-8859-1', $detect('<meta http-equiv="Content-Type" content="text/html;charset=iSo-8859-1"/><meta charset="invalid" />'));
 }