/**
 * Set up the parser for a raw feed document.
 *
 * Reads the encoding announced by the XML tag, strips that tag, converts the
 * content using the XML encoding (or the HTTP one when the XML tag gives none),
 * then creates the item post-processor and registers the content generator and
 * content filter processors on it, in that order.
 *
 * @param string $content       Feed content
 * @param string $http_encoding HTTP encoding (headers)
 * @param string $fallback_url  Fallback url when the feed provides a relative or broken url
 */
public function __construct($content, $http_encoding = '', $fallback_url = '')
{
    $this->fallback_url = $fallback_url;

    // Encoding announced by the document's XML tag, if any.
    $declaredEncoding = XmlParser::getEncodingFromXmlTag($content);

    // Drop the XML tag to avoid multiple encoding/decoding passes in the next XML processing.
    $this->content = Filter::stripXmlTag($content);

    $logLine = get_called_class() . ': HTTP Encoding "' . $http_encoding . '" ; XML Encoding "' . $declaredEncoding . '"';
    Logger::setMessage($logLine);

    // The encoding found in the document wins; the HTTP header is only the fallback.
    if ($declaredEncoding) {
        $sourceEncoding = $declaredEncoding;
    } else {
        $sourceEncoding = $http_encoding;
    }

    // Encode everything in UTF-8
    $this->content = Encoding::convert($this->content, $sourceEncoding);

    // Both processors receive the same configuration object as the pipeline itself.
    $pipeline = new ItemPostProcessor($this->config);
    $this->itemPostProcessor = $pipeline;
    $pipeline->register(new ContentGeneratorProcessor($this->config));
    $pipeline->register(new ContentFilterProcessor($this->config));
}
/**
 * Set up the parser for a raw feed document.
 *
 * Creates the date parser, stores the fallback url, then prepares the content:
 * the XML tag is stripped, the content is converted using the XML encoding
 * (or the HTTP one when the XML tag gives none) and finally passed through
 * Filter::normalizeData().
 *
 * @access public
 * @param  string  $content        Feed content
 * @param  string  $http_encoding  HTTP encoding (headers)
 * @param  string  $fallback_url   Fallback url when the feed provides a relative or broken url
 */
public function __construct($content, $http_encoding = '', $fallback_url = '')
{
    $this->date = new DateParser();
    $this->fallback_url = $fallback_url;

    // Encoding announced by the document's XML tag, if any.
    $declaredEncoding = XmlParser::getEncodingFromXmlTag($content);

    // Drop the XML tag to avoid multiple encoding/decoding passes in the next XML processing.
    $this->content = Filter::stripXmlTag($content);

    $logLine = get_called_class() . ': HTTP Encoding "' . $http_encoding . '" ; XML Encoding "' . $declaredEncoding . '"';
    Logger::setMessage($logLine);

    // The encoding found in the document wins; the HTTP header is only the fallback.
    if ($declaredEncoding) {
        $sourceEncoding = $declaredEncoding;
    } else {
        $sourceEncoding = $http_encoding;
    }

    // Encode everything in UTF-8
    $converted = Encoding::convert($this->content, $sourceEncoding);
    $this->content = $converted;

    // Workarounds
    $this->content = Filter::normalizeData($this->content);
}
/**
 * Normalize the encoding of the stored HTML and strip its head tags.
 *
 * The encoding found in the HTML meta tag is used for the conversion; when
 * none is found, the encoding stored on this object is used instead. The
 * result replaces $this->html and both encodings are logged afterwards.
 */
public function prepareHtml()
{
    $metaEncoding = XmlParser::getEncodingFromMetaTag($this->html);

    // The meta tag wins; the stored encoding is only the fallback.
    if ($metaEncoding) {
        $sourceEncoding = $metaEncoding;
    } else {
        $sourceEncoding = $this->encoding;
    }

    $this->html = Encoding::convert($this->html, $sourceEncoding);
    $this->html = Filter::stripHeadTags($this->html);

    $logLine = get_called_class() . ': HTTP Encoding "' . $this->encoding . '" ; HTML Encoding "' . $metaEncoding . '"';
    Logger::setMessage($logLine);
}
/**
 * Parse the HTML content.
 *
 * Returns true immediately when processing is flagged to be skipped.
 * Otherwise, when HTML is available, it is converted (meta tag encoding
 * first, stored encoding as fallback), its head tags are stripped, and the
 * content is extracted either with the rules returned by getRules() (when
 * that is an array) or with the candidates strategy.
 *
 * @access public
 * @return bool   True when skipped, otherwise whether $this->content is not an empty string
 */
public function parse()
{
    if ($this->skip_processing) {
        return true;
    }

    // Resolved once; every log line below is prefixed with the called class name.
    $caller = get_called_class();

    if (! $this->html) {
        Logger::setMessage($caller . ': No content fetched');
    } else {
        $metaEncoding = XmlParser::getEncodingFromMetaTag($this->html);

        // Encode everything in UTF-8
        Logger::setMessage($caller . ': HTTP Encoding "' . $this->encoding . '" ; HTML Encoding "' . $metaEncoding . '"');

        // The meta tag wins; the stored encoding is only the fallback.
        if ($metaEncoding) {
            $sourceEncoding = $metaEncoding;
        } else {
            $sourceEncoding = $this->encoding;
        }

        $this->html = Encoding::convert($this->html, $sourceEncoding);
        $this->html = Filter::stripHeadTags($this->html);

        Logger::setMessage($caller . ': Content length: ' . strlen($this->html) . ' bytes');

        $rules = $this->getRules();

        if (! is_array($rules)) {
            Logger::setMessage($caller . ': Parse content with candidates');
            $this->parseContentWithCandidates();
        } else {
            Logger::setMessage($caller . ': Parse content with rules');
            $this->parseContentWithRules($rules);
        }
    }

    Logger::setMessage($caller . ': Content length: ' . strlen($this->content) . ' bytes');
    Logger::setMessage($caller . ': Grabber done');

    return $this->content !== '';
}