/**
 * Build a Feed object from the raw document stored in $this->content.
 *
 * The content is parsed as XML. If that fails, it is passed through
 * Filter::normalizeData() and parsed one more time before giving up.
 * Feed-level fields are resolved first, then each entry returned by
 * getItemsTree() is converted into an Item and appended to the feed.
 *
 * @return \PicoFeed\Parser\Feed
 *
 * @throws MalformedXmlException when the content cannot be parsed even after the workarounds
 */
public function execute()
{
    $parser = get_called_class();
    Logger::setMessage($parser . ': begin parsing');

    $document = XmlParser::getSimpleXml($this->content);

    if ($document === false) {
        // First attempt failed: clean the payload and retry exactly once
        Logger::setMessage($parser . ': Applying XML workarounds');
        $this->content = Filter::normalizeData($this->content);
        $document = XmlParser::getSimpleXml($this->content);
    }

    if ($document === false) {
        // Only reachable when the retry above failed as well
        Logger::setMessage($parser . ': XML parsing error');
        Logger::setMessage(XmlParser::getErrors());

        throw new MalformedXmlException('XML parsing error');
    }

    // Remember every namespace declared in the document; items get a copy below
    $this->used_namespaces = $document->getNamespaces(true);
    $document = $this->registerSupportedNamespaces($document);

    $result = new Feed();

    // Each URL is validated right after it has been located
    $this->findFeedUrl($document, $result);
    $this->checkFeedUrl($result);

    $this->findSiteUrl($document, $result);
    $this->checkSiteUrl($result);

    // Remaining feed-level metadata
    $this->findFeedTitle($document, $result);
    $this->findFeedDescription($document, $result);
    $this->findFeedLanguage($document, $result);
    $this->findFeedId($document, $result);
    $this->findFeedDate($document, $result);
    $this->findFeedLogo($document, $result);
    $this->findFeedIcon($document, $result);

    foreach ($this->getItemsTree($document) as $rawNode) {
        $node = $this->registerSupportedNamespaces($rawNode);

        $parsed = new Item();
        $parsed->xml = $node;
        $parsed->namespaces = $this->used_namespaces;

        $this->findItemAuthor($document, $node, $parsed);

        $this->findItemUrl($node, $parsed);
        $this->checkItemUrl($result, $parsed);

        $this->findItemTitle($node, $parsed);
        $this->findItemContent($node, $parsed);

        // Id generation can use the item url/title/content (order is important)
        $this->findItemId($node, $parsed, $result);
        $this->findItemDate($node, $parsed, $result);
        $this->findItemEnclosure($node, $parsed, $result);
        $this->findItemLanguage($node, $parsed, $result);

        $this->itemPostProcessor->execute($result, $parsed);

        $result->items[] = $parsed;
    }

    Logger::setMessage($parser . PHP_EOL . $result);

    return $result;
}
/**
 * Prepare the raw feed content for parsing.
 *
 * Reads the encoding from the XML tag, strips that tag, converts the
 * content using the XML encoding (or the HTTP encoding when the XML one
 * is empty), and finally runs the content through Filter::normalizeData().
 *
 * @access public
 * @param  string  $content        Feed content
 * @param  string  $http_encoding  HTTP encoding (headers)
 * @param  string  $fallback_url   Fallback url when the feed provide relative or broken url
 */
public function __construct($content, $http_encoding = '', $fallback_url = '')
{
    $this->fallback_url = $fallback_url;
    $this->date = new DateParser();

    // Read the encoding from the untouched input, before the XML tag is stripped
    $declared_encoding = XmlParser::getEncodingFromXmlTag($content);

    // Strip XML tag to avoid multiple encoding/decoding in the next XML processing
    $this->content = Filter::stripXmlTag($content);

    Logger::setMessage(
        get_called_class() . ': HTTP Encoding "' . $http_encoding . '" ; XML Encoding "' . $declared_encoding . '"'
    );

    // The encoding from the XML tag takes precedence; the HTTP one is the fallback
    if ($declared_encoding) {
        $source_encoding = $declared_encoding;
    } else {
        $source_encoding = $http_encoding;
    }

    // Encode everything in UTF-8
    $this->content = Encoding::convert($this->content, $source_encoding);

    // Workarounds
    $this->content = Filter::normalizeData($this->content);
}
/**
 * Filter::normalizeData() is expected to remove control characters that are
 * not allowed in XML 1.0, whether they appear as raw bytes or as numeric
 * character references, while leaving valid references untouched.
 *
 * Every input below contains the character named in its group comment, so
 * each assertion fails if that character is not stripped (or, for the last
 * group, if a valid reference is altered).
 */
public function testNormalizeData()
{
    // invalid data link escape control character (U+0010):
    // hex reference, decimal reference, raw byte
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random&#x10; text</xml>"));
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random&#16; text</xml>"));
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random\x10 text</xml>"));

    // invalid unit separator control character (U+001F), lower and upper case:
    // hex references, decimal reference, raw byte written with both escape casings
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random&#x1f; text</xml>"));
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random&#x1F; text</xml>"));
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random&#31; text</xml>"));
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random\x1f text</xml>"));
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random\x1F text</xml>"));

    /*
     * Do not test invalid multibyte characters. The output depends on php
     * version and character.
     *
     * php 5.3: always null
     * php >5.3: sometime null, sometimes the stripped string
     */

    // invalid backspace control character (U+0008) + valid multibyte character:
    // the curly quotes must survive while the control character is removed
    $this->assertEquals('<xml>“random“ text</xml>', Filter::normalizeData("<xml>“random“&#x08; text</xml>"));
    $this->assertEquals('<xml>“random“ text</xml>', Filter::normalizeData("<xml>“random“&#8; text</xml>"));
    $this->assertEquals('<xml>“random“ text</xml>', Filter::normalizeData("<xml>“random“\x08 text</xml>"));

    // do not convert valid entities to utf-8 character:
    // a quotation mark reference (decimal and hex) must come back unchanged
    $this->assertEquals('<xml attribute="&#34;value&#34;">random text</xml>', Filter::normalizeData('<xml attribute="&#34;value&#34;">random text</xml>'));
    $this->assertEquals('<xml attribute="&#x22;value&#x22;">random text</xml>', Filter::normalizeData('<xml attribute="&#x22;value&#x22;">random text</xml>'));
}