/**
 * Load feed data, convert and clean it, and return its string value.
 *
 * Steps performed:
 *  1. Fetch the raw document from $url.
 *  2. Re-encode it to UTF-8 if its XML declaration names another encoding.
 *  3. Parse it and pass it through convertToRss().
 *  4. Strip markup from titles/authors/comments/links and XSS-clean descriptions.
 *  5. Run a few structural sanity checks on the resulting RSS document.
 *
 * @param string $url
 * @return string RSS xml
 * @throws OperationFailedException If the feed cannot be fetched or is not parseable XML.
 * @throws DOMDocumentException If the cleaned document fails the basic RSS structure checks.
 * @access protected
 * @since 7/8/08
 */
protected function loadFeedXml($url) {
    // Warnings are suppressed; a failed fetch is reported through the exception below.
    $feedData = @file_get_contents($url);
    if ($feedData === false || !strlen($feedData)) {
        throw new OperationFailedException("Could not access feed, '" . $url . "'.");
    }

    $feed = new DOMDocument();

    // If the XML declaration names an encoding other than UTF-8, convert the
    // document and rewrite the declaration so it matches the new byte content.
    if (preg_match('/^<\\?xml .*encoding=[\'"]([a-zA-Z0-9-]+)[\'"].*\\?>/m', $feedData, $matches)) {
        $encoding = strtoupper($matches[1]);
        if ($encoding != 'UTF8' && $encoding != 'UTF-8') {
            $feedData = mb_convert_encoding($feedData, 'UTF-8', $encoding);
            $feedData = preg_replace('/^(<\\?xml .*encoding=[\'"])([a-zA-Z0-9-]+)([\'"].*\\?>)/m', '\\1UTF-8\\3', $feedData);
        }
    }

    // Convert any remaining non-UTF-8 characters
    $string = String::withValue($feedData);
    $string->makeUtf8();
    $feedData = $string->asString();

    if (!@$feed->loadXML($feedData)) {
        throw new OperationFailedException("Invalid feed data: \"" . $feedData . "\" for URL: " . $url);
    }

    // Handle any format conversions
    $feed = $this->convertToRss($feed);

    // Schema validation against rss-2_0-lax.xsd is currently disabled here;
    // only the lightweight checks further down are applied.

    // Run through the titles, authors, and descriptions and clean out any unsafe HTML.
    // Titles and authors: decode entities, then drop every tag.
    foreach (array('title', 'author') as $tagName) {
        foreach ($feed->getElementsByTagName($tagName) as $element) {
            $element->nodeValue = strip_tags(htmlspecialchars_decode($element->nodeValue));
        }
    }

    // Comments and links: decode, drop tags, then re-encode entities.
    foreach (array('comments', 'link') as $tagName) {
        foreach ($feed->getElementsByTagName($tagName) as $element) {
            $element->nodeValue = htmlentities(strip_tags(html_entity_decode($element->nodeValue)));
        }
    }

    // Descriptions may legitimately contain HTML, so they are XSS-cleaned
    // rather than stripped, then re-escaped for storage in the node.
    foreach ($feed->getElementsByTagName('description') as $description) {
        $html = HtmlString::fromString(htmlspecialchars_decode($description->nodeValue));
        $html->cleanXSS();
        $description->nodeValue = htmlspecialchars($html->asString());
    }

    // Move the feed into a Harmoni_DOMDocument by round-tripping through its XML string.
    $tmpFeed = $feed;
    $feed = new Harmoni_DOMDocument();
    $feed->loadXML($tmpFeed->saveXML());
    unset($tmpFeed);

    // Full schema validation is disabled here as well; just ensure a few basic things.

    // The root must be <rss>. (Previously written as `!$x == 'rss'`, which negated
    // the node name before comparing and therefore could never be true.)
    if ($feed->documentElement->nodeName !== 'rss') {
        throw new DOMDocumentException("Feed root must be an rss element");
    }

    // Check for channels: every element child of the root must be a <channel>.
    foreach ($feed->documentElement->childNodes as $element) {
        if ($element->nodeType == XML_ELEMENT_NODE && $element->nodeName != 'channel') {
            throw new DOMDocumentException("'" . $element->nodeName . "' is not expected, expecting 'channel'.");
        }
    }

    // Check dates against an RFC 822-style pattern.
    // NOTE(review): the lookup uses lower-case 'pubdate' while RSS 2.0 names the
    // element 'pubDate'; DOM tag-name lookups on XML are case-sensitive, so this
    // loop probably never matches. Left unchanged because enabling it would start
    // rejecting feeds that currently load -- confirm the intended behavior.
    foreach ($feed->getElementsByTagName('pubdate') as $element) {
        if (!preg_match('/(((Mon)|(Tue)|(Wed)|(Thu)|(Fri)|(Sat)|(Sun)), *)?\\d\\d? +((Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec)) +\\d\\d(\\d\\d)? +\\d\\d:\\d\\d(:\\d\\d)? +(([+\\-]?\\d\\d\\d\\d)|(UT)|(GMT)|(EST)|(EDT)|(CST)|(CDT)|(MST)|(MDT)|(PST)|(PDT)|\\w)/', $element->nodeValue)) {
            throw new DOMDocumentException("'" . $element->nodeValue . "' is not a valid date.");
        }
    }

    return $feed->saveXMLWithWhitespace();
}