Exemple #1
0
 /**
  * Load feed data, convert and clean it, and return its string value.
  *
  * Steps performed:
  *  1. Fetch the raw document from $url.
  *  2. If the XML declaration names a non-UTF-8 encoding, transcode the
  *     document to UTF-8 and rewrite the declaration to match.
  *  3. Parse the document and pass it through $this->convertToRss().
  *  4. Strip markup from title/author/comments/link values and run
  *     description values through HtmlString::cleanXSS().
  *  5. Re-load the result into a Harmoni_DOMDocument and run a few basic
  *     structural checks (root element, channel children, date format).
  *
  * @param string $url
  * @return string RSS xml
  * @throws OperationFailedException If the URL yields no data or the data
  *         cannot be parsed as XML.
  * @throws DOMDocumentException If the converted feed fails the basic
  *         structural checks.
  * @access protected
  * @since 7/8/08
  */
 protected function loadFeedXml($url)
 {
     // Warnings are suppressed on purpose; a failed fetch is reported through
     // the exception below instead.
     $feedData = @file_get_contents($url);
     if ($feedData === false || !strlen($feedData)) {
         throw new OperationFailedException("Could not access feed, '" . $url . "'.");
     }

     // If the encoding is not UTF-8, convert the document.
     // The 'm' modifier lets '^' match the declaration at the start of any line.
     if (preg_match('/^<\\?xml .*encoding=[\'"]([a-zA-Z0-9-]+)[\'"].*\\?>/m', $feedData, $matches)) {
         $encoding = strtoupper($matches[1]);
         if ($encoding != 'UTF8' && $encoding != 'UTF-8') {
             $feedData = mb_convert_encoding($feedData, 'UTF-8', $encoding);
             // Keep the XML declaration in sync with the new byte encoding.
             $feedData = preg_replace('/^(<\\?xml .*encoding=[\'"])([a-zA-Z0-9-]+)([\'"].*\\?>)/m', '\\1UTF-8\\3', $feedData);
         }
     }

     // Convert any non-UTF-8 characters
     $string = String::withValue($feedData);
     $string->makeUtf8();
     $feedData = $string->asString();

     $feed = new DOMDocument();
     if (!@$feed->loadXML($feedData)) {
         throw new OperationFailedException("Invalid feed data: \"" . $feedData . "\" for URL: " . $url);
     }

     // Handle any format conversions
     $feed = $this->convertToRss($feed);

     // Schema validation against rss-2_0-lax.xsd is intentionally not run here;
     // only the basic checks further below are applied.

     // Run through the titles, authors, and descriptions and clean out any unsafe HTML
     foreach (array('title', 'author') as $tagName) {
         foreach ($feed->getElementsByTagName($tagName) as $element) {
             $element->nodeValue = strip_tags(htmlspecialchars_decode($element->nodeValue));
         }
     }
     foreach (array('comments', 'link') as $tagName) {
         foreach ($feed->getElementsByTagName($tagName) as $element) {
             $element->nodeValue = htmlentities(strip_tags(html_entity_decode($element->nodeValue)));
         }
     }
     foreach ($feed->getElementsByTagName('description') as $description) {
         $html = HtmlString::fromString(htmlspecialchars_decode($description->nodeValue));
         $html->cleanXSS();
         $description->nodeValue = htmlspecialchars($html->asString());
     }

     // Move the feed into a Harmoni_DOMDocument by round-tripping it through
     // its serialized form.
     $tmpFeed = $feed;
     $feed = new Harmoni_DOMDocument();
     $feed->loadXML($tmpFeed->saveXML());
     unset($tmpFeed);

     // Just ensure a few basic things:
     // (The comparison must be '!==': writing '!$x == ...' negates $x first
     // and the check then never fires.)
     if ($feed->documentElement->nodeName !== 'rss') {
         throw new DOMDocumentException("Feed root must be an rss element");
     }

     // Check for channels: every element child of the root must be a <channel>.
     foreach ($feed->documentElement->childNodes as $element) {
         if ($element->nodeType == XML_ELEMENT_NODE && $element->nodeName != 'channel') {
             throw new DOMDocumentException("'" . $element->nodeName . "' is not expected, expecting 'channel'.");
         }
     }

     // Check dates
     // NOTE(review): XML element names are case-sensitive and RSS 2.0 spells
     // this element 'pubDate', so the lookup for 'pubdate' may never match
     // anything. Confirm against what convertToRss() emits before changing it,
     // since enabling the check would start rejecting feeds with odd dates.
     foreach ($feed->getElementsByTagName('pubdate') as $element) {
         if (!preg_match('/(((Mon)|(Tue)|(Wed)|(Thu)|(Fri)|(Sat)|(Sun)), *)?\\d\\d? +((Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec)) +\\d\\d(\\d\\d)? +\\d\\d:\\d\\d(:\\d\\d)? +(([+\\-]?\\d\\d\\d\\d)|(UT)|(GMT)|(EST)|(EDT)|(CST)|(CDT)|(MST)|(MDT)|(PST)|(PDT)|\\w)/', $element->nodeValue)) {
             throw new DOMDocumentException("'" . $element->nodeValue . "' is not a valid date.");
         }
     }

     return $feed->saveXMLWithWhitespace();
 }