Exemplo n.º 1
0
 /**
  * Fetches $url and returns a feed parser matching the document found there.
  *
  * The document is first parsed as XML. If that fails with libxml error 9
  * (invalid character), the offending bytes are stripped and the parse is
  * retried once. The root element name then selects the parser: "feed"
  * yields an AtomParser, "rss" or "RDF" yields an RssParser.
  *
  * When $followHtml is true and the document either did not parse as XML or
  * has an "html" root, the links returned by FeedParser::extractLinks() are
  * consulted and the first one is followed recursively. The URL(s) of the
  * page that led to the feed are appended to the resulting parser's $links.
  *
  * @param string $url        Address of the document to fetch.
  * @param bool   $followHtml Whether to follow feed links found in HTML pages.
  * @param int    $recursions Current link-following depth; callers normally
  *                           leave this at its default.
  *
  * @return AtomParser|RssParser|null The populated parser, or null when no
  *                                   feed could be identified.
  *
  * @throws Exception With code FeedParser::ERROR_TOO_MANY_REDIRECTS when
  *                   $recursions exceeds 2, or FeedParser::ERROR_EMPTY_DOCUMENT
  *                   when the fetched document is empty.
  */
 public static function create($url, $followHtml = false, $recursions = 0)
 {
     if ($recursions > 2) {
         throw new Exception("Download error", FeedParser::ERROR_TOO_MANY_REDIRECTS);
     }
     // NOTE(review): $url is never reassigned in this method, so $originalUrl
     // only differs if fetchDocument() takes $url by reference (e.g. to report
     // the final address after redirects) - confirm against fetchDocument().
     $originalUrl = $url;
     $document = FeedParser::fetchDocument($url);
     if (!$document) {
         throw new Exception("Document is empty", FeedParser::ERROR_EMPTY_DOCUMENT);
     }

     // First pass - parse it as valid XML.
     $xmlDocument = null;
     $parser = null;
     libxml_use_internal_errors(true);
     // Discard errors buffered by earlier parses so the check below only
     // reacts to errors produced by this document.
     libxml_clear_errors();
     try {
         $xmlDocument = @new SimpleXMLElement($document);
     } catch (Exception $e) {
         // Not well-formed XML; handled below.
     }

     if ($xmlDocument === null) {
         // Document didn't parse as valid XML.
         $errors = libxml_get_errors();
         libxml_clear_errors();
         foreach ($errors as $error) {
             if ($error->code === 9) {
                 // PCDATA invalid char value. The pattern removes every byte in
                 // 0x00-0x1f and 0x80-0xff, so tabs, newlines and all non-ASCII
                 // bytes are dropped from the document in this fallback.
                 $document = preg_replace('/[\\x00-\\x1f\\x80-\\xff]/', '', $document);
                 // Reparse the cleaned document (one attempt only).
                 try {
                     $xmlDocument = @new SimpleXMLElement($document);
                 } catch (Exception $e) {
                     // Still not XML; fall through to the HTML handling.
                 }
                 break;
             }
         }
     }

     // Decide whether the document should be treated as an HTML page that may
     // advertise a feed, or whether it already is a feed.
     $treatAsHtml = false;
     if ($xmlDocument === null) {
         // Not sure if this is ideal, but blindly assume a non-XML document is
         // HTML and try to parse any rel=alternate links out of it.
         $treatAsHtml = true;
     } elseif ($xmlDocument) {
         // Valid XML. See if we can determine the type of content.
         $rootName = $xmlDocument->getName();
         if ($rootName === 'feed') {
             $parser = new AtomParser();
         } elseif ($rootName === 'rss' || $rootName === 'RDF') {
             $parser = new RssParser();
         } else {
             $treatAsHtml = strcasecmp($rootName, 'html') === 0;
         }
     }

     if ($followHtml && $treatAsHtml) {
         $links = FeedParser::extractLinks($url, $document);
         if (count($links) > 0) {
             // Follow the first advertised link. Recursing on $url itself would
             // just refetch the same page until the recursion limit is hit.
             $parser = FeedParser::create($links[0]->url, $followHtml, $recursions + 1);
             if ($parser) {
                 $parser->links[] = $url;
                 if ($url != $originalUrl) {
                     $parser->links[] = $originalUrl;
                 }
             }
             return $parser;
         }
     }

     if ($parser) {
         $parser->url = $url;
         $parser->document = $document;
         $parser->xml = $xmlDocument;
     }
     return $parser;
 }