/**
 * Fetches a document and builds the parser matching the kind of feed it contains.
 *
 * The document is parsed as XML and the root element name selects the parser:
 * "feed" yields an AtomParser, "rss" or "RDF" yields an RssParser. When
 * $followHtml is set and the document is either not parseable as XML or has an
 * "html" root, the first link returned by FeedParser::extractLinks() is
 * followed by calling this method recursively.
 *
 * On success the returned parser has its url, document and xml properties set;
 * when a link was followed, the address(es) of the referring page are appended
 * to the parser's links property.
 *
 * @param string $url        Address of the document to fetch.
 * @param bool   $followHtml Whether to follow feed links found in non-feed documents.
 * @param int    $recursions Number of links followed so far; callers normally omit it.
 *
 * @return AtomParser|RssParser|null The parser, or null when no feed was recognised.
 *
 * @throws Exception With code FeedParser::ERROR_TOO_MANY_REDIRECTS once more than
 *                   two links have been followed, or FeedParser::ERROR_EMPTY_DOCUMENT
 *                   when the fetched document is empty.
 */
public static function create($url, $followHtml = false, $recursions = 0)
{
    if ($recursions > 2) {
        throw new Exception("Download error", FeedParser::ERROR_TOO_MANY_REDIRECTS);
    }

    $originalUrl = $url;
    // NOTE(review): the later $url != $originalUrl checks only make sense if
    // fetchDocument() can update $url by reference (e.g. after an HTTP
    // redirect) - confirm against its signature.
    $document = FeedParser::fetchDocument($url);
    if (!$document) {
        throw new Exception("Document is empty", FeedParser::ERROR_EMPTY_DOCUMENT);
    }

    // First pass - parse it as valid XML
    $xmlDocument = null;
    $parser = null;

    libxml_use_internal_errors(true);
    // The libxml error buffer is global and cumulative. Empty it so that the
    // inspection below only sees errors raised by this document, not leftovers
    // from an earlier parse (including a previous recursion level).
    libxml_clear_errors();

    try {
        $xmlDocument = @new SimpleXMLElement($document);
    } catch (Exception $e) {
        // Deliberately ignored: a parse failure leaves $xmlDocument null and
        // is handled by the recovery and HTML fallbacks below.
    }

    if ($xmlDocument === null) {
        // Document didn't parse as valid XML
        foreach (libxml_get_errors() as $error) {
            if ($error->code == 9) {
                // PCDATA Invalid char value: strip control and high bytes,
                // then retry once. Tab, LF and CR are legal XML characters and
                // are kept, because removing them can join tokens that were
                // separated only by whitespace and break the markup further.
                $document = preg_replace('/[\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f\\x80-\\xff]/', '', $document);

                // Reparse the document
                try {
                    $xmlDocument = @new SimpleXMLElement($document);
                } catch (Exception $e) {
                    // Still not XML; fall through to the HTML handling.
                }

                break;
            }
        }
    }

    if ($xmlDocument === null && $followHtml) {
        // Not sure if this is ideal, but let's just blindly assume
        // this is an HTML document and try to parse any rel=alternate
        // links
        $links = FeedParser::extractLinks($url, $document);
        if (count($links) > 0) {
            if ($parser = FeedParser::create($links[0]->url, $followHtml, $recursions + 1)) {
                $parser->links[] = $url;
                if ($url != $originalUrl) {
                    $parser->links[] = $originalUrl;
                }
            }

            return $parser;
        }
    }

    if ($xmlDocument) {
        // Valid XML. See if we can determine the type of content
        $rootName = $xmlDocument->getName();

        if ($rootName == 'feed') {
            $parser = new AtomParser();
        } elseif ($rootName == 'rss' || $rootName == 'RDF') {
            $parser = new RssParser();
        } elseif ($followHtml && strcasecmp($rootName, 'html') === 0) {
            // HTML document. See if we can find a feed by parsing the HTML
            $links = FeedParser::extractLinks($url, $document);
            if (count($links) > 0) {
                // Follow the discovered link. Recursing with $url would fetch
                // this same page again until the recursion guard throws.
                if ($parser = FeedParser::create($links[0]->url, $followHtml, $recursions + 1)) {
                    $parser->links[] = $url;
                    if ($url != $originalUrl) {
                        $parser->links[] = $originalUrl;
                    }
                }

                return $parser;
            }
        }
    }

    if ($parser) {
        $parser->url = $url;
        $parser->document = $document;
        $parser->xml = $xmlDocument;
    }

    return $parser;
}