/**
 * Finds potential feed tags in an HTML document.
 *
 * The document is parsed with DOMDocument and every <link> element is handed
 * to a FeedSet together with the base URL. The first of the 'atom', 'rss' or
 * 'rdf' entries that the FeedSet exposes is returned.
 *
 * @param string $url
 *   The URL of the document, to use as a base URL.
 * @param string $html
 *   The HTML document to search.
 *
 * @return string|false
 *   The URL of the first feed link found, or false if unable to find a link.
 *   An empty or whitespace-only document, or one that cannot be parsed, also
 *   yields false.
 */
public static function findFeed($url, $html) {
  $html = trim((string) $html);

  // DOMDocument::loadHTML() rejects an empty string: it raises a warning
  // before PHP 8 and throws a ValueError from PHP 8 on. Either way there is
  // nothing to search, so report "no feed found" instead.
  if ($html === '') {
    return FALSE;
  }

  // Collect libxml parse errors internally instead of emitting PHP warnings
  // for the malformed markup that real-world pages routinely contain.
  $use_error = libxml_use_internal_errors(TRUE);

  // libxml_disable_entity_loader() is deprecated as of PHP 8.0, where
  // external entity loading is already disabled by default. Only toggle it
  // on older runtimes to avoid a deprecation notice on every call.
  $toggle_entity_loader = \PHP_VERSION_ID < 80000;
  if ($toggle_entity_loader) {
    $entity_loader = libxml_disable_entity_loader(TRUE);
  }

  $dom = new \DOMDocument();
  $status = $dom->loadHTML($html);

  // Restore the global libxml state exactly as the caller left it.
  if ($toggle_entity_loader) {
    libxml_disable_entity_loader($entity_loader);
  }
  libxml_use_internal_errors($use_error);

  if (!$status) {
    return FALSE;
  }

  $feed_set = new FeedSet();
  $feed_set->addLinks($dom->getElementsByTagName('link'), $url);

  // Load the first feed type found.
  foreach (['atom', 'rss', 'rdf'] as $feed_type) {
    if (isset($feed_set->{$feed_type})) {
      return $feed_set->{$feed_type};
    }
  }

  return FALSE;
}
/**
 * Find feed links
 *
 * Requests $uri through the configured HTTP client, parses the response body
 * as HTML and passes every <link> element, together with $uri, to a new
 * FeedSet via FeedSet::addLinks().
 *
 * @param  string $uri URI of the HTML page to inspect
 * @return FeedSet
 * @throws Exception\RuntimeException If the response status is not 200, or
 *                                    the body is empty or cannot be parsed
 */
public static function findFeedLinks($uri)
{
    $client = static::getHttpClient();
    $client->setUri($uri);
    $response = $client->send();
    if ($response->getStatusCode() !== 200) {
        throw new Exception\RuntimeException(
            "Failed to access {$uri}, got response code " . $response->getStatusCode()
        );
    }

    $responseHtml = trim((string) $response->getBody());

    // DOMDocument::loadHTML() throws a ValueError for an empty string on
    // PHP 8+. Report it through the documented exception type instead.
    if ($responseHtml === '') {
        throw new Exception\RuntimeException(
            "DOMDocument cannot parse HTML: Please check the XML document's validity"
        );
    }

    // Collect parse errors internally rather than emitting PHP warnings.
    $libxmlErrflag = libxml_use_internal_errors(true);

    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, where
    // external entity loading is already disabled by default; only toggle
    // it on older runtimes.
    $toggleEntityLoader = \PHP_VERSION_ID < 80000;
    if ($toggleEntityLoader) {
        $oldValue = libxml_disable_entity_loader(true);
    }

    $dom    = new DOMDocument();
    $status = $dom->loadHTML($responseHtml);

    // Read the error before restoring the libxml state: turning internal
    // error handling back off clears the errors collected so far.
    $error = $status ? false : libxml_get_last_error();

    if ($toggleEntityLoader) {
        libxml_disable_entity_loader($oldValue);
    }
    libxml_use_internal_errors($libxmlErrflag);

    if (!$status) {
        // Build error message
        if ($error && $error->message) {
            $error->message = trim($error->message);
            $errormsg       = "DOMDocument cannot parse HTML: {$error->message}";
        } else {
            $errormsg = "DOMDocument cannot parse HTML: Please check the XML document's validity";
        }
        throw new Exception\RuntimeException($errormsg);
    }

    $feedSet = new FeedSet();
    $links   = $dom->getElementsByTagName('link');
    $feedSet->addLinks($links, $uri);

    return $feedSet;
}
/**
 * Find feed links
 *
 * Requests $uri through the configured HTTP client, parses the response body
 * as HTML and passes every <link> element, together with $uri, to a new
 * FeedSet via FeedSet::addLinks().
 *
 * @param  string $uri URI of the HTML page to inspect
 * @return FeedSet
 * @throws Exception If the response status is not 200, or the body is empty
 *                   or cannot be parsed
 */
public static function findFeedLinks($uri)
{
    $client = self::getHttpClient();
    $client->setUri($uri);
    $response = $client->request();
    if ($response->getStatus() !== 200) {
        throw new Exception("Failed to access {$uri}, got response code " . $response->getStatus());
    }

    $responseHtml = trim((string) $response->getBody());

    // DOMDocument::loadHTML() throws a ValueError for an empty string on
    // PHP 8+. Report it through the same exception type as other failures.
    if ($responseHtml === '') {
        throw new Exception("\\DOMDocument cannot parse HTML: Please check the XML document's validity");
    }

    // Collect parse errors internally rather than emitting PHP warnings.
    $libxml_errflag = libxml_use_internal_errors(true);

    // The document comes from a remote server, so external entities must not
    // be resolved while parsing. libxml_disable_entity_loader() is deprecated
    // as of PHP 8.0, where entity loading is already disabled by default, so
    // it is only toggled on older runtimes.
    $toggleEntityLoader = \PHP_VERSION_ID < 80000;
    if ($toggleEntityLoader) {
        $oldEntityLoader = libxml_disable_entity_loader(true);
    }

    $dom    = new \DOMDocument();
    $status = $dom->loadHTML($responseHtml);

    // Read the error before restoring the libxml state: turning internal
    // error handling back off clears the errors collected so far.
    $error = $status ? false : libxml_get_last_error();

    if ($toggleEntityLoader) {
        libxml_disable_entity_loader($oldEntityLoader);
    }
    libxml_use_internal_errors($libxml_errflag);

    if (!$status) {
        if ($error && $error->message) {
            $errormsg = "\\DOMDocument cannot parse HTML: {$error->message}";
        } else {
            $errormsg = "\\DOMDocument cannot parse HTML: Please check the XML document's validity";
        }
        throw new Exception($errormsg);
    }

    $feedSet = new FeedSet();
    $links   = $dom->getElementsByTagName('link');
    $feedSet->addLinks($links, $uri);

    return $feedSet;
}