/**
 * Extract the most relevant part of the page into $this->content
 *
 * Lookup order:
 *  1. every entry of $this->candidatesAttributes, matched as a class
 *     substring or as an exact id (elements whose class contains "nav"
 *     or "page" are skipped) - the first hit wins;
 *  2. the first <article/> element, when step 1 left the content empty;
 *  3. the first child of the document, when fewer than 50 bytes have
 *     been collected so far.
 *
 * stripGarbage() is always called at the end.
 *
 * @access public
 */
public function parseContentWithCandidates()
{
    $caller = get_called_class();

    // NOTE(review): the XML-style prefix is presumably an encoding hint for
    // the HTML loader - confirm against XmlParser::getHtmlDocument()
    $document = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">' . $this->html);
    $finder = new DOMXPath($document);

    // Walk through the candidates and stop at the first one that matches
    foreach ($this->candidatesAttributes as $attribute) {

        Logging::setMessage($caller . ' Try this candidate: "' . $attribute . '"');

        $expression = '//*[(contains(@class, "' . $attribute . '") or @id="' . $attribute . '") and not (contains(@class, "nav") or contains(@class, "page"))]';
        $matches = $finder->query($expression);

        // Invalid expression or nothing found: move on to the next candidate
        if ($matches === false || $matches->length <= 0) {
            continue;
        }

        $this->content = $document->saveXML($matches->item(0));
        Logging::setMessage($caller . ' Find candidate "' . $attribute . '" (' . strlen($this->content) . ' bytes)');

        break;
    }

    // No candidate matched: fall back to the first <article/> element
    if (!$this->content) {

        $articles = $finder->query('//article');

        if ($articles !== false && $articles->length > 0) {
            $this->content = $document->saveXML($articles->item(0));
            Logging::setMessage($caller . ' Find <article/> tag (' . strlen($this->content) . ' bytes)');
        }
    }

    // Still (almost) nothing: serialize the first child of the document
    if (strlen($this->content) < 50) {
        Logging::setMessage($caller . ' No enought content fetched, get the full body');
        $this->content = $document->saveXML($document->firstChild);
    }

    Logging::setMessage($caller . ' Strip garbage');
    $this->stripGarbage();
}
/**
 * Discover the feed url inside a HTML document and download the feed
 *
 * Searches $this->content for the first <link/> element typed as RSS, then
 * as Atom. The first non-empty href found is turned into an absolute URL
 * (see resolveDiscoveredLink()) and handed to download().
 *
 * @access public
 * @return boolean  True when a feed link was found and passed to download(), false otherwise
 */
public function discover()
{
    if (!$this->content) {
        return false;
    }

    Logging::setMessage(get_called_class() . ': Try to discover a subscription');

    $dom = XmlParser::getHtmlDocument($this->content);
    $xpath = new DOMXPath($dom);

    $queries = array(
        '//link[@type="application/rss+xml"]',
        '//link[@type="application/atom+xml"]',
    );

    foreach ($queries as $query) {

        $nodes = $xpath->query($query);

        // DOMXPath::query() may return false, guard before reading ->length
        if ($nodes === false || $nodes->length === 0) {
            continue;
        }

        // Surrounding whitespace is not part of the URL
        $link = trim($nodes->item(0)->getAttribute('href'));

        if (empty($link)) {
            continue;
        }

        $link = $this->resolveDiscoveredLink($link);

        Logging::setMessage(get_called_class() . ': Find subscription link: ' . $link);
        $this->download($link);

        return true;
    }

    return false;
}

/**
 * Turn a feed link found in the page into an absolute URL
 *
 * - absolute http(s) links are returned unchanged;
 * - protocol-relative links ("//host/feed") inherit the scheme of $this->url
 *   ("http" when $this->url has no usable scheme);
 * - root-relative links ("/feed.xml") are resolved against the
 *   scheme/host/port of $this->url;
 * - anything else is appended to $this->url, separated by a single slash.
 *
 * $this->url itself is never modified.
 *
 * @access private
 * @param  string  $link  Non-empty href value
 * @return string
 */
private function resolveDiscoveredLink($link)
{
    // Already absolute
    if (preg_match('#^https?://#i', $link) === 1) {
        return $link;
    }

    $base = (string) $this->url;
    $parts = parse_url($base);
    $hasOrigin = is_array($parts) && isset($parts['scheme'], $parts['host']);

    // Protocol-relative link: only the scheme is missing
    if (strpos($link, '//') === 0) {
        return ($hasOrigin ? $parts['scheme'] : 'http') . ':' . $link;
    }

    // Root-relative link: resolve against the site origin, not the page path
    if ($link[0] === '/' && $hasOrigin) {

        $origin = $parts['scheme'] . '://';

        // Keep credentials embedded in the page URL, if any
        if (isset($parts['user'])) {
            $origin .= $parts['user'];

            if (isset($parts['pass'])) {
                $origin .= ':' . $parts['pass'];
            }

            $origin .= '@';
        }

        $origin .= $parts['host'];

        if (isset($parts['port'])) {
            $origin .= ':' . $parts['port'];
        }

        return $origin . $link;
    }

    // Path-relative link (or base URL without a usable origin):
    // join the page URL and the link with exactly one slash
    if ($link[0] === '/') {
        $link = substr($link, 1);
    }

    if ($base === '' || substr($base, -1) !== '/') {
        $base .= '/';
    }

    return $base . $link;
}