/**
 * Build the parser state from an HTML string and a rule set.
 *
 * Stores the rules, parses the HTML (prefixed with a UTF-8 XML declaration)
 * through XmlParser::getHtmlDocument() and prepares a DOMXPath instance
 * bound to the resulting document.
 *
 * @param string $html  HTML content to parse
 * @param array  $rules Rules kept on the instance for later use
 */
public function __construct($html, array $rules)
{
    $this->rules = $rules;

    // The XML declaration prefix is passed to the HTML parser together with the markup.
    $markup = '<?xml version="1.0" encoding="UTF-8">' . $html;
    $document = XmlParser::getHtmlDocument($markup);

    $this->dom = $document;
    $this->xpath = new DOMXPath($document);
}
/**
 * Detect the feed format
 *
 * Each entry of $this->formats maps a parser name to an XPath query. The
 * first query that matches exactly one node in the parsed content wins.
 *
 * @access public
 * @param  string $content Feed content
 * @return string Parser name, or an empty string when no format matches
 */
public function detectFormat($content)
{
    $dom = XmlParser::getHtmlDocument($content);
    $xpath = new DOMXPath($dom);

    foreach ($this->formats as $parser_name => $query) {
        $nodes = $xpath->query($query);

        // DOMXPath::query() returns false for a malformed expression;
        // skip it instead of reading a property on a boolean.
        if ($nodes !== false && $nodes->length === 1) {
            return $parser_name;
        }
    }

    return '';
}
/**
 * Pick the relevant content using the list of candidate attributes
 *
 * Lookup order:
 *  1. the first element whose class contains (or whose id equals) a candidate,
 *     ignoring elements whose class contains "nav" or "page";
 *  2. the first <article/> element, when fewer than 200 bytes were found;
 *  3. the <body/> element, when fewer than 50 bytes were found.
 *
 * The selected markup is stored in $this->content and then passed through
 * stripGarbage().
 *
 * @access public
 */
public function parseContentWithCandidates()
{
    $caller = get_called_class();
    $document = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">' . $this->html);
    $finder = new DOMXPath($document);

    // Step 1: walk the candidates and keep the first one that matches
    foreach ($this->candidatesAttributes as $candidate) {
        Logger::setMessage($caller . ': Try this candidate: "' . $candidate . '"');

        $expression = '//*[(contains(@class, "' . $candidate . '") or @id="' . $candidate . '") and not (contains(@class, "nav") or contains(@class, "page"))]';
        $matches = $finder->query($expression);

        if ($matches === false || $matches->length <= 0) {
            continue;
        }

        $this->content = $document->saveXML($matches->item(0));
        Logger::setMessage($caller . ': Find candidate "' . $candidate . '" (' . strlen($this->content) . ' bytes)');
        break;
    }

    // Step 2: fall back to <article/> when the result is too short
    if (strlen($this->content) < 200) {
        $articles = $finder->query('//article');

        if ($articles !== false && $articles->length > 0) {
            $this->content = $document->saveXML($articles->item(0));
            Logger::setMessage($caller . ': Find <article/> tag (' . strlen($this->content) . ' bytes)');
        }
    }

    // Step 3: last resort, take the whole <body/>
    if (strlen($this->content) < 50) {
        $bodies = $finder->query('//body');

        if ($bodies !== false && $bodies->length > 0) {
            Logger::setMessage($caller . ' No enought content fetched, get //body');
            $this->content = $document->saveXML($bodies->item(0));
        }
    }

    Logger::setMessage($caller . ': Strip garbage');
    $this->stripGarbage();
}
/**
 * Extract the icon links from the HTML.
 *
 * Collects the href attribute of every <link> element whose rel attribute
 * contains "icon" but not "apple". The rel value is compared
 * case-insensitively (ASCII), so values such as "Shortcut Icon" are found
 * and values such as "Apple-touch-icon" are excluded.
 *
 * @param string $html
 *                     HTML
 *
 * @return array List of href values, in document order (empty when $html is empty)
 */
public function extract($html)
{
    $icons = array();

    if (empty($html)) {
        return $icons;
    }

    $dom = XmlParser::getHtmlDocument($html);
    $xpath = new DOMXpath($dom);

    // XPath 1.0 has no lower-case(); translate() folds ASCII upper-case letters instead.
    $rel = "translate(@rel, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')";
    $elements = $xpath->query("//link[contains(" . $rel . ", 'icon') and not(contains(" . $rel . ", 'apple'))]");

    for ($i = 0; $i < $elements->length; ++$i) {
        $icons[] = $elements->item($i)->getAttribute('href');
    }

    return $icons;
}
/**
 * Empty input: getDomDocument() and getSimpleXml() must return false,
 * while getHtmlDocument() must return something other than false.
 */
public function testEmpty()
{
    $emptyInput = '';

    $domResult = XmlParser::getDomDocument($emptyInput);
    $this->assertFalse($domResult);

    $simpleXmlResult = XmlParser::getSimpleXml($emptyInput);
    $this->assertFalse($simpleXmlResult);

    $htmlResult = XmlParser::getHtmlDocument($emptyInput);
    $this->assertNotFalse($htmlResult);
}
/**
 * Extract the icon links from the HTML.
 *
 * Collects the href attribute of every <link> element whose rel attribute
 * is "icon", "shortcut icon" or "icon shortcut". The rel value is
 * whitespace-normalized and compared case-insensitively (ASCII), so
 * variants such as "Shortcut Icon" are found as well.
 *
 * @param string $html HTML
 *
 * @return array List of href values, in document order (empty when $html is empty)
 */
public function extract($html)
{
    $icons = array();

    if (empty($html)) {
        return $icons;
    }

    $dom = XmlParser::getHtmlDocument($html);
    $xpath = new DOMXpath($dom);

    // XPath 1.0 has no lower-case(); translate() folds ASCII upper-case letters,
    // normalize-space() trims and collapses whitespace between the keywords.
    $rel = 'translate(normalize-space(@rel), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
    $query = '//link[' . $rel . '="icon" or ' . $rel . '="shortcut icon" or ' . $rel . '="icon shortcut"]';
    $elements = $xpath->query($query);

    for ($i = 0; $i < $elements->length; ++$i) {
        $icons[] = $elements->item($i)->getAttribute('href');
    }

    return $icons;
}