Exemple #1
0
 /**
  * Build the instance from raw markup and a rule set.
  *
  * Keeps the rules on the instance, parses the markup through
  * XmlParser::getHtmlDocument() and prepares a DOMXPath bound to the
  * resulting document.
  *
  * @param string $html  Markup to parse; an XML declaration announcing UTF-8 is prepended before parsing
  * @param array  $rules Rule set stored in $this->rules
  */
 public function __construct($html, array $rules)
 {
     $this->rules = $rules;

     // The declaration is prepended so the parser is told the markup is UTF-8.
     $markup = '<?xml version="1.0" encoding="UTF-8">' . $html;
     $this->dom = XmlParser::getHtmlDocument($markup);

     $this->xpath = new DOMXPath($this->dom);
 }
Exemple #2
0
 /**
  * Detect the feed format
  *
  * Each entry of $this->formats maps a parser name to an XPath expression.
  * The expressions are evaluated in order against the parsed content and the
  * name of the first one that selects exactly one node is returned.
  *
  * @access public
  * @param  string    $content     Feed content
  * @return string                 Parser name, or an empty string when no format matches
  */
 public function detectFormat($content)
 {
     $dom = XmlParser::getHtmlDocument($content);
     $xpath = new DOMXPath($dom);

     foreach ($this->formats as $parser_name => $query) {
         $nodes = $xpath->query($query);

         // DOMXPath::query() returns false when the expression cannot be
         // evaluated; treat that as "no match" instead of reading ->length on it.
         if ($nodes !== false && $nodes->length === 1) {
             return $parser_name;
         }
     }

     return '';
 }
Exemple #3
0
 /**
  * Select the most relevant fragment of the page and store it in $this->content.
  *
  * The entries of $this->candidatesAttributes are tried in order against the
  * class and id attributes (elements whose class contains "nav" or "page" are
  * ignored); the first element found is kept. If the content is then shorter
  * than 200 bytes, the first <article> element replaces it when one exists.
  * If it is still shorter than 50 bytes, the <body> element replaces it when
  * one exists. stripGarbage() is always called at the end.
  *
  * @access public
  */
 public function parseContentWithCandidates()
 {
     $document = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">' . $this->html);
     $finder = new DOMXPath($document);
     $caller = get_called_class();

     // Probe every candidate until one of them selects at least one element
     foreach ($this->candidatesAttributes as $candidate) {
         Logger::setMessage($caller . ': Try this candidate: "' . $candidate . '"');

         $quoted = '"' . $candidate . '"';
         $expression = '//*[(contains(@class, ' . $quoted . ') or @id=' . $quoted . ') and not (contains(@class, "nav") or contains(@class, "page"))]';
         $matches = $finder->query($expression);

         if ($matches === false || $matches->length < 1) {
             continue;
         }

         $this->content = $document->saveXML($matches->item(0));
         Logger::setMessage($caller . ': Find candidate "' . $candidate . '" (' . strlen($this->content) . ' bytes)');
         break;
     }

     // First fallback: the <article/> element
     if (strlen($this->content) < 200) {
         $articles = $finder->query('//article');

         if ($articles !== false && $articles->length > 0) {
             $this->content = $document->saveXML($articles->item(0));
             Logger::setMessage($caller . ': Find <article/> tag (' . strlen($this->content) . ' bytes)');
         }
     }

     // Last fallback: the whole <body/>
     if (strlen($this->content) < 50) {
         $bodies = $finder->query('//body');

         if ($bodies !== false && $bodies->length > 0) {
             Logger::setMessage($caller . ' No enought content fetched, get //body');
             $this->content = $document->saveXML($bodies->item(0));
         }
     }

     Logger::setMessage($caller . ': Strip garbage');
     $this->stripGarbage();
 }
Exemple #4
0
 /**
  * Extract the icon links from the HTML.
  *
  * Collects the href attribute of every <link> element whose rel attribute
  * contains "icon" but not "apple". The rel value is lowercased before the
  * comparison because HTML link types are ASCII case-insensitive, so
  * rel="SHORTCUT ICON" is accepted and rel="Apple-Touch-Icon" is rejected.
  *
  * @param string $html HTML
  *
  * @return array List of href values, in document order; empty when $html is empty
  */
 public function extract($html)
 {
     $icons = array();
     if (empty($html)) {
         return $icons;
     }

     $dom = XmlParser::getHtmlDocument($html);
     $xpath = new DOMXpath($dom);

     // XPath 1.0 has no lower-case(); translate() maps A-Z onto a-z instead.
     $rel = "translate(@rel, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')";
     $elements = $xpath->query('//link[contains(' . $rel . ", 'icon') and not(contains(" . $rel . ", 'apple'))]");

     foreach ($elements as $element) {
         $icons[] = $element->getAttribute('href');
     }

     return $icons;
 }
 /**
  * Check what each XmlParser factory returns for an empty string:
  * getDomDocument() and getSimpleXml() must return false, while
  * getHtmlDocument() must return something other than false.
  */
 public function testEmpty()
 {
     $emptyInput = '';

     $domDocument = XmlParser::getDomDocument($emptyInput);
     $this->assertFalse($domDocument);

     $simpleXml = XmlParser::getSimpleXml($emptyInput);
     $this->assertFalse($simpleXml);

     $htmlDocument = XmlParser::getHtmlDocument($emptyInput);
     $this->assertNotFalse($htmlDocument);
 }
Exemple #6
0
 /**
  * Extract the icon links from the HTML.
  *
  * Collects the href attribute of every <link> element whose rel attribute is
  * "icon", "shortcut icon" or "icon shortcut". The rel value is lowercased
  * before the comparison because HTML link types are ASCII case-insensitive,
  * so rel="Shortcut Icon" is accepted as well.
  *
  * @param string $html HTML
  *
  * @return array List of href values, in document order; empty when $html is empty
  */
 public function extract($html)
 {
     $icons = array();
     if (empty($html)) {
         return $icons;
     }

     $dom = XmlParser::getHtmlDocument($html);
     $xpath = new DOMXpath($dom);

     // XPath 1.0 has no lower-case(); translate() maps A-Z onto a-z instead.
     $rel = 'translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
     $elements = $xpath->query('//link[' . $rel . '="icon" or ' . $rel . '="shortcut icon" or ' . $rel . '="icon shortcut"]');

     foreach ($elements as $element) {
         $icons[] = $element->getAttribute('href');
     }

     return $icons;
 }