Example #1
0
 /**
  * Set up the filter state from raw HTML and the site URL.
  *
  * All input data must already be encoded in UTF-8.
  *
  * @access public
  * @param  string  $html      HTML content
  * @param  string  $website   Site URL (used to build absolute URL)
  */
 public function __construct($html, $website)
 {
     // The HTML is converted by XmlParser::HtmlToXml() before being stored.
     $this->input = XmlParser::HtmlToXml($html);

     // The output buffer starts empty.
     $this->output = '';

     $this->tag = new Tag();

     // The attribute helper receives the site URL wrapped in a Url object.
     $siteUrl = new Url($website);
     $this->attribute = new Attribute($siteUrl);
 }
Example #2
0
 /**
  * XmlParser::getNamespaceValue() must return the "creator" value of the
  * first two items of the rue89 fixture.
  */
 public function testNamespaceValue()
 {
     $feed = XmlParser::getSimpleXml(file_get_contents('tests/fixtures/rue89.xml'));
     $this->assertNotFalse($feed);

     $declaredNamespaces = $feed->getNamespaces(true);

     // NOTE(review): this instance is never used below; it is kept as in the
     // original. Confirm the constructor has no needed side effect before removing.
     $parser = new Rss20('');

     // Item index => expected creator.
     $expectedCreators = array(
         0 => 'Blandine Grosjean',
         1 => 'Pierre-Carl Langlais',
     );

     foreach ($expectedCreators as $position => $creator) {
         $this->assertEquals($creator, XmlParser::getNamespaceValue($feed->channel->item[$position], $declaredNamespaces, 'creator'));
     }
 }
 /**
  * Parse the OPML data held in $this->data.
  *
  * The parsed document must expose both a "head" and a "body" child;
  * otherwise the data is rejected.
  *
  * @access public
  * @throws MalformedXmlException  When the data cannot be parsed or lacks head/body
  * @return SubscriptionList
  */
 public function parse()
 {
     $document = XmlParser::getSimpleXml($this->data);

     // Usable only if parsing succeeded and both sections are present.
     $isUsable = $document && isset($document->head) && isset($document->body);

     if (!$isUsable) {
         throw new MalformedXmlException('Unable to parse OPML file: invalid XML');
     }

     $this->parseTitle($document->head);
     $this->parseEntries($document->body);

     return $this->subscriptionList;
 }
Example #4
0
 /**
  * Parse the OPML content and collect its entries.
  *
  * Progress and failures are reported through Logger::setMessage().
  *
  * @return array|false  $this->items on success, false when the document is
  *                      unparsable, is not rooted at "opml" or has no body
  */
 public function execute()
 {
     $caller = get_called_class();
     Logger::setMessage($caller . ': start importation');

     $document = XmlParser::getSimpleXml(trim($this->content));

     // A valid document parses, has an <opml> root and a body child.
     $isOpml = $document !== false
         && $document->getName() === 'opml'
         && isset($document->body);

     if (!$isOpml) {
         Logger::setMessage($caller . ': OPML tag not found or malformed XML document');

         return false;
     }

     $this->parseEntries($document->body);
     Logger::setMessage($caller . ': ' . count($this->items) . ' subscriptions found');

     return $this->items;
 }
Example #5
0
 /**
  * Set the item language from its dc:language element.
  *
  * When the first XPath match casts to a falsy string, the feed language
  * is used instead.
  *
  * @param SimpleXMLElement      $entry Feed item
  * @param \PicoFeed\Parser\Item $item  Item object
  * @param \PicoFeed\Parser\Feed $feed  Feed object
  */
 public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
 {
     $matches = XmlParser::getXPathResult($entry, 'dc:language', $this->namespaces);
     $itemLanguage = (string) current($matches);

     if ($itemLanguage) {
         $item->language = $itemLanguage;
     } else {
         // Nothing usable on the item: inherit from the feed.
         $item->language = $feed->language;
     }
 }
Example #6
0
 /**
  * Fill the enclosure URL and type of an item.
  *
  * Does nothing when the entry has no enclosure element. The namespaced
  * "origEnclosureLink" value is preferred for the URL; the enclosure "url"
  * attribute is the fallback. The URL is finally resolved against the
  * feed site URL.
  *
  * @access public
  * @param  SimpleXMLElement          $entry   Feed item
  * @param  \PicoFeed\Parser\Item     $item    Item object
  * @param  \PicoFeed\Parser\Feed     $feed    Feed object
  */
 public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
 {
     if (!isset($entry->enclosure)) {
         return;
     }

     $enclosure = $entry->enclosure;

     $item->enclosure_url = XmlParser::getNamespaceValue($enclosure, $this->namespaces, 'origEnclosureLink');

     if (empty($item->enclosure_url)) {
         // No namespaced link: fall back to the plain "url" attribute.
         if (isset($enclosure['url'])) {
             $item->enclosure_url = (string) $enclosure['url'];
         } else {
             $item->enclosure_url = '';
         }
     }

     if (isset($enclosure['type'])) {
         $item->enclosure_type = (string) $enclosure['type'];
     } else {
         $item->enclosure_type = '';
     }

     $item->enclosure_url = Url::resolve($item->enclosure_url, $feed->getSiteUrl());
 }
Example #7
0
 /**
  * Parse the document stored in $this->content and build a Feed object.
  *
  * If the first parsing attempt fails, the content is passed through
  * Filter::normalizeData() and parsed a second time. Feed-level fields are
  * filled first, then each entry returned by getItemsTree() is turned into
  * an Item, post-processed and appended to the feed.
  *
  * @throws MalformedXmlException When the content cannot be parsed even after normalization
  * @return \PicoFeed\Parser\Feed
  */
 public function execute()
 {
     Logger::setMessage(get_called_class() . ': begin parsing');
     $xml = XmlParser::getSimpleXml($this->content);
     if ($xml === false) {
         // First attempt failed: normalize the content and retry once.
         Logger::setMessage(get_called_class() . ': Applying XML workarounds');
         $this->content = Filter::normalizeData($this->content);
         $xml = XmlParser::getSimpleXml($this->content);
         if ($xml === false) {
             // Still unparsable: log the parser errors and give up.
             Logger::setMessage(get_called_class() . ': XML parsing error');
             Logger::setMessage(XmlParser::getErrors());
             throw new MalformedXmlException('XML parsing error');
         }
     }
     // Remember the namespaces declared anywhere in the document (recursive lookup).
     $this->used_namespaces = $xml->getNamespaces(true);
     $xml = $this->registerSupportedNamespaces($xml);
     $feed = new Feed();
     // Feed-level metadata; each check* call follows the matching find* call.
     $this->findFeedUrl($xml, $feed);
     $this->checkFeedUrl($feed);
     $this->findSiteUrl($xml, $feed);
     $this->checkSiteUrl($feed);
     $this->findFeedTitle($xml, $feed);
     $this->findFeedDescription($xml, $feed);
     $this->findFeedLanguage($xml, $feed);
     $this->findFeedId($xml, $feed);
     $this->findFeedDate($xml, $feed);
     $this->findFeedLogo($xml, $feed);
     $this->findFeedIcon($xml, $feed);
     foreach ($this->getItemsTree($xml) as $entry) {
         $entry = $this->registerSupportedNamespaces($entry);
         $item = new Item();
         // Keep the raw entry and the document namespaces on the item.
         $item->xml = $entry;
         $item->namespaces = $this->used_namespaces;
         $this->findItemAuthor($xml, $entry, $item);
         $this->findItemUrl($entry, $item);
         $this->checkItemUrl($feed, $item);
         $this->findItemTitle($entry, $item);
         $this->findItemContent($entry, $item);
         // Id generation can use the item url/title/content (order is important)
         $this->findItemId($entry, $item, $feed);
         $this->findItemDate($entry, $item, $feed);
         $this->findItemEnclosure($entry, $item, $feed);
         $this->findItemLanguage($entry, $item, $feed);
         // Post-processing runs last, once every item field has been looked up.
         $this->itemPostProcessor->execute($feed, $item);
         $feed->items[] = $item;
     }
     // The feed is concatenated into the log message (string conversion of $feed).
     Logger::setMessage(get_called_class() . PHP_EOL . $feed);
     return $feed;
 }
Example #8
0
 /**
  * Set the feed language.
  *
  * The value is read from the raw content ($this->content) through
  * XmlParser::getXmlLang(); the $xml argument is not used here.
  *
  * @access public
  * @param  SimpleXMLElement          $xml     Feed xml
  * @param  \PicoFeed\Parser\Feed     $feed    Feed object
  */
 public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
 {
     $language = XmlParser::getXmlLang($this->content);

     $feed->language = $language;
 }
Example #9
0
 /**
  * Prefixed steps of an XPath query must be rewritten into
  * namespace-uri()/local-name() predicates, while "@xml:lang" is left as is.
  */
 public function testReplaceXPathPrefixWithNamespaceURI()
 {
     $loremOnly = array('lorem' => 'https://en.wikipedia.org/wiki/Lorem');
     $loremAndIpsum = array('lorem' => 'https://en.wikipedia.org/wiki/Lorem', 'ipsum' => 'https://en.wikipedia.org/wiki/Ipsum');

     // Each case: namespaces, input query, expected rewritten query.
     $cases = array(
         array(
             $loremOnly,
             '//lorem:title',
             '//*[namespace-uri()="https://en.wikipedia.org/wiki/Lorem" and local-name()="title"]',
         ),
         array(
             $loremAndIpsum,
             '//lorem:title/ipsum:name',
             '//*[namespace-uri()="https://en.wikipedia.org/wiki/Lorem" and local-name()="title"]/*[namespace-uri()="https://en.wikipedia.org/wiki/Ipsum" and local-name()="name"]',
         ),
         array(
             $loremAndIpsum,
             '//lorem:title/ipsum:name/@xml:lang',
             '//*[namespace-uri()="https://en.wikipedia.org/wiki/Lorem" and local-name()="title"]/*[namespace-uri()="https://en.wikipedia.org/wiki/Ipsum" and local-name()="name"]/@xml:lang',
         ),
     );

     foreach ($cases as $case) {
         list($ns, $query, $expected) = $case;
         $this->assertEquals($expected, XmlParser::replaceXPathPrefixWithNamespaceURI($query, $ns));
     }
 }
    /**
     * A document carrying an inline DTD must load as a DOMDocument and
     * validate against that DTD.
     */
    public function testScanXmlWithDTD()
    {
        $source = <<<XML
<?xml version="1.0"?>
<!DOCTYPE results [
<!ELEMENT results (result+)>
<!ELEMENT result (#PCDATA)>
]>
<results>
    <result>test</result>
</results>
XML;
        $document = XmlParser::getDomDocument($source);

        $isDomDocument = $document instanceof DOMDocument;
        $this->assertTrue($isDomDocument);

        $isValid = $document->validate();
        $this->assertTrue($isValid);
    }
Example #11
0
 /**
  * Store the rules and load the HTML into a DOM document with an XPath helper.
  *
  * @param string $html  HTML to load
  * @param array  $rules Rules kept in $this->rules
  */
 public function __construct($html, array $rules)
 {
     $this->rules = $rules;

     // NOTE(review): the prefix is presumably an encoding hint for the HTML
     // parser; it is intentionally not a well-formed XML declaration - confirm.
     $markup = '<?xml version="1.0" encoding="UTF-8">' . $html;

     $this->dom = XmlParser::getHtmlDocument($markup);
     $this->xpath = new DOMXPath($this->dom);
 }
Example #12
0
 /**
  * Collect the href of every icon <link> found in the HTML.
  *
  * Only links whose rel attribute is exactly "icon", "shortcut icon" or
  * "icon shortcut" are considered.
  *
  * @param string $html HTML
  *
  * @return array List of href values (empty when $html is empty)
  */
 public function extract($html)
 {
     if (empty($html)) {
         return array();
     }

     $document = XmlParser::getHtmlDocument($html);
     $finder = new DOMXpath($document);
     $links = $finder->query('//link[@rel="icon" or @rel="shortcut icon" or @rel="icon shortcut"]');

     $icons = array();

     foreach ($links as $link) {
         $icons[] = $link->getAttribute('href');
     }

     return $icons;
 }
Example #13
0
 /**
  * Remove unwanted tags and attributes from the content.
  *
  * When the content cannot be loaded as a DOM document it is returned
  * untouched; otherwise the serialized document element is returned.
  *
  * @param string $content
  *
  * @return string
  */
 public function stripGarbage($content)
 {
     $dom = XmlParser::getDomDocument($content);

     if ($dom === false) {
         return $content;
     }

     $xpath = new DOMXPath($dom);
     $this->stripTags($xpath);
     $this->stripAttributes($dom, $xpath);

     return $dom->saveXML($dom->documentElement);
 }
Example #14
0
 /**
  * Detect the feed format.
  *
  * Each XPath query of $this->formats is run in order against the content;
  * the first one matching exactly one node wins.
  *
  * @access public
  * @param  string    $content     Feed content
  * @return string                 Matching parser name, or '' when none matches
  */
 public function detectFormat($content)
 {
     $document = XmlParser::getHtmlDocument($content);
     $finder = new DOMXPath($document);

     $detected = '';

     foreach ($this->formats as $parser_name => $query) {
         $matches = $finder->query($query);

         if ($matches->length === 1) {
             $detected = $parser_name;
             break;
         }
     }

     return $detected;
 }
Example #15
0
 /**
  * Set the item language from its dc:language element.
  *
  * Falls back to the feed language when XmlParser::getValue() yields a
  * falsy value for the XPath result.
  *
  * @param SimpleXMLElement      $entry Feed item
  * @param \PicoFeed\Parser\Item $item  Item object
  * @param \PicoFeed\Parser\Feed $feed  Feed object
  */
 public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
 {
     $matches = XmlParser::getXPathResult($entry, 'dc:language', $this->namespaces);
     $resolved = XmlParser::getValue($matches);

     if (!$resolved) {
         // Nothing usable on the item: inherit from the feed.
         $resolved = $feed->getLanguage();
     }

     $item->setLanguage($resolved);
 }
Example #16
0
 /**
  * Normalize the encoding of $this->html and strip its head tags.
  *
  * The encoding found in the HTML meta tag takes precedence over
  * $this->encoding when it is truthy. Both encodings are logged afterwards.
  */
 public function prepareHtml()
 {
     $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);

     if ($html_encoding) {
         $source_encoding = $html_encoding;
     } else {
         $source_encoding = $this->encoding;
     }

     $this->html = Encoding::convert($this->html, $source_encoding);
     $this->html = Filter::stripHeadTags($this->html);

     $caller = get_called_class();
     Logger::setMessage($caller . ': HTTP Encoding "' . $this->encoding . '" ; HTML Encoding "' . $html_encoding . '"');
 }
Example #17
0
 /**
  * Remove unwanted tags and marked elements from $this->content.
  *
  * First removes every element named in $this->stripTags, then every
  * element whose class or id contains one of $this->stripAttributes and
  * for which shouldRemove() agrees. The content is left untouched when it
  * cannot be loaded as a DOM document.
  *
  * @access public
  */
 public function stripGarbage()
 {
     $dom = XmlParser::getDomDocument($this->content);

     if ($dom === false) {
         return;
     }

     $xpath = new DOMXPath($dom);
     $caller = get_called_class();

     // Pass 1: drop blacklisted tags wherever they appear.
     foreach ($this->stripTags as $tag) {
         $matches = $xpath->query('//' . $tag);

         if ($matches === false || $matches->length <= 0) {
             continue;
         }

         Logger::setMessage($caller . ': Strip tag: "' . $tag . '"');

         foreach ($matches as $match) {
             $match->parentNode->removeChild($match);
         }
     }

     // Pass 2: drop elements flagged through their class or id.
     foreach ($this->stripAttributes as $attribute) {
         $matches = $xpath->query('//*[contains(@class, "' . $attribute . '") or contains(@id, "' . $attribute . '")]');

         if ($matches === false || $matches->length <= 0) {
             continue;
         }

         Logger::setMessage($caller . ': Strip attribute: "' . $attribute . '"');

         foreach ($matches as $match) {
             if ($this->shouldRemove($dom, $match)) {
                 $match->parentNode->removeChild($match);
             }
         }
     }

     $this->content = $dom->saveXML($dom->documentElement);
 }
Example #18
0
 /**
  * Collect the href of every icon <link> found in the HTML.
  *
  * A link qualifies when its rel attribute contains "icon" and does not
  * contain "apple".
  *
  * @param string $html HTML
  *
  * @return array List of href values (empty when $html is empty)
  */
 public function extract($html)
 {
     if (empty($html)) {
         return array();
     }

     $finder = new DOMXpath(XmlParser::getHtmlDocument($html));
     $links = $finder->query("//link[contains(@rel, 'icon') and not(contains(@rel, 'apple'))]");

     $icons = array();

     foreach ($links as $link) {
         $icons[] = $link->getAttribute('href');
     }

     return $icons;
 }
Example #19
0
 /**
  * Parse the document stored in $this->content and build a Feed object.
  *
  * Feed-level fields are filled first, then each entry returned by
  * getItemsTree() is turned into an Item, filtered, optionally scraped and
  * appended to the feed.
  *
  * @access public
  * @throws MalformedXmlException When the content cannot be parsed
  * @return \PicoFeed\Parser\Feed
  */
 public function execute()
 {
     Logger::setMessage(get_called_class() . ': begin parsing');
     $xml = XmlParser::getSimpleXml($this->content);
     if ($xml === false) {
         // Unparsable content: log the parser errors and abort.
         Logger::setMessage(get_called_class() . ': XML parsing error');
         Logger::setMessage(XmlParser::getErrors());
         throw new MalformedXmlException('XML parsing error');
     }
     // Remember the namespaces declared anywhere in the document (recursive lookup).
     $this->namespaces = $xml->getNamespaces(true);
     $feed = new Feed();
     // Feed-level metadata; each check* call follows the matching find* call.
     $this->findFeedUrl($xml, $feed);
     $this->checkFeedUrl($feed);
     $this->findSiteUrl($xml, $feed);
     $this->checkSiteUrl($feed);
     $this->findFeedTitle($xml, $feed);
     $this->findFeedDescription($xml, $feed);
     $this->findFeedLanguage($xml, $feed);
     $this->findFeedId($xml, $feed);
     $this->findFeedDate($xml, $feed);
     $this->findFeedLogo($xml, $feed);
     $this->findFeedIcon($xml, $feed);
     foreach ($this->getItemsTree($xml) as $entry) {
         $item = new Item();
         $this->findItemAuthor($xml, $entry, $item);
         $this->findItemUrl($entry, $item);
         $this->checkItemUrl($feed, $item);
         $this->findItemTitle($entry, $item);
         $this->findItemContent($entry, $item);
         // Id generation can use the item url/title/content (order is important)
         $this->findItemId($entry, $item, $feed);
         $this->findItemDate($entry, $item);
         $this->findItemEnclosure($entry, $item, $feed);
         $this->findItemLanguage($entry, $item, $feed);
         // Order is important (avoid double filtering)
         $this->filterItemContent($feed, $item);
         $this->scrapWebsite($item);
         $feed->items[] = $item;
     }
     // The feed is concatenated into the log message (string conversion of $feed).
     Logger::setMessage(get_called_class() . PHP_EOL . $feed);
     return $feed;
 }
Example #20
0
 /**
  * Set the feed language from the channel's "language" value.
  *
  * The lookup goes through XmlParser::getNamespaceValue() using the
  * namespaces stored in $this->namespaces.
  *
  * @access public
  * @param  SimpleXMLElement          $xml     Feed xml
  * @param  \PicoFeed\Parser\Feed     $feed    Feed object
  */
 public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
 {
     $channel = $xml->channel;
     $language = XmlParser::getNamespaceValue($channel, $this->namespaces, 'language');

     $feed->language = $language;
 }
Example #21
0
 /**
  * Remove every node matched by the tag blacklist.
  *
  * The entries of $this->tag_blacklist are joined with " | " into a single
  * XPath union expression, and each matching node is detached from its parent.
  *
  * @access public
  * @param  string  $data  Input data
  * @return string         Serialized document, or '' when $data cannot be
  *                        loaded as a DOM document
  */
 public function removeBlacklistedTags($data)
 {
     $dom = XmlParser::getDomDocument($data);

     if ($dom === false) {
         return '';
     }

     // An empty blacklist would produce an empty (invalid) XPath expression,
     // so the query is skipped in that case.
     if (!empty($this->tag_blacklist)) {
         $xpath = new DOMXpath($dom);
         $nodes = $xpath->query(implode(' | ', $this->tag_blacklist));

         // query() returns false for a malformed expression; only iterate
         // over a real node list.
         if ($nodes !== false) {
             foreach ($nodes as $node) {
                 $node->parentNode->removeChild($node);
             }
         }
     }

     return $dom->saveXML();
 }
Example #22
0
 /**
  * Get the entry content.
  *
  * Lookup order: the child elements of the content node serialized as XML,
  * then the content node's own non-blank text, then the summary. Each
  * lookup tries the "atom:"-prefixed query first and the unprefixed one
  * when that result is falsy.
  *
  * @access private
  * @param  SimpleXMLElement   $entry   XML Entry
  * @return string
  */
 private function getContent(SimpleXMLElement $entry)
 {
     $contentNodes = XmlParser::getXPathResult($entry, 'atom:content', $this->namespaces);
     if (!$contentNodes) {
         $contentNodes = XmlParser::getXPathResult($entry, 'content');
     }
     $content = current($contentNodes);

     // Content with child elements: concatenate the serialized children.
     if (!empty($content) && count($content->children())) {
         $markup = '';
         foreach ($content->children() as $child) {
             $markup .= $child->asXML();
         }

         return $markup;
     }

     // Plain-text content: use it unless it is blank.
     $text = (string) $content;
     if (trim($text) !== '') {
         return $text;
     }

     // Last resort: the summary.
     $summaryNodes = XmlParser::getXPathResult($entry, 'atom:summary', $this->namespaces);
     if (!$summaryNodes) {
         $summaryNodes = XmlParser::getXPathResult($entry, 'summary');
     }

     return (string) current($summaryNodes);
 }