/**
 * Set up the filter state. All input data must already be encoded in UTF-8.
 *
 * The HTML is converted with XmlParser::HtmlToXml() before being stored as
 * the filter input, and the output buffer starts empty.
 *
 * @access public
 * @param  string  $html      HTML content
 * @param  string  $website   Site URL (used to build absolute URL)
 */
public function __construct($html, $website)
{
    $this->input = XmlParser::HtmlToXml($html);
    $this->output = '';
    $this->tag = new Tag();

    // The attribute helper receives the site URL wrapped in a Url object
    $siteUrl = new Url($website);
    $this->attribute = new Attribute($siteUrl);
}
/**
 * XmlParser::getNamespaceValue() must return the "creator" value of the
 * first two items of the rue89 fixture.
 */
public function testNamespaceValue()
{
    $xml = XmlParser::getSimpleXml(file_get_contents('tests/fixtures/rue89.xml'));
    $this->assertNotFalse($xml);

    // Collect every namespace declared anywhere in the document
    $namespaces = $xml->getNamespaces(true);

    $this->assertEquals(
        'Blandine Grosjean',
        XmlParser::getNamespaceValue($xml->channel->item[0], $namespaces, 'creator')
    );
    $this->assertEquals(
        'Pierre-Carl Langlais',
        XmlParser::getNamespaceValue($xml->channel->item[1], $namespaces, 'creator')
    );
}
/**
 * Parse the OPML data into a subscription list.
 *
 * The document must be parseable and expose both a head and a body element;
 * the title is read from the head and the entries from the body.
 *
 * @access public
 * @throws MalformedXmlException  When the XML is invalid or head/body is missing
 * @return SubscriptionList
 */
public function parse()
{
    $document = XmlParser::getSimpleXml($this->data);
    $isUsable = $document && isset($document->head, $document->body);

    if (! $isUsable) {
        throw new MalformedXmlException('Unable to parse OPML file: invalid XML');
    }

    $this->parseTitle($document->head);
    $this->parseEntries($document->body);

    return $this->subscriptionList;
}
/**
 * Import the subscriptions found in the OPML content.
 *
 * The trimmed content must parse as XML, have an "opml" root element and
 * contain a body element; otherwise the failure is logged and false is returned.
 *
 * @return array|false  Parsed items, or false when the document is not usable
 */
public function execute()
{
    $caller = get_called_class();
    Logger::setMessage($caller . ': start importation');

    $document = XmlParser::getSimpleXml(trim($this->content));
    $isOpml = $document !== false
        && $document->getName() === 'opml'
        && isset($document->body);

    if (! $isOpml) {
        Logger::setMessage($caller . ': OPML tag not found or malformed XML document');

        return false;
    }

    $this->parseEntries($document->body);
    Logger::setMessage($caller . ': ' . count($this->items) . ' subscriptions found');

    return $this->items;
}
/**
 * Find the item language.
 *
 * Reads the first "dc:language" node of the entry and falls back to the
 * feed language when the entry does not provide a non-empty value.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 * @param \PicoFeed\Parser\Feed $feed  Feed object
 */
public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $language = XmlParser::getXPathResult($entry, 'dc:language', $this->namespaces);

    // current() only accepts arrays: a failed XPath query must not reach it
    $value = is_array($language) ? (string) current($language) : '';

    $item->language = $value ?: $feed->language;
}
/**
 * Find the item enclosure.
 *
 * The "origEnclosureLink" namespaced value is preferred as the URL; when it is
 * empty the "url" attribute of the enclosure element is used instead. The final
 * URL is resolved against the feed site URL.
 *
 * @access public
 * @param  SimpleXMLElement       $entry   Feed item
 * @param  \PicoFeed\Parser\Item  $item    Item object
 * @param  \PicoFeed\Parser\Feed  $feed    Feed object
 */
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    // Nothing to do for entries without an enclosure element
    if (! isset($entry->enclosure)) {
        return;
    }

    $item->enclosure_url = XmlParser::getNamespaceValue($entry->enclosure, $this->namespaces, 'origEnclosureLink');

    if (empty($item->enclosure_url)) {
        if (isset($entry->enclosure['url'])) {
            $item->enclosure_url = (string) $entry->enclosure['url'];
        } else {
            $item->enclosure_url = '';
        }
    }

    if (isset($entry->enclosure['type'])) {
        $item->enclosure_type = (string) $entry->enclosure['type'];
    } else {
        $item->enclosure_type = '';
    }

    $item->enclosure_url = Url::resolve($item->enclosure_url, $feed->getSiteUrl());
}
/**
 * Parse the feed content and build the Feed object with all its items.
 *
 * When the first XML parsing attempt fails, the content is normalized with
 * Filter::normalizeData() and parsed a second time before giving up.
 *
 * @throws MalformedXmlException  When the content cannot be parsed, even after normalization
 * @return \PicoFeed\Parser\Feed
 */
public function execute()
{
    $caller = get_called_class();
    Logger::setMessage($caller . ': begin parsing');

    $xml = XmlParser::getSimpleXml($this->content);

    // First attempt failed: normalize the content and retry once
    if ($xml === false) {
        Logger::setMessage($caller . ': Applying XML workarounds');
        $this->content = Filter::normalizeData($this->content);
        $xml = XmlParser::getSimpleXml($this->content);
    }

    if ($xml === false) {
        Logger::setMessage($caller . ': XML parsing error');
        Logger::setMessage(XmlParser::getErrors());
        throw new MalformedXmlException('XML parsing error');
    }

    $this->used_namespaces = $xml->getNamespaces(true);
    $xml = $this->registerSupportedNamespaces($xml);

    $feed = new Feed();

    // Feed-level properties
    $this->findFeedUrl($xml, $feed);
    $this->checkFeedUrl($feed);

    $this->findSiteUrl($xml, $feed);
    $this->checkSiteUrl($feed);

    $this->findFeedTitle($xml, $feed);
    $this->findFeedDescription($xml, $feed);
    $this->findFeedLanguage($xml, $feed);
    $this->findFeedId($xml, $feed);
    $this->findFeedDate($xml, $feed);
    $this->findFeedLogo($xml, $feed);
    $this->findFeedIcon($xml, $feed);

    foreach ($this->getItemsTree($xml) as $node) {
        $entry = $this->registerSupportedNamespaces($node);

        $item = new Item();
        $item->xml = $entry;
        $item->namespaces = $this->used_namespaces;

        $this->findItemAuthor($xml, $entry, $item);

        $this->findItemUrl($entry, $item);
        $this->checkItemUrl($feed, $item);

        $this->findItemTitle($entry, $item);
        $this->findItemContent($entry, $item);

        // Id generation can use the item url/title/content (order is important)
        $this->findItemId($entry, $item, $feed);
        $this->findItemDate($entry, $item, $feed);
        $this->findItemEnclosure($entry, $item, $feed);
        $this->findItemLanguage($entry, $item, $feed);

        $this->itemPostProcessor->execute($feed, $item);

        $feed->items[] = $item;
    }

    Logger::setMessage($caller . PHP_EOL . $feed);

    return $feed;
}
/**
 * Find the feed language.
 *
 * The value comes from XmlParser::getXmlLang() applied to the raw feed
 * content; the $xml argument is not used by this implementation.
 *
 * @access public
 * @param  SimpleXMLElement       $xml    Feed xml
 * @param  \PicoFeed\Parser\Feed  $feed   Feed object
 */
public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
{
    $language = XmlParser::getXmlLang($this->content);
    $feed->language = $language;
}
/**
 * Prefixed XPath steps must be rewritten into namespace-uri()/local-name()
 * predicates, while the "xml:" attribute prefix is left untouched.
 */
public function testReplaceXPathPrefixWithNamespaceURI()
{
    $loremOnly = array('lorem' => 'https://en.wikipedia.org/wiki/Lorem');
    $loremAndIpsum = array('lorem' => 'https://en.wikipedia.org/wiki/Lorem', 'ipsum' => 'https://en.wikipedia.org/wiki/Ipsum');

    // Each case: namespaces, input query, expected rewritten query
    $cases = array(
        array(
            $loremOnly,
            '//lorem:title',
            '//*[namespace-uri()="https://en.wikipedia.org/wiki/Lorem" and local-name()="title"]',
        ),
        array(
            $loremAndIpsum,
            '//lorem:title/ipsum:name',
            '//*[namespace-uri()="https://en.wikipedia.org/wiki/Lorem" and local-name()="title"]/*[namespace-uri()="https://en.wikipedia.org/wiki/Ipsum" and local-name()="name"]',
        ),
        array(
            $loremAndIpsum,
            '//lorem:title/ipsum:name/@xml:lang',
            '//*[namespace-uri()="https://en.wikipedia.org/wiki/Lorem" and local-name()="title"]/*[namespace-uri()="https://en.wikipedia.org/wiki/Ipsum" and local-name()="name"]/@xml:lang',
        ),
    );

    foreach ($cases as $case) {
        list($ns, $query, $expected) = $case;
        $this->assertEquals($expected, XmlParser::replaceXPathPrefixWithNamespaceURI($query, $ns));
    }
}
/**
 * A document carrying an inline DTD must load as a DOMDocument and
 * validate against that DTD.
 */
public function testScanXmlWithDTD()
{
    $xml = <<<XML
<?xml version="1.0"?>
<!DOCTYPE results [
<!ELEMENT results (result+)>
<!ELEMENT result (#PCDATA)>
]>
<results>
<result>test</result>
</results>
XML;

    $result = XmlParser::getDomDocument($xml);

    // assertInstanceOf reports the actual type on failure, unlike assertTrue
    $this->assertInstanceOf('DOMDocument', $result);
    $this->assertTrue($result->validate());
}
/**
 * Store the rules and load the HTML into a DOM document with its XPath helper.
 *
 * @param string $html   HTML to load
 * @param array  $rules  Rules kept for later use
 */
public function __construct($html, array $rules)
{
    $this->rules = $rules;

    // NOTE(review): the XML declaration prefix presumably hints the HTML
    // loader to treat the markup as UTF-8 — confirm against XmlParser
    $markup = '<?xml version="1.0" encoding="UTF-8">' . $html;

    $this->dom = XmlParser::getHtmlDocument($markup);
    $this->xpath = new DOMXPath($this->dom);
}
/**
 * Collect the href of every icon link element found in the HTML.
 *
 * Only link elements whose rel attribute is exactly "icon", "shortcut icon"
 * or "icon shortcut" are considered.
 *
 * @param string $html HTML
 *
 * @return array  List of href attribute values (empty when $html is empty)
 */
public function extract($html)
{
    $icons = array();

    if (empty($html)) {
        return $icons;
    }

    $xpath = new DOMXpath(XmlParser::getHtmlDocument($html));
    $links = $xpath->query('//link[@rel="icon" or @rel="shortcut icon" or @rel="icon shortcut"]');

    foreach ($links as $link) {
        $icons[] = $link->getAttribute('href');
    }

    return $icons;
}
/**
 * Strip useless tags and attributes from the content.
 *
 * When the content cannot be loaded as a DOM document it is returned unchanged.
 *
 * @param string $content
 *
 * @return string  Serialized document element after stripping, or the original content
 */
public function stripGarbage($content)
{
    $dom = XmlParser::getDomDocument($content);

    if ($dom === false) {
        return $content;
    }

    $xpath = new DOMXPath($dom);

    $this->stripTags($xpath);
    $this->stripAttributes($dom, $xpath);

    return $dom->saveXML($dom->documentElement);
}
/**
 * Detect the feed format.
 *
 * Each known format is tested in declaration order with its XPath query; the
 * first one matching exactly one node wins.
 *
 * @access public
 * @param  string   $content   Feed content
 * @return string              Parser name, or an empty string when nothing matches
 */
public function detectFormat($content)
{
    $xpath = new DOMXPath(XmlParser::getHtmlDocument($content));

    foreach ($this->formats as $parser_name => $query) {
        $matches = $xpath->query($query);

        if ($matches->length !== 1) {
            continue;
        }

        return $parser_name;
    }

    return '';
}
/**
 * Find the item language.
 *
 * Uses the value of the entry's "dc:language" node, falling back to the
 * feed language when that value is empty.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 * @param \PicoFeed\Parser\Feed $feed  Feed object
 */
public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $nodes = XmlParser::getXPathResult($entry, 'dc:language', $this->namespaces);
    $language = XmlParser::getValue($nodes);

    if (! $language) {
        $language = $feed->getLanguage();
    }

    $item->setLanguage($language);
}
/**
 * Normalize the HTML encoding and strip the head tags.
 *
 * The encoding declared in the HTML meta tag takes precedence over the
 * HTTP encoding when converting the document.
 */
public function prepareHtml()
{
    $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);
    $source_encoding = $html_encoding ?: $this->encoding;

    $this->html = Encoding::convert($this->html, $source_encoding);
    $this->html = Filter::stripHeadTags($this->html);

    $message = get_called_class() . ': HTTP Encoding "' . $this->encoding . '" ; HTML Encoding "' . $html_encoding . '"';
    Logger::setMessage($message);
}
/**
 * Strip useless tags and blacklisted class/id blocks from the content.
 *
 * Nothing happens when the content cannot be loaded as a DOM document.
 * Otherwise the content is replaced by the serialized document element.
 *
 * @access public
 */
public function stripGarbage()
{
    $dom = XmlParser::getDomDocument($this->content);

    if ($dom === false) {
        return;
    }

    $xpath = new DOMXPath($dom);
    $caller = get_called_class();

    // Pass 1: remove every element of the listed tag names
    foreach ($this->stripTags as $tag) {
        $matches = $xpath->query('//' . $tag);

        if ($matches === false || $matches->length <= 0) {
            continue;
        }

        Logger::setMessage($caller . ': Strip tag: "' . $tag . '"');

        foreach ($matches as $element) {
            $element->parentNode->removeChild($element);
        }
    }

    // Pass 2: remove elements whose class or id contains a listed value,
    // but only when shouldRemove() agrees
    foreach ($this->stripAttributes as $attribute) {
        $matches = $xpath->query('//*[contains(@class, "' . $attribute . '") or contains(@id, "' . $attribute . '")]');

        if ($matches === false || $matches->length <= 0) {
            continue;
        }

        Logger::setMessage($caller . ': Strip attribute: "' . $attribute . '"');

        foreach ($matches as $element) {
            if ($this->shouldRemove($dom, $element)) {
                $element->parentNode->removeChild($element);
            }
        }
    }

    $this->content = $dom->saveXML($dom->documentElement);
}
/**
 * Collect the href of every icon link element found in the HTML.
 *
 * A link qualifies when its rel attribute contains "icon" and does not
 * contain "apple".
 *
 * @param string $html
 *            HTML
 *
 * @return array  List of href attribute values (empty when $html is empty)
 */
public function extract($html)
{
    $icons = array();

    if (empty($html)) {
        return $icons;
    }

    $xpath = new DOMXpath(XmlParser::getHtmlDocument($html));
    $links = $xpath->query("//link[contains(@rel, 'icon') and not(contains(@rel, 'apple'))]");

    foreach ($links as $link) {
        $icons[] = $link->getAttribute('href');
    }

    return $icons;
}
/**
 * Parse the feed content and build the Feed object with all its items.
 *
 * @access public
 * @throws MalformedXmlException  When the content cannot be parsed as XML
 * @return \PicoFeed\Parser\Feed
 */
public function execute()
{
    $caller = get_called_class();
    Logger::setMessage($caller . ': begin parsing');

    $document = XmlParser::getSimpleXml($this->content);

    if ($document === false) {
        Logger::setMessage($caller . ': XML parsing error');
        Logger::setMessage(XmlParser::getErrors());
        throw new MalformedXmlException('XML parsing error');
    }

    $this->namespaces = $document->getNamespaces(true);

    $feed = new Feed();

    // Feed-level properties
    $this->findFeedUrl($document, $feed);
    $this->checkFeedUrl($feed);

    $this->findSiteUrl($document, $feed);
    $this->checkSiteUrl($feed);

    $this->findFeedTitle($document, $feed);
    $this->findFeedDescription($document, $feed);
    $this->findFeedLanguage($document, $feed);
    $this->findFeedId($document, $feed);
    $this->findFeedDate($document, $feed);
    $this->findFeedLogo($document, $feed);
    $this->findFeedIcon($document, $feed);

    foreach ($this->getItemsTree($document) as $node) {
        $item = new Item();

        $this->findItemAuthor($document, $node, $item);

        $this->findItemUrl($node, $item);
        $this->checkItemUrl($feed, $item);

        $this->findItemTitle($node, $item);
        $this->findItemContent($node, $item);

        // Id generation can use the item url/title/content (order is important)
        $this->findItemId($node, $item, $feed);

        $this->findItemDate($node, $item);
        $this->findItemEnclosure($node, $item, $feed);
        $this->findItemLanguage($node, $item, $feed);

        // Order is important (avoid double filtering)
        $this->filterItemContent($feed, $item);
        $this->scrapWebsite($item);

        $feed->items[] = $item;
    }

    Logger::setMessage($caller . PHP_EOL . $feed);

    return $feed;
}
/**
 * Find the feed language.
 *
 * The value is the namespaced "language" element looked up on the channel.
 *
 * @access public
 * @param  SimpleXMLElement       $xml    Feed xml
 * @param  \PicoFeed\Parser\Feed  $feed   Feed object
 */
public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
{
    $channel = $xml->channel;
    $language = XmlParser::getNamespaceValue($channel, $this->namespaces, 'language');

    $feed->language = $language;
}
/**
 * Remove every node matched by the tag blacklist.
 *
 * The blacklist entries are joined with the XPath union operator and run as
 * a single query; each matching node is detached from its parent.
 *
 * @access public
 * @param  string  $data  Input data
 * @return string         Serialized document, or an empty string when $data cannot be loaded
 */
public function removeBlacklistedTags($data)
{
    $dom = XmlParser::getDomDocument($data);

    if ($dom === false) {
        return '';
    }

    // An empty blacklist would produce an empty, invalid XPath expression
    if (! empty($this->tag_blacklist)) {
        $xpath = new DOMXpath($dom);
        $nodes = $xpath->query(implode(' | ', $this->tag_blacklist));

        // query() returns false on a malformed expression: nothing to remove then
        if ($nodes !== false) {
            foreach ($nodes as $node) {
                $node->parentNode->removeChild($node);
            }
        }
    }

    return $dom->saveXML();
}
/**
 * Get the entry content.
 *
 * Lookup order: the child elements of the content node serialized as XML,
 * then the content node text when it is not blank, then the summary node.
 * Each node is searched with the "atom:" prefix first and unprefixed second.
 *
 * @access private
 * @param  SimpleXMLElement   $entry   XML Entry
 * @return string
 */
private function getContent(SimpleXMLElement $entry)
{
    $contentNodes = XmlParser::getXPathResult($entry, 'atom:content', $this->namespaces)
        ?: XmlParser::getXPathResult($entry, 'content');
    $content = current($contentNodes);

    // Content holding child elements: return their markup concatenated
    if (! empty($content) && count($content->children())) {
        $fragments = array();

        foreach ($content->children() as $child) {
            $fragments[] = $child->asXML();
        }

        return implode('', $fragments);
    }

    $text = (string) $content;

    if (trim($text) !== '') {
        return $text;
    }

    // No usable content: fall back to the summary
    $summaryNodes = XmlParser::getXPathResult($entry, 'atom:summary', $this->namespaces)
        ?: XmlParser::getXPathResult($entry, 'summary');

    return (string) current($summaryNodes);
}