/**
 * Set up the filter state for one HTML document.
 *
 * All input data must already be encoded in UTF-8.
 *
 * @param string $html    HTML content, converted with XmlParser::htmlToXml() and kept as the filter input
 * @param string $website Site URL (used to build absolute URL)
 */
public function __construct($html, $website)
{
    $config = new Config();

    $this->config = $config;
    $this->input = XmlParser::htmlToXml($html);
    $this->output = '';

    // The tag helper shares the configuration object created above.
    $this->tag = new Tag($config);
    $this->website = $website;

    // The attribute helper receives the site URL wrapped in a Url object.
    $base_url = new Url($website);
    $this->attribute = new Attribute($base_url);
}
/**
 * Parse the OPML document stored in $this->content.
 *
 * The content is trimmed and parsed as XML. Parsing is rejected when the
 * document cannot be loaded, when the root element is not named "opml", or
 * when it has no body element.
 *
 * @return array|false $this->items after parseEntries() has run, false on rejection
 */
public function execute()
{
    $caller = get_called_class();
    Logger::setMessage($caller . ': start importation');

    $xml = XmlParser::getSimpleXml(trim($this->content));

    // Short-circuits in the same order as the checks are listed above.
    $is_opml = $xml !== false && $xml->getName() === 'opml' && isset($xml->body);

    if (!$is_opml) {
        Logger::setMessage($caller . ': OPML tag not found or malformed XML document');

        return false;
    }

    $this->parseEntries($xml->body);
    Logger::setMessage($caller . ': ' . count($this->items) . ' subscriptions found');

    return $this->items;
}
/**
 * Store the rules and load the HTML into a DOM document with an XPath helper.
 *
 * @param string $html  HTML content
 * @param array  $rules Rules kept on the instance
 */
public function __construct($html, array $rules)
{
    $this->rules = $rules;

    // Prolog prepended to the markup before it is handed to the HTML loader.
    // NOTE(review): presumably there to make the loader treat the markup as
    // UTF-8 — confirm against XmlParser::getHtmlDocument().
    $prolog = '<?xml version="1.0" encoding="UTF-8">';
    $document = XmlParser::getHtmlDocument($prolog . $html);

    $this->dom = $document;
    $this->xpath = new DOMXPath($document);
}
/**
 * Detect the feed format.
 *
 * Each entry of $this->formats maps a parser name to an XPath query. The
 * queries are tried in order against the loaded content, and the first one
 * matching exactly one node wins.
 *
 * @param string $content Feed content
 *
 * @return string Matching parser name, or an empty string when no query matches
 */
public function detectFormat($content)
{
    $document = XmlParser::getHtmlDocument($content);
    $finder = new DOMXPath($document);

    foreach ($this->formats as $name => $expression) {
        $matches = $finder->query($expression);

        if ($matches->length === 1) {
            return $name;
        }
    }

    return '';
}
/**
 * Remove every node matched by the tag blacklist.
 *
 * The entries of $this->tag_blacklist are joined into a single XPath union
 * query; each matching node is detached from its parent.
 *
 * @param string $data Input data
 *
 * @return string Serialized document, or an empty string when the input cannot be loaded
 */
public function removeBlacklistedTags($data)
{
    $document = XmlParser::getDomDocument($data);

    if ($document === false) {
        return '';
    }

    $finder = new DOMXpath($document);
    $union_query = implode(' | ', $this->tag_blacklist);

    foreach ($finder->query($union_query) as $blacklisted) {
        $blacklisted->parentNode->removeChild($blacklisted);
    }

    return $document->saveXML();
}
/**
 * Parse the document and build the feed object.
 *
 * The content is parsed as XML. When that fails, it is normalized with
 * Filter::normalizeData() and parsed a second time before giving up.
 *
 * @throws MalformedXmlException When the content still cannot be parsed after normalization
 *
 * @return \AsteFeed\Parser\Feed
 */
public function execute()
{
    $caller = get_called_class();
    Logger::setMessage($caller . ': begin parsing');

    $xml = XmlParser::getSimpleXml($this->content);

    // First failure: normalize the content and retry once.
    if ($xml === false) {
        Logger::setMessage($caller . ': Applying XML workarounds');
        $this->content = Filter::normalizeData($this->content);
        $xml = XmlParser::getSimpleXml($this->content);
    }

    // Still failing after the retry: log the parser errors and abort.
    if ($xml === false) {
        Logger::setMessage($caller . ': XML parsing error');
        Logger::setMessage(XmlParser::getErrors());

        throw new MalformedXmlException('XML parsing error');
    }

    $this->used_namespaces = $xml->getNamespaces(true);
    $xml = $this->registerSupportedNamespaces($xml);

    $feed = new Feed();

    // Feed-level properties; each check runs right after its matching finder.
    $this->findFeedUrl($xml, $feed);
    $this->checkFeedUrl($feed);
    $this->findSiteUrl($xml, $feed);
    $this->checkSiteUrl($feed);
    $this->findFeedTitle($xml, $feed);
    $this->findFeedDescription($xml, $feed);
    $this->findFeedLanguage($xml, $feed);
    $this->findFeedId($xml, $feed);
    $this->findFeedDate($xml, $feed);
    $this->findFeedLogo($xml, $feed);
    $this->findFeedIcon($xml, $feed);

    foreach ($this->getItemsTree($xml) as $node) {
        $node = $this->registerSupportedNamespaces($node);

        $item = new Item();
        $item->xml = $node;
        $item->namespaces = $this->used_namespaces;

        $this->findItemAuthor($xml, $node, $item);
        $this->findItemUrl($node, $item);
        $this->checkItemUrl($feed, $item);
        $this->findItemTitle($node, $item);
        $this->findItemContent($node, $item);

        // Id generation can use the item url/title/content (order is important)
        $this->findItemId($node, $item, $feed);
        $this->findItemDate($node, $item, $feed);
        $this->findItemEnclosure($node, $item, $feed);
        $this->findItemLanguage($node, $item, $feed);
        $this->findItemMedia($node, $item, $feed);

        // Order is important (avoid double filtering)
        $this->filterItemContent($feed, $item);
        $this->scrapWebsite($item);

        $feed->items[] = $item;
    }

    Logger::setMessage($caller . PHP_EOL . $feed);

    return $feed;
}
/**
 * Get the entry content.
 *
 * Lookup order: the first "atom:content" (or un-prefixed "content") element,
 * then the first "atom:summary" (or un-prefixed "summary") element. When the
 * content element has child elements, their XML serializations are
 * concatenated; otherwise its text is used when it is not blank.
 *
 * @param SimpleXMLElement $entry XML Entry
 *
 * @return string
 */
private function getContent(SimpleXMLElement $entry)
{
    $matches = XmlParser::getXPathResult($entry, 'atom:content', $this->namespaces);

    if (!$matches) {
        $matches = XmlParser::getXPathResult($entry, 'content');
    }

    $content = current($matches);

    // Content made of child elements: return their markup.
    if (!empty($content) && count($content->children())) {
        $fragments = array();

        foreach ($content->children() as $child) {
            $fragments[] = $child->asXML();
        }

        return implode('', $fragments);
    }

    // Plain text content, ignored when it only contains whitespace.
    $text = (string) $content;

    if (trim($text) !== '') {
        return $text;
    }

    // Nothing usable in the content element: fall back to the summary.
    $summary = XmlParser::getXPathResult($entry, 'atom:summary', $this->namespaces);

    if (!$summary) {
        $summary = XmlParser::getXPathResult($entry, 'summary');
    }

    return (string) current($summary);
}
/**
 * Find the item language.
 *
 * Uses the first "dc:language" element of the entry, and falls back to the
 * feed language when that value is missing or converts to a falsy string.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \AsteFeed\Parser\Item $item  Item object
 * @param \AsteFeed\Parser\Feed $feed  Feed object
 */
public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $matches = XmlParser::getXPathResult($entry, 'dc:language', $this->namespaces);
    $declared = (string) current($matches);

    if ($declared) {
        $item->language = $declared;
    } else {
        $item->language = $feed->language;
    }
}
/**
 * Normalize encoding and strip head tag.
 *
 * The encoding read from the HTML meta tag takes precedence over
 * $this->encoding when converting. Both values are logged afterwards.
 */
public function prepareHtml()
{
    $meta_encoding = XmlParser::getEncodingFromMetaTag($this->html);
    $source_encoding = $meta_encoding ? $meta_encoding : $this->encoding;

    $this->html = Encoding::convert($this->html, $source_encoding);
    $this->html = Filter::stripHeadTags($this->html);

    $message = get_called_class() . ': HTTP Encoding "' . $this->encoding . '"';
    $message .= ' ; HTML Encoding "' . $meta_encoding . '"';

    Logger::setMessage($message);
}
/**
 * Extract the icon links from the HTML.
 *
 * Collects the "href" attribute of every link element whose "rel" attribute
 * contains "icon" but not "apple".
 *
 * @param string $html HTML
 *
 * @return array List of href values, empty when $html is empty
 */
public function extract($html)
{
    if (empty($html)) {
        return array();
    }

    $document = XmlParser::getHtmlDocument($html);
    $finder = new DOMXpath($document);
    $links = $finder->query("//link[contains(@rel, 'icon') and not(contains(@rel, 'apple'))]");

    $icons = array();

    foreach ($links as $link) {
        $icons[] = $link->getAttribute('href');
    }

    return $icons;
}
/**
 * Find the item media.
 *
 * Every element matched by the "media:*" query on the entry is turned into
 * an array of its attributes (as strings). When the element has text
 * content, that text is stored under the "content" key. "thumbnail"
 * elements are appended to $item->media->thumbnails; any other element is
 * stored on $item->media under a property named after the element, so a
 * later element with the same name overwrites an earlier one.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \AsteFeed\Parser\Item $item  Item object
 * @param \AsteFeed\Parser\Feed $feed  Feed object (not read by this method)
 */
public function findItemMedia(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $item->media = new Media();

    // NOTE(review): getXPathResult() is assumed to possibly yield a non-array
    // value when the query fails; the fallback keeps the loop below safe and
    // changes nothing for array results — confirm against XmlParser.
    $mediaTags = XmlParser::getXPathResult($entry, 'media:*', $this->namespaces) ?: [];

    foreach ($mediaTags as $mediaTag) {
        $name = $mediaTag->getName();
        $array = [];

        foreach ($mediaTag->attributes() as $key => $value) {
            $array[$key] = (string) $value;
        }

        // Keep the element text. The previous code only evaluated
        // $array['content'] without assigning it, so the text was lost.
        $text = (string) $mediaTag;

        if ($text !== '') {
            $array['content'] = $text;
        }

        if ($name === 'thumbnail') {
            $item->media->thumbnails[] = $array;
        } else {
            $item->media->{$name} = $array;
        }
    }
}
/**
 * Strip useless tags.
 *
 * Loads the content into a DOM document, runs stripTags() and
 * stripAttributes() on it, then serializes the document element.
 *
 * @param string $content
 *
 * @return string Cleaned markup, or the input unchanged when it cannot be loaded
 */
public function stripGarbage($content)
{
    $document = XmlParser::getDomDocument($content);

    if ($document === false) {
        return $content;
    }

    $finder = new DOMXPath($document);

    $this->stripTags($finder);
    $this->stripAttributes($document, $finder);

    return $document->saveXML($document->documentElement);
}