Пример #1
0
 /**
  * Set up the filter state for one HTML document.
  *
  * The markup is passed through XmlParser::htmlToXml() and kept as the input,
  * the output buffer starts empty. All input data must already be UTF-8 encoded.
  *
  * @param string $html    HTML content
  * @param string $website Site URL (used to build absolute URL)
  */
 public function __construct($html, $website)
 {
     // Plain values first.
     $this->output = '';
     $this->website = $website;

     // The tag helper shares the configuration object stored on the filter.
     $settings = new Config();
     $this->config = $settings;
     $this->input = XmlParser::htmlToXml($html);
     $this->tag = new Tag($settings);

     // The attribute helper receives a Url object built from the site URL.
     $site_url = new Url($website);
     $this->attribute = new Attribute($site_url);
 }
Пример #2
0
 /**
  * Parse the OPML document stored in $this->content.
  *
  * The trimmed content is loaded with XmlParser::getSimpleXml(); the root element
  * must be named "opml" and expose a "body" child, which is handed to parseEntries().
  *
  * @return array|false $this->items on success, false when the document is rejected
  */
 public function execute()
 {
     $caller = get_called_class();
     Logger::setMessage($caller . ': start importation');

     $document = XmlParser::getSimpleXml(trim($this->content));

     // Checked left to right: the element is only dereferenced once parsing is known to have succeeded.
     $is_opml = $document !== false && $document->getName() === 'opml' && isset($document->body);

     if (!$is_opml) {
         Logger::setMessage($caller . ': OPML tag not found or malformed XML document');

         return false;
     }

     $this->parseEntries($document->body);
     Logger::setMessage($caller . ': ' . count($this->items) . ' subscriptions found');

     return $this->items;
 }
Пример #3
0
 /**
  * Store the rules and prepare a DOM document plus an XPath helper for the markup.
  *
  * @param string $html  Markup to load
  * @param array  $rules Rules kept on the instance for later use
  */
 public function __construct($html, array $rules)
 {
     // An XML declaration is prepended before parsing.
     // NOTE(review): presumably this makes the HTML parser treat the content as UTF-8 - confirm against XmlParser.
     $document = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">' . $html);

     $this->rules = $rules;
     $this->dom = $document;
     $this->xpath = new DOMXPath($document);
 }
Пример #4
0
 /**
  * Detect the feed format.
  *
  * Each XPath query registered in $this->formats is run against the content;
  * the first one that matches exactly one node decides the format.
  *
  * @param string $content Feed content
  *
  * @return string Key of the matching entry in $this->formats, or an empty string when none matches
  */
 public function detectFormat($content)
 {
     $finder = new DOMXPath(XmlParser::getHtmlDocument($content));

     foreach ($this->formats as $parser_name => $query) {
         $matches = $finder->query($query);

         if ($matches->length !== 1) {
             continue;
         }

         return $parser_name;
     }

     return '';
 }
Пример #5
0
 /**
  * Remove every node matched by the blacklist.
  *
  * The entries of $this->tag_blacklist are joined with " | " into a single
  * XPath expression; each matching node is detached from its parent.
  *
  * @param string $data Input data
  *
  * @return string Serialized document, or an empty string when the input cannot be loaded
  */
 public function removeBlacklistedTags($data)
 {
     $document = XmlParser::getDomDocument($data);

     if ($document !== false) {
         $expression = implode(' | ', $this->tag_blacklist);
         $finder = new DOMXPath($document);

         foreach ($finder->query($expression) as $unwanted) {
             $unwanted->parentNode->removeChild($unwanted);
         }

         return $document->saveXML();
     }

     return '';
 }
Пример #6
0
 /**
  * Parse the document held in $this->content and build a Feed object.
  *
  * The content is parsed as-is first; when that fails it is passed through
  * Filter::normalizeData() and parsed a second time before giving up.
  * Feed-level fields are resolved first, then every entry returned by
  * getItemsTree() is turned into an Item and appended to the feed.
  *
  * @return \AsteFeed\Parser\Feed
  *
  * @throws MalformedXmlException When the content still cannot be parsed after normalization
  */
 public function execute()
 {
     Logger::setMessage(get_called_class() . ': begin parsing');
     $xml = XmlParser::getSimpleXml($this->content);
     if ($xml === false) {
         // First attempt failed: normalize the raw content (kept on the instance) and retry once.
         Logger::setMessage(get_called_class() . ': Applying XML workarounds');
         $this->content = Filter::normalizeData($this->content);
         $xml = XmlParser::getSimpleXml($this->content);
         if ($xml === false) {
             // Still unparsable: log the parser errors and abort.
             Logger::setMessage(get_called_class() . ': XML parsing error');
             Logger::setMessage(XmlParser::getErrors());
             throw new MalformedXmlException('XML parsing error');
         }
     }
     // Remember the namespaces declared in the document; they are copied onto every item below.
     $this->used_namespaces = $xml->getNamespaces(true);
     $xml = $this->registerSupportedNamespaces($xml);
     $feed = new Feed();
     // Feed-level metadata. Each URL lookup is immediately followed by its check.
     $this->findFeedUrl($xml, $feed);
     $this->checkFeedUrl($feed);
     $this->findSiteUrl($xml, $feed);
     $this->checkSiteUrl($feed);
     $this->findFeedTitle($xml, $feed);
     $this->findFeedDescription($xml, $feed);
     $this->findFeedLanguage($xml, $feed);
     $this->findFeedId($xml, $feed);
     $this->findFeedDate($xml, $feed);
     $this->findFeedLogo($xml, $feed);
     $this->findFeedIcon($xml, $feed);
     foreach ($this->getItemsTree($xml) as $entry) {
         $entry = $this->registerSupportedNamespaces($entry);
         $item = new Item();
         // Keep the raw entry and the document namespaces available on the item.
         $item->xml = $entry;
         $item->namespaces = $this->used_namespaces;
         // The author lookup receives the whole document in addition to the entry.
         $this->findItemAuthor($xml, $entry, $item);
         $this->findItemUrl($entry, $item);
         $this->checkItemUrl($feed, $item);
         $this->findItemTitle($entry, $item);
         $this->findItemContent($entry, $item);
         // Id generation can use the item url/title/content (order is important)
         $this->findItemId($entry, $item, $feed);
         $this->findItemDate($entry, $item, $feed);
         $this->findItemEnclosure($entry, $item, $feed);
         $this->findItemLanguage($entry, $item, $feed);
         $this->findItemMedia($entry, $item, $feed);
         // Order is important (avoid double filtering)
         $this->filterItemContent($feed, $item);
         $this->scrapWebsite($item);
         $feed->items[] = $item;
     }
     // The feed is concatenated into the log message, so it is converted to a string here.
     Logger::setMessage(get_called_class() . PHP_EOL . $feed);
     return $feed;
 }
Пример #7
0
 /**
  * Get the entry content.
  *
  * Looks for a namespaced "atom:content" element, then a plain "content" element.
  * Child elements are returned as concatenated XML, otherwise non-blank text is
  * returned as-is. When neither applies, the first "atom:summary" / "summary"
  * match is returned as a string.
  *
  * @param SimpleXMLElement $entry XML Entry
  *
  * @return string
  */
 private function getContent(SimpleXMLElement $entry)
 {
     $matches = XmlParser::getXPathResult($entry, 'atom:content', $this->namespaces);
     if (!$matches) {
         $matches = XmlParser::getXPathResult($entry, 'content');
     }
     $content = current($matches);

     // Content carrying child elements: serialize each child in document order.
     if (!empty($content) && count($content->children())) {
         $fragments = [];
         foreach ($content->children() as $child) {
             $fragments[] = $child->asXML();
         }

         return implode('', $fragments);
     }

     // Text-only content.
     if (trim((string) $content) !== '') {
         return (string) $content;
     }

     // Fall back to the summary element.
     $summary = XmlParser::getXPathResult($entry, 'atom:summary', $this->namespaces);
     if (!$summary) {
         $summary = XmlParser::getXPathResult($entry, 'summary');
     }

     return (string) current($summary);
 }
Пример #8
0
 /**
  * Find the item language.
  *
  * Uses the first "dc:language" match of the entry; when that value is falsy
  * the feed language is used instead.
  *
  * @param SimpleXMLElement      $entry Feed item
  * @param \AsteFeed\Parser\Item $item  Item object
  * @param \AsteFeed\Parser\Feed $feed  Feed object
  */
 public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
 {
     $matches = XmlParser::getXPathResult($entry, 'dc:language', $this->namespaces);
     $declared = (string) current($matches);

     if ($declared) {
         $item->language = $declared;
     } else {
         $item->language = $feed->language;
     }
 }
Пример #9
0
 /**
  * Convert the stored HTML and strip its head tags.
  *
  * The encoding found in the HTML meta tag is passed to Encoding::convert();
  * when none is found, $this->encoding is passed instead. Both values are logged.
  */
 public function prepareHtml()
 {
     $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);

     if ($html_encoding) {
         $this->html = Encoding::convert($this->html, $html_encoding);
     } else {
         $this->html = Encoding::convert($this->html, $this->encoding);
     }

     $this->html = Filter::stripHeadTags($this->html);

     $message = get_called_class() . ': HTTP Encoding "' . $this->encoding . '" ; HTML Encoding "' . $html_encoding . '"';
     Logger::setMessage($message);
 }
Пример #10
0
 /**
  * Extract the icon links from the HTML.
  *
  * @param string $html HTML
  *
  * @return array
  */
 public function extract($html)
 {
     $icons = array();
     if (empty($html)) {
         return $icons;
     }
     $dom = XmlParser::getHtmlDocument($html);
     $xpath = new DOMXpath($dom);
     $elements = $xpath->query("//link[contains(@rel, 'icon') and not(contains(@rel, 'apple'))]");
     for ($i = 0; $i < $elements->length; ++$i) {
         $icons[] = $elements->item($i)->getAttribute('href');
     }
     return $icons;
 }
Пример #11
0
 /**
  * Find the item media.
  *
  * Every element of the entry matching "media:*" is converted to an array of
  * its attributes; when the element has text, it is stored under the "content" key.
  * "thumbnail" elements are appended to $item->media->thumbnails, any other
  * element is stored on $item->media under its own tag name (a later element
  * with the same name replaces the earlier one).
  *
  * @param SimpleXMLElement      $entry Feed item
  * @param \AsteFeed\Parser\Item $item  Item object
  * @param \AsteFeed\Parser\Feed $feed  Feed object (not read here; kept so the signature matches the other findItem* methods)
  */
 public function findItemMedia(SimpleXMLElement $entry, Item $item, Feed $feed)
 {
     $item->media = new Media();

     // Treat a falsy XPath result as "no media tags" so the loop below never iterates a non-array.
     $mediaTags = XmlParser::getXPathResult($entry, 'media:*', $this->namespaces) ?: [];

     foreach ($mediaTags as $mediaTag) {
         $name = $mediaTag->getName();

         $array = [];
         foreach ($mediaTag->attributes() as $key => $value) {
             $array[$key] = (string) $value;
         }

         // Keep the element text; the previous code only read the key and never assigned it.
         $text = (string) $mediaTag;
         if ($text !== '') {
             $array['content'] = $text;
         }

         if ($name === 'thumbnail') {
             $item->media->thumbnails[] = $array;
         } else {
             $item->media->{$name} = $array;
         }
     }
 }
Пример #12
0
 /**
  * Strip useless tags and attributes.
  *
  * The content is loaded as a DOM document, cleaned by stripTags() and
  * stripAttributes(), then serialized from its document element.
  *
  * @param string $content
  *
  * @return string Cleaned markup, or the input unchanged when it cannot be loaded
  */
 public function stripGarbage($content)
 {
     $document = XmlParser::getDomDocument($content);

     // Nothing to clean when the document could not be built.
     if ($document === false) {
         return $content;
     }

     $finder = new DOMXPath($document);
     $this->stripTags($finder);
     $this->stripAttributes($document, $finder);

     return $document->saveXML($document->documentElement);
 }