/**
 * Run the content filter over a single feed item.
 *
 * When content filtering is enabled in the configuration, the item content
 * is handed to the HTML filter together with the feed site URL and the
 * filtered result is written back to the item. Otherwise only a log message
 * is recorded and the item is left untouched.
 *
 * @access public
 * @param  Feed  $feed  Feed the item belongs to (supplies the site URL)
 * @param  Item  $item  Item whose content is filtered in place
 * @return void  Nothing is returned explicitly, so callers receive null
 */
public function execute(Feed $feed, Item $item)
{
    if (! $this->config->getContentFiltering(true)) {
        Logger::setMessage(get_called_class() . ': Content filtering disabled');

        return;
    }

    $htmlFilter = Filter::html($item->getContent(), $feed->getSiteUrl());
    $htmlFilter->setConfig($this->config);

    $item->setContent($htmlFilter->execute());
}
/**
 * Pass markup through the HTML filter with an image proxy URL configured.
 *
 * The markup is returned unchanged when it is empty, when neither option is
 * enabled, or when only the image proxy is enabled and the current
 * connection is not secure.
 *
 * @param  string  $html            Markup to process
 * @param  string  $website         Website URL handed to the HTML filter
 * @param  bool    $proxy_images    Whether images should be proxied
 * @param  bool    $cloak_referrer  Whether the referrer should be cloaked
 * @return string  Filtered markup, or the input markup on the early exits
 */
function addProxyToTags($html, $website, $proxy_images, $cloak_referrer)
{
    if ($html === '') {
        return $html;
    }

    if (!$cloak_referrer) {
        if (!$proxy_images) {
            // nothing is enabled
            return $html;
        }

        if (!Helper\isSecureConnection()) {
            // only proxy enabled, but not connected via HTTPS
            return $html;
        }
    }

    $proxyConfig = new PicoFeedConfig();
    $proxyConfig->setFilterImageProxyUrl('?action=proxy&url=%s');

    if (!$cloak_referrer) {
        // Reaching this point without cloaking means the image proxy is on
        // (see the guards above). Image proxy mode only: https links do not
        // need to be proxied, since they do not trigger mixed content warnings.
        $proxyConfig->setFilterImageProxyProtocol('http');
    } elseif (!$proxy_images && Helper\isSecureConnection()) {
        // Cloaking mode only: a request going from a HTTPS connection to a
        // HTTP connection has its referrer omitted by the browser, so only
        // HTTPS to HTTPS requests need to be cloaked.
        $proxyConfig->setFilterImageProxyProtocol('https');
    }

    $htmlFilter = Filter::html($html, $website);
    $htmlFilter->setConfig($proxyConfig);

    return $htmlFilter->execute();
}
/**
 * Find the item title.
 *
 * Reads the first result of the "title" XPath query on the entry, strips
 * its white space and stores it on the item. When the cleaned title is an
 * empty string the item URL is used instead.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 */
public function findItemTitle(SimpleXMLElement $entry, Item $item)
{
    $nodes = XmlParser::getXPathResult($entry, 'title');
    $title = Filter::stripWhiteSpace((string) current($nodes));

    // Test for the empty string explicitly: a plain truthiness check would
    // also discard the legitimate title "0" and replace it with the URL.
    $item->title = ((string) $title !== '') ? $title : $item->url;
}
/**
 * Parse the document.
 *
 * The content is loaded as XML; if that fails it is normalized once and
 * loaded again. Feed-level fields are then resolved, followed by every
 * entry of the items tree, each of which is post-processed and appended to
 * the feed.
 *
 * @return \PicoFeed\Parser\Feed
 * @throws MalformedXmlException When the content cannot be loaded even after normalization
 */
public function execute()
{
    $class = get_called_class();
    Logger::setMessage($class . ': begin parsing');

    $root = XmlParser::getSimpleXml($this->content);

    if ($root === false) {
        // First attempt failed: normalize the raw content and retry once.
        Logger::setMessage($class . ': Applying XML workarounds');
        $this->content = Filter::normalizeData($this->content);
        $root = XmlParser::getSimpleXml($this->content);
    }

    if ($root === false) {
        Logger::setMessage($class . ': XML parsing error');
        Logger::setMessage(XmlParser::getErrors());

        throw new MalformedXmlException('XML parsing error');
    }

    $this->used_namespaces = $root->getNamespaces(true);
    $root = $this->registerSupportedNamespaces($root);

    // Feed-level fields. Each check* call runs right after its find* call.
    $feed = new Feed();
    $this->findFeedUrl($root, $feed);
    $this->checkFeedUrl($feed);
    $this->findSiteUrl($root, $feed);
    $this->checkSiteUrl($feed);
    $this->findFeedTitle($root, $feed);
    $this->findFeedDescription($root, $feed);
    $this->findFeedLanguage($root, $feed);
    $this->findFeedId($root, $feed);
    $this->findFeedDate($root, $feed);
    $this->findFeedLogo($root, $feed);
    $this->findFeedIcon($root, $feed);

    foreach ($this->getItemsTree($root) as $node) {
        $node = $this->registerSupportedNamespaces($node);

        $item = new Item();
        $item->xml = $node;
        $item->namespaces = $this->used_namespaces;

        $this->findItemAuthor($root, $node, $item);
        $this->findItemUrl($node, $item);
        $this->checkItemUrl($feed, $item);
        $this->findItemTitle($node, $item);
        $this->findItemContent($node, $item);

        // Id generation can use the item url/title/content (order is important)
        $this->findItemId($node, $item, $feed);
        $this->findItemDate($node, $item, $feed);
        $this->findItemEnclosure($node, $item, $feed);
        $this->findItemLanguage($node, $item, $feed);

        $this->itemPostProcessor->execute($feed, $item);
        $feed->items[] = $item;
    }

    Logger::setMessage($class . PHP_EOL . $feed);

    return $feed;
}
/**
 * Find the item title
 *
 * Stores the white-space-stripped <title> of the entry on the item. When
 * the cleaned title is an empty string the item URL is used instead.
 *
 * @access public
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 */
public function findItemTitle(SimpleXMLElement $entry, Item $item)
{
    $title = Filter::stripWhiteSpace((string) $entry->title);

    // empty() is also true for the string "0", which is a legitimate title;
    // only fall back to the URL when there is really no text.
    $item->title = ((string) $title !== '') ? $title : $item->url;
}
/**
 * Checks the output of Filter::normalizeData() for a series of XML snippets,
 * grouped by the kind of input named in the comment above each group.
 *
 * NOTE(review): in this copy of the test the input and the expected literal
 * of every assertion read the same; the control characters and entities
 * named in the group comments are not visible. They were presumably lost or
 * are invisible here - confirm against version control before relying on
 * these cases.
 */
public function testNormalizeData()
{
    // invalid data link escape control character
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random text</xml>"));
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random text</xml>"));
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random text</xml>"));

    // invalid unit separator control character (lower and upper case)
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random text</xml>"));
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random text</xml>"));
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random text</xml>"));
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random text</xml>"));
    $this->assertEquals('<xml>random text</xml>', Filter::normalizeData("<xml>random text</xml>"));

    /*
     * Do not test invalid multibyte characters. The output depends on the
     * php version and on the character.
     *
     * php 5.3: always null
     * php >5.3: sometimes null, sometimes the stripped string
     */

    // invalid backspace control character + valid multibyte character
    $this->assertEquals('<xml>“random“ text</xml>', Filter::normalizeData("<xml>“random“ text</xml>"));
    $this->assertEquals('<xml>“random“ text</xml>', Filter::normalizeData("<xml>“random“ text</xml>"));
    $this->assertEquals('<xml>“random“ text</xml>', Filter::normalizeData("<xml>“random“ text</xml>"));

    // do not convert valid entities to utf-8 character
    $this->assertEquals('<xml attribute=""value"">random text</xml>', Filter::normalizeData('<xml attribute=""value"">random text</xml>'));
    $this->assertEquals('<xml attribute=""value"">random text</xml>', Filter::normalizeData('<xml attribute=""value"">random text</xml>'));
}
/**
 * Render an attribute map as an HTML attribute string.
 *
 * Every entry becomes name="value" with the value passed through
 * Filter::escape(); the pairs are joined with a single space. An empty map
 * yields an empty string.
 *
 * @access public
 * @param  array   $attributes  Attribute values indexed by attribute name
 * @return string
 */
public function toHtml(array $attributes)
{
    $pairs = array_map(
        function ($name, $value) {
            return sprintf('%s="%s"', $name, Filter::escape($value));
        },
        array_keys($attributes),
        $attributes
    );

    return implode(' ', $pairs);
}
/**
 * Find the item title.
 *
 * The title is looked up with the "rss:title" XPath query (using the
 * parser namespaces), then with the plain "title" query, and finally via the
 * entry's title property. The extracted value has its white space stripped;
 * when the result is an empty string the item URL is used instead.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 */
public function findItemTitle(SimpleXMLElement $entry, Item $item)
{
    // A falsy lookup result moves on to the next source.
    $nodes = XmlParser::getXPathResult($entry, 'rss:title', $this->namespaces)
        ?: XmlParser::getXPathResult($entry, 'title')
        ?: $entry->title;

    $title = Filter::stripWhiteSpace(XmlParser::getValue($nodes));

    // Test for the empty string explicitly: a plain truthiness check would
    // also discard the legitimate title "0" and replace it with the URL.
    $item->setTitle(((string) $title !== '') ? $title : $item->getUrl());
}
/**
 * The image proxy callback set on the config must be used to rewrite the
 * src attribute of images in the filtered markup.
 */
public function testsetFilterImageProxyCallback()
{
    // Builds a proxy URL made of an HMAC-SHA1 key and the encoded image URL.
    $proxyCallback = function ($url) {
        $signature = hash_hmac('sha1', $url, 'secret');

        return 'https://mypublicproxy/' . $signature . '/' . rawurlencode($url);
    };

    $settings = new Config();
    $settings->setFilterImageProxyCallback($proxyCallback);

    $htmlFilter = Filter::html('<p>Image <img src="/image.png" alt="My Image"/></p>', 'http://foo');
    $htmlFilter->setConfig($settings);

    $expected = '<p>Image <img src="https://mypublicproxy/4924964043f3119b3cf2b07b1922d491bcc20092/'
        . rawurlencode('http://foo/image.png')
        . '" alt="My Image"/></p>';

    $this->assertEquals($expected, $htmlFilter->execute());
}
/**
 * Convert the HTML using its detected encoding and strip the head tags.
 *
 * The encoding read from the meta tag is preferred; when none is found the
 * encoding stored on this object is used. Both encodings are logged once
 * the HTML has been processed.
 */
public function prepareHtml()
{
    $metaEncoding = XmlParser::getEncodingFromMetaTag($this->html);
    $sourceEncoding = $metaEncoding ?: $this->encoding;

    $this->html = Encoding::convert($this->html, $sourceEncoding);
    $this->html = Filter::stripHeadTags($this->html);

    $message = get_called_class()
        . ': HTTP Encoding "' . $this->encoding
        . '" ; HTML Encoding "' . $metaEncoding . '"';

    Logger::setMessage($message);
}
/**
 * Parse the HTML content
 *
 * Does nothing and reports success when processing is flagged as skipped.
 * Otherwise the fetched HTML (if any) is converted, stripped of its head
 * tags and parsed, either with the rules returned by getRules() when they
 * are an array, or with candidates.
 *
 * @access public
 * @return bool  True when processing is skipped or when the content is not an empty string
 */
public function parse()
{
    if ($this->skip_processing) {
        return true;
    }

    $class = get_called_class();

    if (! $this->html) {
        Logger::setMessage($class . ': No content fetched');
    } else {
        $metaEncoding = XmlParser::getEncodingFromMetaTag($this->html);

        // Convert with the meta tag encoding when present, else the HTTP one
        Logger::setMessage($class . ': HTTP Encoding "' . $this->encoding . '" ; HTML Encoding "' . $metaEncoding . '"');

        $this->html = Encoding::convert($this->html, $metaEncoding ?: $this->encoding);
        $this->html = Filter::stripHeadTags($this->html);

        Logger::setMessage($class . ': Content length: ' . strlen($this->html) . ' bytes');

        $rules = $this->getRules();

        if (! is_array($rules)) {
            Logger::setMessage($class . ': Parse content with candidates');
            $this->parseContentWithCandidates();
        } else {
            Logger::setMessage($class . ': Parse content with rules');
            $this->parseContentWithRules($rules);
        }
    }

    // This second length message reports the extracted content, not the HTML.
    Logger::setMessage($class . ': Content length: ' . strlen($this->content) . ' bytes');
    Logger::setMessage($class . ': Grabber done');

    return $this->content !== '';
}
/**
 * Find the item title.
 *
 * Takes the value of the "title" XPath query on the entry, strips its white
 * space and sets it on the item. When the cleaned title is an empty string
 * the item URL is used instead.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 */
public function findItemTitle(SimpleXMLElement $entry, Item $item)
{
    $nodes = XmlParser::getXPathResult($entry, 'title');
    $title = Filter::stripWhiteSpace(XmlParser::getValue($nodes));

    // Test for the empty string explicitly: a plain truthiness check would
    // also discard the legitimate title "0" and replace it with the URL.
    $item->setTitle(((string) $title !== '') ? $title : $item->getUrl());
}
/**
 * Parse tag content.
 *
 * Replaces no-break spaces in the text with regular spaces, escapes the
 * result and appends it to the output buffer.
 *
 * @param resource $parser  XML parser
 * @param string   $content Tag content
 */
public function dataTag($parser, $content)
{
    // Replace the no-break space (U+00A0, UTF-8 bytes C2 A0) with a normal
    // space. The character is written as an escape sequence because a literal
    // one is indistinguishable from a regular space and is easily lost when
    // the file is edited or re-encoded, which would turn this call into a no-op.
    $content = str_replace("\xC2\xA0", ' ', $content);

    $this->output .= Filter::escape($content);
}