/**
 * Construct the final URL from location headers
 *
 * Every raw header line whose field name is exactly "Location"
 * (compared case-insensitively) is resolved against the current URL,
 * in the order the headers are given; the result replaces $this->url.
 * Lines without a colon, or with another field name, are ignored.
 *
 * @access private
 * @param  array   $headers   List of HTTP response header
 */
private function setEffectiveUrl($headers)
{
    foreach ($headers as $header) {
        // Split on the first colon only: the value is a URL and contains colons itself.
        $parts = explode(':', $header, 2);

        if (count($parts) !== 2 || strcasecmp(trim($parts[0]), 'Location') !== 0) {
            continue;
        }

        // Whitespace around the field value is optional, so strip it instead of
        // requiring exactly one space after the colon.
        $this->url = Url::resolve(trim($parts[1]), $this->url);
    }
}
/**
 * Resolve the item URL against the site URL of its feed.
 *
 * The resolved value is written back to $item->url.
 *
 * @access public
 * @param  Feed   $feed   Feed object (provides the site URL)
 * @param  Item   $item   Item object (its url property is overwritten)
 */
public function checkItemUrl(Feed $feed, Item $item)
{
    $item_url = $item->getUrl();
    $site_url = $feed->getSiteUrl();

    $item->url = Url::resolve($item_url, $site_url);
}
/**
 * Find the item enclosure.
 *
 * Does nothing when the entry has no <enclosure> element. Otherwise the
 * feedburner:origEnclosureLink value is preferred over the enclosure "url"
 * attribute, and the chosen URL is resolved against the feed site URL.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 * @param \PicoFeed\Parser\Feed $feed  Feed object
 */
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    if (! isset($entry->enclosure)) {
        return;
    }

    $enclosure_url = XmlParser::getXPathResult($entry, 'feedburner:origEnclosureLink', $this->namespaces);

    // Fall back to the plain enclosure URL when the FeedBurner lookup is empty.
    if (! $enclosure_url) {
        $enclosure_url = XmlParser::getXPathResult($entry, 'enclosure/@url');
    }

    $enclosure_type = XmlParser::getXPathResult($entry, 'enclosure/@type');

    $item->enclosure_url = Url::resolve((string) current($enclosure_url), $feed->getSiteUrl());
    $item->enclosure_type = (string) current($enclosure_type);
}
/**
 * Handle HTTP redirects
 *
 * Follows the redirection chain starting at $location. Before each request
 * the target is resolved against the current URL and the buffered body and
 * response headers are cleared. The loop ends with the first response whose
 * status is not a redirection.
 *
 * @param string $location Redirected URL
 *
 * @throws MaxRedirectException When the redirect counter reaches $this->max_redirects
 *
 * @return array Result of the last doRequest() call
 */
private function handleRedirection($location)
{
    $target = $location;

    for ($hops = 1; ; ++$hops) {
        // Point the client at the next hop and drop any state from the previous response.
        $this->url = Url::resolve($target, $this->url);
        $this->body = '';
        $this->body_length = 0;
        $this->response_headers = array();
        $this->response_headers_count = 0;

        if ($hops >= $this->max_redirects) {
            throw new MaxRedirectException('Maximum number of redirections reached');
        }

        $response = $this->doRequest();

        if (! $this->isRedirection($response['status'])) {
            return $response;
        }

        $target = $response['headers']['Location'];
    }
}
/**
 * Find the item enclosure
 *
 * Does nothing when the entry has no <enclosure> element. The namespaced
 * origEnclosureLink value is preferred; when it is empty the enclosure
 * "url" attribute is used instead (or an empty string if it is absent).
 * The URL is finally resolved against the feed site URL.
 *
 * @access public
 * @param  SimpleXMLElement        $entry   Feed item
 * @param  \PicoFeed\Parser\Item   $item    Item object
 * @param  \PicoFeed\Parser\Feed   $feed    Feed object
 */
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    if (! isset($entry->enclosure)) {
        return;
    }

    $item->enclosure_url = XmlParser::getNamespaceValue($entry->enclosure, $this->namespaces, 'origEnclosureLink');

    if (empty($item->enclosure_url)) {
        if (isset($entry->enclosure['url'])) {
            $item->enclosure_url = (string) $entry->enclosure['url'];
        } else {
            $item->enclosure_url = '';
        }
    }

    if (isset($entry->enclosure['type'])) {
        $item->enclosure_type = (string) $entry->enclosure['type'];
    } else {
        $item->enclosure_type = '';
    }

    $item->enclosure_url = Url::resolve($item->enclosure_url, $feed->getSiteUrl());
}
/**
 * Find the item enclosure
 *
 * Looks up the entry's "enclosure" link via findLink(). When one is found,
 * its "href" attribute is resolved against the feed site URL and stored
 * together with its "type" attribute; otherwise the item is left untouched.
 *
 * @access public
 * @param  SimpleXMLElement        $entry   Feed item
 * @param  \PicoFeed\Parser\Item   $item    Item object
 * @param  \PicoFeed\Parser\Feed   $feed    Feed object
 */
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $link = $this->findLink($entry, 'enclosure');

    if (! $link) {
        return;
    }

    $href = (string) $link['href'];

    $item->enclosure_url = Url::resolve($href, $feed->getSiteUrl());
    $item->enclosure_type = (string) $link['type'];
}
/**
 * Convert all relative links to absolute url
 *
 * When the attribute is recognised by isResource(), its value is replaced
 * in place (it is passed by reference) with the value resolved against
 * $this->website. Other attributes are left unchanged.
 *
 * @access public
 * @param  string    $tag          Tag name (not used by this method)
 * @param  string    $attribute    Attribute name
 * @param  string    $value        Attribute value, modified by reference
 * @return boolean                 Always true
 */
public function rewriteAbsoluteUrl($tag, $attribute, &$value)
{
    if (! $this->isResource($attribute)) {
        return true;
    }

    $value = Url::resolve($value, $this->website);

    return true;
}
/**
 * Execute the scraper.
 *
 * Downloads and prepares the page at the current URL, then runs the parser
 * (if any) and appends its output to $pageContent. While the parser reports
 * a next-page link and the recursion limit is not reached, the method moves
 * to that link and calls itself; the accumulated text is stored in
 * $this->content by the last call of the chain.
 *
 * @param string $pageContent    Content accumulated from previous pages
 * @param int    $recursionDepth Number of next-page links followed so far
 */
public function execute($pageContent = '', $recursionDepth = 0)
{
    $this->html = '';
    $this->encoding = '';
    $this->content = '';

    $this->download();
    $this->prepareHtml();

    $parser = $this->getParser();

    if ($parser === null) {
        return;
    }

    // Fall back to 25 pages when the configuration does not define a limit.
    $maxRecursions = $this->config->getMaxRecursions();
    if ($maxRecursions === null) {
        $maxRecursions = 25;
    }

    $pageContent .= $parser->execute();

    $nextLink = $parser->findNextLink();
    $followNextPage = $nextLink !== null && $recursionDepth < $maxRecursions;

    if ($followNextPage) {
        // Continue with the next page, carrying the content gathered so far.
        $nextLink = Url::resolve($nextLink, $this->url);
        $this->setUrl($nextLink);
        $this->execute($pageContent, $recursionDepth + 1);
    } else {
        $this->content = $pageContent;
    }

    Logger::setMessage(get_called_class() . ': Content length: ' . strlen($this->content) . ' bytes');
}
/**
 * Url::resolve() must produce the expected absolute URL for each
 * combination of link and website listed below.
 */
public function testResolve()
{
    // Each row: expected result, link to resolve, website it is resolved against.
    $cases = array(
        // relative link
        array('http://miniflux.net/assets/img/favicon.png', 'assets/img/favicon.png', 'http://miniflux.net'),
        // relative link + HTTPS
        array('https://miniflux.net/assets/img/favicon.png', 'assets/img/favicon.png', 'https://miniflux.net'),
        // absolute link
        array('http://miniflux.net/assets/img/favicon.png', '/assets/img/favicon.png', 'http://miniflux.net'),
        // absolute link + HTTPS
        array('https://miniflux.net/assets/img/favicon.png', '/assets/img/favicon.png', 'https://miniflux.net'),
        // Protocol relative link
        array('http://google.com/assets/img/favicon.png', '//google.com/assets/img/favicon.png', 'http://miniflux.net'),
        // Protocol relative link + HTTPS
        array('https://google.com/assets/img/favicon.png', '//google.com/assets/img/favicon.png', 'https://miniflux.net'),
        // URL same fqdn
        array('http://miniflux.net/assets/img/favicon.png', 'http://miniflux.net/assets/img/favicon.png', 'https://miniflux.net'),
        // URL different fqdn
        array('https://www.google.com/assets/img/favicon.png', 'https://www.google.com/assets/img/favicon.png', 'https://miniflux.net'),
        // HTTPS URL
        array('https://miniflux.net/assets/img/favicon.png', 'https://miniflux.net/assets/img/favicon.png', 'https://miniflux.net'),
        // empty string on missing website parameter
        array('', 'favicon.png', ''),
        // website only on missing icon parameter
        array('https://miniflux.net/', '', 'https://miniflux.net'),
        // empty string on missing website and icon parameter
        array('', '', ''),
    );

    foreach ($cases as $case) {
        list($expected, $link, $website) = $case;

        $this->assertEquals($expected, Url::resolve($link, $website));
    }
}
/**
 * Get the icon link for a website.
 *
 * With an explicit $favicon_link only that link is tried; if its download
 * yields empty content the lookup is restarted without it. Without one,
 * the candidates are the links extracted from the website's base page
 * followed by "/favicon.ico". The content and content type of the last
 * downloaded candidate are stored on the instance.
 *
 * @param string $website_link URL
 * @param string $favicon_link optional URL
 *
 * @return string Resolved URL of the first candidate with non-empty content, or '' if none
 */
public function find($website_link, $favicon_link = '')
{
    $website = new Url($website_link);
    $has_explicit_icon = $favicon_link !== '';

    if ($has_explicit_icon) {
        $candidates = array($favicon_link);
    } else {
        $homepage = $this->download($website->getBaseUrl('/'));
        $candidates = $this->extract($homepage->getContent());
        $candidates[] = $website->getBaseUrl('/favicon.ico');
    }

    foreach ($candidates as $candidate) {
        $icon_url = Url::resolve($candidate, $website);
        $resource = $this->download($icon_url);

        $this->content = $resource->getContent();
        $this->content_type = $resource->getContentType();

        if ($this->content !== '') {
            return $icon_url;
        }

        // The caller-supplied icon was empty: retry with automatic discovery.
        if ($has_explicit_icon) {
            return $this->find($website_link);
        }
    }

    return '';
}
/**
 * Find the item enclosure.
 *
 * Does nothing when the entry has no <enclosure> element. Otherwise the
 * feedburner:origEnclosureLink value is preferred over the enclosure "url"
 * attribute; the chosen URL is resolved against the feed site URL and
 * passed to the item setters together with the enclosure type.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 * @param \PicoFeed\Parser\Feed $feed  Feed object
 */
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    if (! isset($entry->enclosure)) {
        return;
    }

    $type = XmlParser::getXPathResult($entry, 'enclosure/@type');

    $url = XmlParser::getXPathResult($entry, 'feedburner:origEnclosureLink', $this->namespaces);

    // Fall back to the plain enclosure URL when the FeedBurner lookup is empty.
    if (! $url) {
        $url = XmlParser::getXPathResult($entry, 'enclosure/@url');
    }

    $item->setEnclosureUrl(Url::resolve(XmlParser::getValue($url), $feed->getSiteUrl()));
    $item->setEnclosureType(XmlParser::getValue($type));
}
/**
 * Url::resolve() must produce the expected absolute URL for each
 * combination of link and website listed below, including a path
 * that contains non-ASCII characters.
 */
public function testResolve()
{
    // Each row: expected result, link to resolve, website it is resolved against.
    $cases = array(
        // relative link
        array('http://miniflux.net/assets/img/favicon.png', 'assets/img/favicon.png', 'http://miniflux.net'),
        // relative link + HTTPS
        array('https://miniflux.net/assets/img/favicon.png', 'assets/img/favicon.png', 'https://miniflux.net'),
        // absolute link
        array('http://miniflux.net/assets/img/favicon.png', '/assets/img/favicon.png', 'http://miniflux.net'),
        // absolute link + HTTPS
        array('https://miniflux.net/assets/img/favicon.png', '/assets/img/favicon.png', 'https://miniflux.net'),
        // Protocol relative link
        array('http://google.com/assets/img/favicon.png', '//google.com/assets/img/favicon.png', 'http://miniflux.net'),
        // Protocol relative link + HTTPS
        array('https://google.com/assets/img/favicon.png', '//google.com/assets/img/favicon.png', 'https://miniflux.net'),
        // URL same fqdn
        array('http://miniflux.net/assets/img/favicon.png', 'http://miniflux.net/assets/img/favicon.png', 'https://miniflux.net'),
        // URL different fqdn
        array('https://www.google.com/assets/img/favicon.png', 'https://www.google.com/assets/img/favicon.png', 'https://miniflux.net'),
        // HTTPS URL
        array('https://miniflux.net/assets/img/favicon.png', 'https://miniflux.net/assets/img/favicon.png', 'https://miniflux.net'),
        // empty string on missing website parameter
        array('', 'favicon.png', ''),
        // website only on missing icon parameter
        array('https://miniflux.net/', '', 'https://miniflux.net'),
        // empty string on missing website and icon parameter
        array('', '', ''),
        // Test no-ascii paths
        array(
            'http://lesjoiesducode.fr/post/125336534020/quand-la-page-doit-%C3%AAtre-pixel-perfect',
            'http://lesjoiesducode.fr/post/125336534020/quand-la-page-doit-être-pixel-perfect',
            'http://lesjoiesducode.fr/post/125336534020',
        ),
    );

    foreach ($cases as $case) {
        list($expected, $link, $website) = $case;

        $this->assertEquals($expected, Url::resolve($link, $website));
    }
}