/**
 * Construct the final URL from location headers.
 *
 * Each "Location" header found is resolved against the current URL, in the
 * order the headers appear, and the result becomes the new current URL.
 *
 * @param array $headers List of raw HTTP response header lines ("Name: value")
 */
private function setEffectiveUrl($headers)
{
    foreach ($headers as $header) {
        // Split on the first colon only: the value is a URL and contains colons itself.
        $parts = explode(':', $header, 2);

        // Skip lines without a value and headers whose name merely starts with "Location".
        if (count($parts) !== 2 || strcasecmp(trim($parts[0]), 'Location') !== 0) {
            continue;
        }

        // Whitespace after the colon is optional, so trim instead of relying on ": ".
        $this->url = Url::resolve(trim($parts[1]), $this->url);
    }
}
/**
 * Find feed urls inside a HTML document.
 *
 * Looks for <link> elements whose type attribute is exactly
 * "application/rss+xml" or "application/atom+xml" and collects their
 * non-empty href values. RSS links are listed before Atom links.
 * Relative hrefs are completed with the base URL of the website.
 *
 * @param string $url  Website url
 * @param string $html HTML content
 *
 * @return array List of feed links
 */
public function find($url, $html)
{
    Logger::setMessage(get_called_class() . ': Try to discover subscriptions');

    $dom = XmlParser::getHtmlDocument($html);
    $xpath = new DOMXPath($dom);
    $links = array();
    $queries = array(
        '//link[@type="application/rss+xml"]',
        '//link[@type="application/atom+xml"]',
    );

    // The website base URL depends only on $url: compute it at most once,
    // and only when a relative feed link actually needs it.
    $siteBaseUrl = null;

    foreach ($queries as $query) {
        $nodes = $xpath->query($query);

        foreach ($nodes as $node) {
            $link = $node->getAttribute('href');

            if (empty($link)) {
                continue;
            }

            $feedUrl = new Url($link);
            $baseUrl = '';

            if ($feedUrl->isRelativeUrl()) {
                if ($siteBaseUrl === null) {
                    $siteUrl = new Url($url);
                    $siteBaseUrl = $siteUrl->getBaseUrl();
                }

                $baseUrl = $siteBaseUrl;
            }

            $links[] = $feedUrl->getAbsoluteUrl($baseUrl);
        }
    }

    Logger::setMessage(get_called_class() . ': ' . implode(', ', $links));

    return $links;
}
/**
 * Resolve the item url against the feed's site url.
 *
 * The resolved value is written back to the item's url property.
 *
 * @param Feed $feed Feed object
 * @param Item $item Item object
 */
public function checkItemUrl(Feed $feed, Item $item)
{
    $itemUrl = $item->getUrl();
    $siteUrl = $feed->getSiteUrl();

    $item->url = Url::resolve($itemUrl, $siteUrl);
}
/**
 * Handle manually redirections when there is an open base dir restriction.
 *
 * Follows the given location, then keeps following redirections until a
 * non-redirect response is received.
 *
 * @param string $location Redirected URL
 *
 * @return array Result of the last request performed
 *
 * @throws MaxRedirectException When the redirection counter reaches max_redirects
 */
private function handleRedirection($location)
{
    $target = $location;

    for ($attempt = 1; ; ++$attempt) {
        // Point the client at the next hop and discard everything received so far.
        $this->url = Url::resolve($target, $this->url);
        $this->body = '';
        $this->body_length = 0;
        $this->response_headers = array();
        $this->response_headers_count = 0;

        if ($attempt >= $this->max_redirects) {
            throw new MaxRedirectException('Maximum number of redirections reached');
        }

        $response = $this->doRequest(false);

        if (!$this->isRedirection($response['status'])) {
            return $response;
        }

        // Another redirection: follow its Location header on the next pass.
        $target = $response['headers']['Location'];
    }
}
/**
 * Find the item enclosure.
 *
 * Uses the entry's "enclosure" link, when findLink() returns one, to fill
 * the item's enclosure url (resolved against the feed's site url) and
 * enclosure type. The item is left untouched otherwise.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \AsteFeed\Parser\Item $item  Item object
 * @param \AsteFeed\Parser\Feed $feed  Feed object
 */
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $link = $this->findLink($entry, 'enclosure');

    if (!$link) {
        return;
    }

    $href = (string) $link['href'];
    $item->enclosure_url = Url::resolve($href, $feed->getSiteUrl());
    $item->enclosure_type = (string) $link['type'];
}
/**
 * Get the parser.
 *
 * When grabber rules exist for the current url, the first rule whose
 * pattern matches the url's full path yields a RuleParser. If grabber
 * rules exist but none matches, null is returned even when the candidate
 * parser is enabled. Without grabber rules, a CandidateParser is returned
 * only if the candidate parser is enabled.
 *
 * @return ParserInterface|null Parser to use, or null when none applies
 */
public function getParser()
{
    $ruleLoader = new RuleLoader($this->config);
    $rules = $ruleLoader->getRules($this->url);

    if (!empty($rules['grabber'])) {
        Logger::setMessage(get_called_class() . ': Parse content with rules');

        // The path to match does not depend on the rule, so build it once.
        $url = new Url($this->url);
        $sub_url = $url->getFullPath();

        foreach ($rules['grabber'] as $pattern => $rule) {
            if (preg_match($pattern, $sub_url)) {
                Logger::setMessage(get_called_class() . ': Matched url ' . $sub_url);

                return new RuleParser($this->html, $rule);
            }
        }
    } elseif ($this->enableCandidateParser) {
        Logger::setMessage(get_called_class() . ': Parse content with candidates');

        return new CandidateParser($this->html);
    }

    return null;
}
/**
 * Get the icon link for a website.
 *
 * When an explicit favicon link is given it is the only candidate tried.
 * If its downloaded content is empty, the lookup restarts without it.
 * Otherwise the candidates are the links extracted from the website's
 * base page, followed by "/favicon.ico" on the website's base url.
 *
 * The content and content type of the last downloaded candidate are kept
 * on the instance.
 *
 * @param string $website_link URL
 * @param string $favicon_link optional URL
 *
 * @return string Resolved link of the first candidate with non-empty content, '' if none
 */
public function find($website_link, $favicon_link = '')
{
    $website = new Url($website_link);
    $hasExplicitIcon = $favicon_link !== '';

    if ($hasExplicitIcon) {
        $candidates = array($favicon_link);
    } else {
        $homepage = $this->download($website->getBaseUrl('/'));
        $candidates = $this->extract($homepage->getContent());
        $candidates[] = $website->getBaseUrl('/favicon.ico');
    }

    foreach ($candidates as $candidate) {
        $resolved = Url::resolve($candidate, $website);
        $resource = $this->download($resolved);

        $this->content = $resource->getContent();
        $this->content_type = $resource->getContentType();

        if ($this->content !== '') {
            return $resolved;
        }

        if ($hasExplicitIcon) {
            // The supplied favicon gave nothing: fall back to discovery.
            return $this->find($website_link);
        }
    }

    return '';
}
/**
 * Find the item enclosure.
 *
 * Only acts when the entry has an <enclosure> element. The url comes from
 * feedburner:origEnclosureLink when that query yields a result, otherwise
 * from the enclosure's url attribute. It is resolved against the feed's
 * site url. The type comes from the enclosure's type attribute.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \AsteFeed\Parser\Item $item  Item object
 * @param \AsteFeed\Parser\Feed $feed  Feed object
 */
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    if (!isset($entry->enclosure)) {
        return;
    }

    $urlNodes = XmlParser::getXPathResult($entry, 'feedburner:origEnclosureLink', $this->namespaces);

    if (!$urlNodes) {
        $urlNodes = XmlParser::getXPathResult($entry, 'enclosure/@url');
    }

    $typeNodes = XmlParser::getXPathResult($entry, 'enclosure/@type');

    $rawUrl = (string) current($urlNodes);
    $item->enclosure_url = Url::resolve($rawUrl, $feed->getSiteUrl());
    $item->enclosure_type = (string) current($typeNodes);
}
/**
 * Convert all relative links to absolute url.
 *
 * When the attribute is a resource attribute (per isResource()), its value
 * is replaced in place by the result of resolving it against the website.
 *
 * @param string $tag       Tag name
 * @param string $attribute Attribute name
 * @param string $value     Attribute value (passed by reference)
 *
 * @return bool Always true
 */
public function rewriteAbsoluteUrl($tag, $attribute, &$value)
{
    if (!$this->isResource($attribute)) {
        return true;
    }

    $value = Url::resolve($value, $this->website);

    return true;
}
/**
 * Called after XML parsing.
 *
 * Loads the rules for the website and, for every "filter" entry whose
 * pattern matches the website's full path, applies each of its
 * search/replace regular expressions to the content in order.
 *
 * @param string $content the content that should be filtered
 *
 * @return string the filtered content
 */
public function filterRules($content)
{
    // the constructor should require a config, then this fallback can be removed
    $config = $this->config === null ? new Config() : $this->config;

    $loader = new RuleLoader($config);
    $rules = $loader->getRules($this->website);

    $url = new Url($this->website);
    $sub_url = $url->getFullPath();

    if (isset($rules['filter'])) {
        foreach ($rules['filter'] as $pattern => $rule) {
            if (!preg_match($pattern, $sub_url)) {
                continue;
            }

            foreach ($rule as $search => $replace) {
                $filtered = preg_replace($search, $replace, $content);

                // preg_replace() returns null on error; keep the content as it
                // was instead of discarding it and feeding null to later rules.
                if ($filtered !== null) {
                    $content = $filtered;
                }
            }
        }
    }

    return $content;
}