/**
 * Run the HTML content filter on a single item.
 *
 * When content filtering is enabled in the configuration, the item content
 * is filtered (using the feed site URL) and written back to the item.
 * Otherwise only a log message is recorded and the item is left untouched.
 *
 * @access public
 * @param  Feed  $feed  Feed the item belongs to (its site URL is passed to the filter)
 * @param  Item  $item  Item whose content is filtered in place
 * @return bool  NOTE(review): declared as bool, but no value is returned on either path - confirm with callers
 */
public function execute(Feed $feed, Item $item)
{
    // Nothing to do except tracing when filtering is turned off.
    if (! $this->config->getContentFiltering(true)) {
        Logger::setMessage(get_called_class() . ': Content filtering disabled');

        return;
    }

    $html_filter = Filter::html($item->getContent(), $feed->getSiteUrl());
    $html_filter->setConfig($this->config);

    $item->setContent($html_filter->execute());
}
/**
 * Fetch a URL with the HTTP client in passthrough mode.
 *
 * Logging is enabled first when the "debug_mode" setting is on, and the
 * debug output is written once the request has been executed.
 *
 * @param string $url Resource to download
 */
function download($url)
{
    $debug_enabled = (bool) Config\get('debug_mode');

    if ($debug_enabled) {
        Logger::enable();
    }

    $http = Client::getInstance();
    $http->setUserAgent(Config\HTTP_USER_AGENT);
    $http->enablePassthroughMode();
    $http->execute($url);

    Config\write_debug();
}
/**
 * Load the first existing rule file among a list of candidates.
 *
 * Each candidate name is resolved to "<folder>/<name>.php"; the first one
 * found on disk is included and whatever it returns is handed back.
 *
 * @param string $folder Rule directory
 * @param array  $files  Candidate file names (without the ".php" extension)
 *
 * @return array Result of the included file, or an empty array when no candidate exists
 */
public function loadRuleFile($folder, array $files)
{
    // Local variable names are kept as-is on purpose: the included file
    // runs inside this method's variable scope.
    foreach ($files as $file) {
        $filename = $folder . '/' . $file . '.php';

        if (! file_exists($filename)) {
            continue;
        }

        Logger::setMessage(get_called_class() . ' Load rule: ' . $file);

        return include $filename;
    }

    return array();
}
/**
 * Import the subscriptions described by the OPML content.
 *
 * The trimmed content is parsed as XML; the document must have an "opml"
 * root element and a "body" child, otherwise the import is aborted.
 *
 * @return array|false Parsed items, or false when the document is not valid OPML
 */
public function execute()
{
    $class = get_called_class();

    Logger::setMessage($class . ': start importation');

    $document = XmlParser::getSimpleXml(trim($this->content));

    // Short-circuit evaluation keeps the method calls away from a failed parse.
    $is_opml = $document !== false
        && $document->getName() === 'opml'
        && isset($document->body);

    if (! $is_opml) {
        Logger::setMessage($class . ': OPML tag not found or malformed XML document');

        return false;
    }

    $this->parseEntries($document->body);

    Logger::setMessage($class . ': ' . count($this->items) . ' subscriptions found');

    return $this->items;
}
/**
 * Build the options array for the HTTP stream context.
 *
 * Always sets the method, protocol version, timeout, redirect limit and
 * request headers; adds the proxy settings when a proxy hostname is defined.
 *
 * @access private
 * @return array Context options, keyed by the "http" wrapper name
 */
private function prepareContext()
{
    $http = array(
        'method' => 'GET',
        'protocol_version' => 1.1,
        'timeout' => $this->timeout,
        'max_redirects' => $this->max_redirects,
    );

    if ($this->proxy_hostname) {
        $proxy_address = $this->proxy_hostname . ':' . $this->proxy_port;

        Logger::setMessage(get_called_class() . ' Proxy: ' . $proxy_address);

        $http['proxy'] = 'tcp://' . $proxy_address;
        $http['request_fulluri'] = true;

        // Only the presence of credentials is traced, never their value.
        $credentials_message = $this->proxy_username
            ? ' Proxy credentials: Yes'
            : ' Proxy credentials: No';

        Logger::setMessage(get_called_class() . $credentials_message);
    }

    $http['header'] = implode("\r\n", $this->prepareHeaders());

    return array('http' => $http);
}
/**
 * Parse raw HTTP header lines into a status code and a header collection.
 *
 * Every line starting with "HTTP/1" is treated as a status line: the
 * headers collected so far are discarded and the status code is read again,
 * so only the headers following the last status line are kept.
 *
 * @static
 *
 * @param array $lines List of raw header lines
 *
 * @return array Two elements: the integer status code (0 when no status line
 *               was seen) and a new instance built from the parsed headers
 */
public static function parse(array $lines)
{
    $status = 0;
    $headers = array();

    foreach ($lines as $line) {
        if (strpos($line, 'HTTP/1') === 0) {
            // New status line: start over with an empty header list.
            $headers = array();
            // The 3-digit code sits right after "HTTP/1.x ".
            $status = (int) substr($line, 9, 3);
        } elseif (strpos($line, ': ') !== false) {
            // Split on the FIRST separator only, otherwise a value that
            // itself contains ": " would be truncated.
            list($name, $value) = explode(': ', $line, 2);

            // Headers with an empty (falsy) value are skipped.
            if ($value) {
                $headers[trim($name)] = trim($value);
            }
        }
    }

    Logger::setMessage(get_called_class() . ' HTTP status code: ' . $status);

    foreach ($headers as $name => $value) {
        Logger::setMessage(get_called_class() . ' HTTP header: ' . $name . ' => ' . $value);
    }

    return array($status, new self($headers));
}
/**
 * Parse the document and build the feed with all its items.
 *
 * The content is parsed as XML; if that fails, the content is normalized
 * and parsed a second time before giving up. Feed-level properties are then
 * resolved, followed by one Item per entry of the items tree. Each item is
 * passed to the item post-processor before being appended to the feed.
 *
 * @throws MalformedXmlException When the content cannot be parsed even after normalization
 *
 * @return \PicoFeed\Parser\Feed
 */
public function execute()
{
    Logger::setMessage(get_called_class() . ': begin parsing');

    $xml = XmlParser::getSimpleXml($this->content);

    if ($xml === false) {
        // First attempt failed: normalize the raw content and try once more.
        Logger::setMessage(get_called_class() . ': Applying XML workarounds');
        $this->content = Filter::normalizeData($this->content);
        $xml = XmlParser::getSimpleXml($this->content);

        if ($xml === false) {
            Logger::setMessage(get_called_class() . ': XML parsing error');
            Logger::setMessage(XmlParser::getErrors());
            throw new MalformedXmlException('XML parsing error');
        }
    }

    // Remember the namespaces declared in the document (recursive lookup);
    // they are copied onto every item below.
    $this->used_namespaces = $xml->getNamespaces(true);
    $xml = $this->registerSupportedNamespaces($xml);

    $feed = new Feed();

    // Feed-level properties. Each URL lookup is immediately followed by its check.
    $this->findFeedUrl($xml, $feed);
    $this->checkFeedUrl($feed);

    $this->findSiteUrl($xml, $feed);
    $this->checkSiteUrl($feed);

    $this->findFeedTitle($xml, $feed);
    $this->findFeedDescription($xml, $feed);
    $this->findFeedLanguage($xml, $feed);
    $this->findFeedId($xml, $feed);
    $this->findFeedDate($xml, $feed);
    $this->findFeedLogo($xml, $feed);
    $this->findFeedIcon($xml, $feed);

    foreach ($this->getItemsTree($xml) as $entry) {
        $entry = $this->registerSupportedNamespaces($entry);

        // The item keeps a reference to its XML node and to the document namespaces.
        $item = new Item();
        $item->xml = $entry;
        $item->namespaces = $this->used_namespaces;

        $this->findItemAuthor($xml, $entry, $item);

        $this->findItemUrl($entry, $item);
        $this->checkItemUrl($feed, $item);

        $this->findItemTitle($entry, $item);
        $this->findItemContent($entry, $item);

        // Id generation can use the item url/title/content (order is important)
        $this->findItemId($entry, $item, $feed);
        $this->findItemDate($entry, $item, $feed);
        $this->findItemEnclosure($entry, $item, $feed);
        $this->findItemLanguage($entry, $item, $feed);

        // Post-processing runs once the item is fully populated.
        $this->itemPostProcessor->execute($feed, $item);
        $feed->items[] = $item;
    }

    // The feed is concatenated into the log message, so it is converted to a string here.
    Logger::setMessage(get_called_class() . PHP_EOL . $feed);

    return $feed;
}
/**
 * Run the prepared cURL handle and record the outcome.
 *
 * Transfer statistics are logged after execution. On a cURL error the
 * error text is logged, the handle is closed and the error number is
 * passed to handleError(). Otherwise the effective URL is stored.
 */
private function executeContext()
{
    $handle = $this->prepareContext();
    curl_exec($handle);

    // Log label => curl_getinfo() option, in the order they are reported.
    $statistics = array(
        ' cURL total time: ' => CURLINFO_TOTAL_TIME,
        ' cURL dns lookup time: ' => CURLINFO_NAMELOOKUP_TIME,
        ' cURL connect time: ' => CURLINFO_CONNECT_TIME,
        ' cURL speed download: ' => CURLINFO_SPEED_DOWNLOAD,
        ' cURL effective url: ' => CURLINFO_EFFECTIVE_URL,
    );

    foreach ($statistics as $label => $option) {
        Logger::setMessage(get_called_class() . $label . curl_getinfo($handle, $option));
    }

    $error_number = curl_errno($handle);

    if ($error_number) {
        Logger::setMessage(get_called_class() . ' cURL error: ' . curl_error($handle));
        curl_close($handle);

        $this->handleError($error_number);
    }

    // Update the url if there were redirects
    $this->url = curl_getinfo($handle, CURLINFO_EFFECTIVE_URL);

    curl_close($handle);
}
/**
 * Update the "modified" state from the response status.
 *
 * A 304 status marks the resource as not modified. A 200 status compares
 * the response against the stored ETag / Last-Modified values and then
 * refreshes both from the response headers. Any other status leaves the
 * current state untouched.
 *
 * @param array $response Client response (the "status" entry is read here)
 */
public function handleNotModifiedResponse(array $response)
{
    // switch compares loosely, exactly like the == operator.
    switch ($response['status']) {
        case 304:
            $this->is_modified = false;
            break;

        case 200:
            // Compare against the previous validators before overwriting them.
            $this->is_modified = $this->hasBeenModified($response, $this->etag, $this->last_modified);
            $this->etag = $this->getHeader($response, 'ETag');
            $this->last_modified = $this->getHeader($response, 'Last-Modified');
            break;
    }

    if ($this->is_modified !== false) {
        return;
    }

    Logger::setMessage(get_called_class() . ' Resource not modified');
}
/**
 * Tell whether a node can be removed from the document.
 *
 * A node holding 90% or more of the document text (measured in bytes of
 * text content) is kept; everything else, including any node of a
 * document without text, may be removed.
 *
 * @param DomDocument $dom
 * @param DomNode     $node
 *
 * @return bool False if the node should not be removed, true otherwise
 */
public function shouldRemove(DomDocument $dom, $node)
{
    $total_bytes = strlen($dom->textContent);
    $node_bytes = strlen($node->textContent);

    // Avoid a division by zero on documents without any text.
    if (0 === $total_bytes) {
        return true;
    }

    $percentage = $node_bytes * 100 / $total_bytes;

    if ($percentage < 90) {
        return true;
    }

    Logger::setMessage(get_called_class() . ': Should not remove this node (' . $node->nodeName . ') ratio: ' . $percentage . '%');

    return false;
}
/**
 * Write the collected log messages to the debug file.
 *
 * Does nothing unless the "debug_mode" setting is enabled. Messages are
 * joined with the platform end-of-line sequence and the file is overwritten.
 */
function write_debug()
{
    if (! (bool) get('debug_mode')) {
        return;
    }

    $log_output = implode(PHP_EOL, Logger::getMessages());

    file_put_contents(DEBUG_FILENAME, $log_output);
}
/**
 * Discover feed links declared in an HTML document.
 *
 * Looks for <link> elements typed as RSS first, then Atom, and collects
 * their non-empty "href" values. Relative links are resolved against the
 * base URL of the website.
 *
 * @access public
 * @param  string  $url   Website url
 * @param  string  $html  HTML content
 * @return array          List of feed links
 */
public function find($url, $html)
{
    Logger::setMessage(get_called_class() . ': Try to discover subscriptions');

    $xpath = new DOMXPath(XmlParser::getHtmlDocument($html));
    $links = array();

    foreach (array('//link[@type="application/rss+xml"]', '//link[@type="application/atom+xml"]') as $query) {
        foreach ($xpath->query($query) as $node) {
            $href = $node->getAttribute('href');

            if (empty($href)) {
                continue;
            }

            $feedUrl = new Url($href);
            $siteUrl = new Url($url);

            // Only relative links need the website base url.
            $base = $feedUrl->isRelativeUrl() ? $siteUrl->getBaseUrl() : '';

            $links[] = $feedUrl->getAbsoluteUrl($base);
        }
    }

    Logger::setMessage(get_called_class() . ': ' . implode(', ', $links));

    return $links;
}
/**
 * Convert the HTML to a normalized encoding and strip the head tags.
 *
 * The encoding reported by XmlParser::getEncodingFromMetaTag() takes
 * precedence; the encoding stored on the instance is used when it is empty.
 */
public function prepareHtml()
{
    $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);

    if ($html_encoding) {
        $source_encoding = $html_encoding;
    } else {
        $source_encoding = $this->encoding;
    }

    $this->html = Encoding::convert($this->html, $source_encoding);
    $this->html = Filter::stripHeadTags($this->html);

    Logger::setMessage(get_called_class() . ': HTTP Encoding "' . $this->encoding . '" ; HTML Encoding "' . $html_encoding . '"');
}
/**
 * Delete items flagged as removed that are no longer present in the feed.
 *
 * The two most recently updated removed items are always kept, and the
 * deletion is done in batches of 500 ids.
 *
 * @param mixed $feed_id       Feed identifier
 * @param array $items_in_feed Item ids currently present in the feed
 */
function cleanup($feed_id, array $items_in_feed)
{
    if (empty($items_in_feed)) {
        return;
    }

    $db = Database::getInstance('db');

    $removed_items = $db
        ->table('items')
        ->columns('id')
        ->notin('id', $items_in_feed)
        ->eq('status', 'removed')
        ->eq('feed_id', $feed_id)
        ->desc('updated')
        ->findAllByColumn('id');

    if (! is_array($removed_items)) {
        return;
    }

    // Keep a buffer of 2 items.
    // This is a workaround for buggy feeds (cache issue with some Wordpress plugins).
    $stale_ids = array_slice($removed_items, 2);

    if (empty($stale_ids)) {
        return;
    }

    Logger::setMessage('There is ' . count($stale_ids) . ' items to remove');

    // Sqlite has a default limit of 1000 sql variables; deleting by batch of
    // 500 ids avoids the error message "too many SQL variables" when there
    // is a huge number of items to remove.
    foreach (array_chunk($stale_ids, 500) as $batch) {
        $db
            ->table('items')
            ->in('id', $batch)
            ->eq('status', 'removed')
            ->eq('feed_id', $feed_id)
            ->remove();
    }
}
/**
 * Download a resource with a client configured from this instance.
 *
 * A ClientException raised during the request is logged and swallowed, so
 * the client is returned whether or not the download succeeded.
 *
 * @param string $url URL
 *
 * @return \PicoFeed\Client Client instance
 */
public function download($url)
{
    $log_prefix = get_called_class();

    $http = Client::getInstance();
    $http->setConfig($this->config);

    Logger::setMessage($log_prefix . ' Download => ' . $url);

    try {
        $http->execute($url);
    } catch (ClientException $failure) {
        Logger::setMessage($log_prefix . ' Download Failed => ' . $failure->getMessage());
    }

    return $http;
}
/**
 * Constructor.
 *
 * Falls back to a fresh Config instance when none is given, then applies
 * the configured timezone to the logger.
 *
 * @param \PicoFeed\Config\Config $config Config class instance
 */
public function __construct(Config $config = null)
{
    if (! $config) {
        $config = new Config();
    }

    $this->config = $config;

    Logger::setTimezone($this->config->getTimezone());
}