/**
  * Execute Item Processor
  *
  * When content filtering is enabled in the config, the item content is run
  * through the HTML filter (built from the item content and the feed site URL)
  * and the filtered result is stored back on the item. Otherwise the content is
  * left untouched and a log message is recorded.
  *
  * @access public
  * @param  Feed $feed Feed whose site URL is handed to the HTML filter
  * @param  Item $item Item whose content is filtered in place
  * @return bool       Always false
  */
 public function execute(Feed $feed, Item $item)
 {
     // NOTE(review): the `true` argument is presumably the default value used
     // when the option is not configured -- confirm against Config.
     if ($this->config->getContentFiltering(true)) {
         $filter = Filter::html($item->getContent(), $feed->getSiteUrl());
         $filter->setConfig($this->config);
         $item->setContent($filter->execute());
     } else {
         Logger::setMessage(get_called_class() . ': Content filtering disabled');
     }

     // The docblock promises a bool, but the method used to fall off the end and
     // yield null. Return an explicit false (still falsy) to honour the contract.
     return false;
 }
Пример #2
0
/**
 * Fetch a URL with the shared HTTP client in passthrough mode.
 *
 * The logger is enabled first when the 'debug_mode' setting is truthy, and
 * Config\write_debug() is called once the request has been executed.
 *
 * @param string $url URL handed to the client
 */
function download($url)
{
    $debug_enabled = (bool) Config\get('debug_mode');

    if ($debug_enabled) {
        Logger::enable();
    }

    $http = Client::getInstance();
    $http->setUserAgent(Config\HTTP_USER_AGENT);
    $http->enablePassthroughMode();
    $http->execute($url);

    Config\write_debug();
}
Пример #3
0
 /**
  * Load the first rule file that exists among the candidates.
  *
  * Every candidate is resolved as "<folder>/<name>.php". The first one found on
  * disk is included and the value produced by that include is returned.
  *
  * @param string $folder Rule directory
  * @param array  $files  List of possible file names (without the ".php" suffix)
  *
  * @return array Value returned by the included file, or an empty array when
  *               none of the candidates exists
  */
 public function loadRuleFile($folder, array $files)
 {
     // Local variable names are kept as-is on purpose: the included file runs in
     // this function's scope and can see them.
     foreach ($files as $file) {
         $filename = $folder . '/' . $file . '.php';

         if (!file_exists($filename)) {
             continue;
         }

         Logger::setMessage(get_called_class() . ' Load rule: ' . $file);

         return include $filename;
     }

     return array();
 }
Пример #4
0
 /**
  * Parse the OPML document held in $this->content.
  *
  * @return array|false The collected items ($this->items), or false when the
  *                     content is not parseable XML, its root element is not
  *                     "opml", or it has no body element
  */
 public function execute()
 {
     $class = get_called_class();
     Logger::setMessage($class . ': start importation');

     $document = XmlParser::getSimpleXml(trim($this->content));

     // Short-circuit evaluation keeps the method calls off a failed parse result.
     $is_opml = $document !== false
         && $document->getName() === 'opml'
         && isset($document->body);

     if (!$is_opml) {
         Logger::setMessage($class . ': OPML tag not found or malformed XML document');

         return false;
     }

     $this->parseEntries($document->body);
     Logger::setMessage($class . ': ' . count($this->items) . ' subscriptions found');

     return $this->items;
 }
Пример #5
0
 /**
  * Build the stream context options for the HTTP request.
  *
  * The options always carry the method, protocol version, timeout, redirect
  * limit and request headers. Proxy settings are added only when a proxy
  * hostname is configured.
  *
  * @access private
  * @return array Options array with a single 'http' key
  */
 private function prepareContext()
 {
     $http = array(
         'method' => 'GET',
         'protocol_version' => 1.1,
         'timeout' => $this->timeout,
         'max_redirects' => $this->max_redirects,
     );

     if ($this->proxy_hostname) {
         $proxy = $this->proxy_hostname . ':' . $this->proxy_port;
         Logger::setMessage(get_called_class() . ' Proxy: ' . $proxy);

         $http['proxy'] = 'tcp://' . $proxy;
         $http['request_fulluri'] = true;

         // Only the presence of credentials is logged, never their value.
         $credentials = $this->proxy_username ? ' Proxy credentials: Yes' : ' Proxy credentials: No';
         Logger::setMessage(get_called_class() . $credentials);
     }

     $http['header'] = implode("\r\n", $this->prepareHeaders());

     return array('http' => $http);
 }
Пример #6
0
 /**
  * Parse HTTP headers.
  *
  * Lines starting with "HTTP/1" are treated as status lines: the status code is
  * read from them and the headers collected so far are discarded, so only the
  * headers following the last status line are kept. Lines containing ": " are
  * treated as "Name: value" headers. Any other line is ignored.
  *
  * @static
  *
  * @param array $lines List of raw header lines
  *
  * @return array Two elements: the integer status code (0 when no status line
  *               was seen) and an instance of this class built from the headers
  */
 public static function parse(array $lines)
 {
     $status = 0;
     $headers = array();

     foreach ($lines as $line) {
         if (strpos($line, 'HTTP/1') === 0) {
             // New status line: start over with an empty header set.
             $headers = array();
             // "HTTP/1.x " is 9 characters long, the 3-digit code follows.
             $status = (int) substr($line, 9, 3);
         } elseif (strpos($line, ': ') !== false) {
             // Split on the first separator only, so that values which contain
             // ": " themselves are not truncated.
             list($name, $value) = explode(': ', $line, 2);

             // Skip empty values, but keep legitimate falsy ones such as "0"
             // (e.g. "Content-Length: 0").
             if ($value !== '') {
                 $headers[trim($name)] = trim($value);
             }
         }
     }

     Logger::setMessage(get_called_class() . ' HTTP status code: ' . $status);

     foreach ($headers as $name => $value) {
         Logger::setMessage(get_called_class() . ' HTTP header: ' . $name . ' => ' . $value);
     }

     return array($status, new self($headers));
 }
Пример #7
0
 /**
  * Parse the document.
  *
  * Parses $this->content as XML (retrying once after normalizing the data),
  * fills a Feed with the feed-level fields, then builds one Item per entry of
  * the items tree, runs the item post-processor on it and appends it to the feed.
  *
  * @return \PicoFeed\Parser\Feed
  *
  * @throws MalformedXmlException When the content cannot be parsed even after normalization
  */
 public function execute()
 {
     Logger::setMessage(get_called_class() . ': begin parsing');
     $xml = XmlParser::getSimpleXml($this->content);
     if ($xml === false) {
         // First attempt failed: normalize the raw data and try exactly once more
         Logger::setMessage(get_called_class() . ': Applying XML workarounds');
         $this->content = Filter::normalizeData($this->content);
         $xml = XmlParser::getSimpleXml($this->content);
         if ($xml === false) {
             Logger::setMessage(get_called_class() . ': XML parsing error');
             Logger::setMessage(XmlParser::getErrors());
             throw new MalformedXmlException('XML parsing error');
         }
     }
     // Remember the namespaces used by the document; they are copied onto every item below
     $this->used_namespaces = $xml->getNamespaces(true);
     $xml = $this->registerSupportedNamespaces($xml);
     // Feed-level fields: each find*/check* call receives the Feed and fills it in
     $feed = new Feed();
     $this->findFeedUrl($xml, $feed);
     $this->checkFeedUrl($feed);
     $this->findSiteUrl($xml, $feed);
     $this->checkSiteUrl($feed);
     $this->findFeedTitle($xml, $feed);
     $this->findFeedDescription($xml, $feed);
     $this->findFeedLanguage($xml, $feed);
     $this->findFeedId($xml, $feed);
     $this->findFeedDate($xml, $feed);
     $this->findFeedLogo($xml, $feed);
     $this->findFeedIcon($xml, $feed);
     // Item-level fields: one Item per entry returned by getItemsTree()
     foreach ($this->getItemsTree($xml) as $entry) {
         $entry = $this->registerSupportedNamespaces($entry);
         $item = new Item();
         $item->xml = $entry;
         $item->namespaces = $this->used_namespaces;
         // The author lookup is the only item call that also receives the whole document
         $this->findItemAuthor($xml, $entry, $item);
         $this->findItemUrl($entry, $item);
         $this->checkItemUrl($feed, $item);
         $this->findItemTitle($entry, $item);
         $this->findItemContent($entry, $item);
         // Id generation can use the item url/title/content (order is important)
         $this->findItemId($entry, $item, $feed);
         $this->findItemDate($entry, $item, $feed);
         $this->findItemEnclosure($entry, $item, $feed);
         $this->findItemLanguage($entry, $item, $feed);
         // Post-processing runs on the fully populated item, before it is attached to the feed
         $this->itemPostProcessor->execute($feed, $item);
         $feed->items[] = $item;
     }
     // The feed is concatenated into the log message, i.e. converted to a string
     Logger::setMessage(get_called_class() . PHP_EOL . $feed);
     return $feed;
 }
Пример #8
0
 /**
  * Execute curl context.
  *
  * Runs the prepared cURL handle, logs the transfer statistics, hands any cURL
  * error number to handleError() and finally stores the effective URL in
  * $this->url.
  *
  * Everything that is needed from the handle is read before it is closed, and
  * the handle is closed exactly once. The handle is therefore never touched
  * after curl_close(), even if handleError() returns instead of throwing.
  */
 private function executeContext()
 {
     $ch = $this->prepareContext();
     curl_exec($ch);

     // Read once: used both for the log line and for the url update below.
     $effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);

     Logger::setMessage(get_called_class() . ' cURL total time: ' . curl_getinfo($ch, CURLINFO_TOTAL_TIME));
     Logger::setMessage(get_called_class() . ' cURL dns lookup time: ' . curl_getinfo($ch, CURLINFO_NAMELOOKUP_TIME));
     Logger::setMessage(get_called_class() . ' cURL connect time: ' . curl_getinfo($ch, CURLINFO_CONNECT_TIME));
     Logger::setMessage(get_called_class() . ' cURL speed download: ' . curl_getinfo($ch, CURLINFO_SPEED_DOWNLOAD));
     Logger::setMessage(get_called_class() . ' cURL effective url: ' . $effective_url);

     $curl_errno = curl_errno($ch);
     $curl_error = $curl_errno ? curl_error($ch) : '';

     // Nothing else is read from the handle past this point.
     curl_close($ch);

     if ($curl_errno) {
         Logger::setMessage(get_called_class() . ' cURL error: ' . $curl_error);
         // NOTE(review): handleError() presumably throws -- confirm. If it ever
         // returns, execution continues safely with the values captured above.
         $this->handleError($curl_errno);
     }

     // Update the url if there were redirects
     $this->url = $effective_url;
 }
Пример #9
0
 /**
  * Update the modification state from a client response.
  *
  * A 304 status marks the resource as not modified. A 200 status recomputes the
  * flag from the previously stored ETag / Last-Modified values and then stores
  * the values found in the new response. Any other status leaves the state
  * untouched.
  *
  * @param array $response Client response (its 'status' entry is inspected)
  */
 public function handleNotModifiedResponse(array $response)
 {
     // switch compares loosely, exactly like the == checks it stands for
     switch ($response['status']) {
         case 304:
             $this->is_modified = false;
             break;

         case 200:
             // Compare against the old validators before overwriting them
             $this->is_modified = $this->hasBeenModified($response, $this->etag, $this->last_modified);
             $this->etag = $this->getHeader($response, 'ETag');
             $this->last_modified = $this->getHeader($response, 'Last-Modified');
             break;
     }

     if (false === $this->is_modified) {
         Logger::setMessage(get_called_class() . ' Resource not modified');
     }
 }
Пример #10
0
 /**
  * Tell whether a node may be removed from the document.
  *
  * A node is protected (false is returned) when its text makes up at least 90%
  * of the text of the whole document, measured in bytes. An empty document
  * never protects a node.
  *
  * @param DomDocument $dom
  * @param DomNode     $node
  *
  * @return bool False if the node should not be removed, true otherwise
  */
 public function shouldRemove(DomDocument $dom, $node)
 {
     $total = strlen($dom->textContent);
     $part = strlen($node->textContent);

     // Nothing to compare against; this also avoids a division by zero below
     if (0 === $total) {
         return true;
     }

     $ratio = $part * 100 / $total;

     if ($ratio < 90) {
         return true;
     }

     Logger::setMessage(get_called_class() . ': Should not remove this node (' . $node->nodeName . ') ratio: ' . $ratio . '%');

     return false;
 }
Пример #11
0
/**
 * Dump the logger messages to the debug file when debug mode is on.
 *
 * The messages are joined with PHP_EOL and written to DEBUG_FILENAME.
 * Nothing happens when the 'debug_mode' setting is falsy.
 */
function write_debug()
{
    if (!(bool) get('debug_mode')) {
        return;
    }

    $messages = Logger::getMessages();
    file_put_contents(DEBUG_FILENAME, implode(PHP_EOL, $messages));
}
Пример #12
0
 /**
  * Find feed urls inside a HTML document
  *
  * Collects the href of every <link> element typed as RSS, then of every one
  * typed as Atom. Relative hrefs are made absolute using the base URL of the
  * website.
  *
  * @access public
  * @param  string    $url        Website url
  * @param  string    $html       HTML content
  * @return array                 List of feed links
  */
 public function find($url, $html)
 {
     Logger::setMessage(get_called_class() . ': Try to discover subscriptions');

     $xpath = new DOMXPath(XmlParser::getHtmlDocument($html));
     $links = array();

     foreach (array('//link[@type="application/rss+xml"]', '//link[@type="application/atom+xml"]') as $query) {
         foreach ($xpath->query($query) as $node) {
             $href = $node->getAttribute('href');

             // Links without a usable href are skipped
             if (empty($href)) {
                 continue;
             }

             $feedUrl = new Url($href);
             $siteUrl = new Url($url);

             // Only relative feed links need the website base URL
             $base = $feedUrl->isRelativeUrl() ? $siteUrl->getBaseUrl() : '';
             $links[] = $feedUrl->getAbsoluteUrl($base);
         }
     }

     Logger::setMessage(get_called_class() . ': ' . implode(', ', $links));

     return $links;
 }
Пример #13
0
 /**
  * Normalize encoding and strip head tag.
  *
  * The encoding declared in the HTML meta tag takes precedence. $this->encoding
  * (logged as the HTTP encoding) is used only when the meta tag yields nothing.
  */
 public function prepareHtml()
 {
     $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);

     if ($html_encoding) {
         $source_encoding = $html_encoding;
     } else {
         $source_encoding = $this->encoding;
     }

     $this->html = Encoding::convert($this->html, $source_encoding);
     $this->html = Filter::stripHeadTags($this->html);

     Logger::setMessage(get_called_class() . ': HTTP Encoding "' . $this->encoding . '" ; HTML Encoding "' . $html_encoding . '"');
 }
Пример #14
0
/**
 * Delete the removed items of a feed that are no longer present in that feed.
 *
 * Candidates are the items of the feed with status 'removed' whose id is not in
 * $items_in_feed. The first two candidates of the result set (sorted on
 * 'updated' via desc()) are always kept. Nothing is done when $items_in_feed
 * is empty.
 *
 * @param mixed $feed_id       Feed identifier
 * @param array $items_in_feed Ids of the items currently present in the feed
 */
function cleanup($feed_id, array $items_in_feed)
{
    if (empty($items_in_feed)) {
        return;
    }

    $db = Database::getInstance('db');

    $removed_items = $db->table('items')
        ->columns('id')
        ->notin('id', $items_in_feed)
        ->eq('status', 'removed')
        ->eq('feed_id', $feed_id)
        ->desc('updated')
        ->findAllByColumn('id');

    if (!is_array($removed_items)) {
        return;
    }

    // Keep a buffer of 2 items
    // It's a workaround for buggy feeds (cache issue with some Wordpress plugins)
    $items_to_remove = array_slice($removed_items, 2);

    if (empty($items_to_remove)) {
        return;
    }

    Logger::setMessage('There is ' . count($items_to_remove) . ' items to remove');

    // Handle the case when there is a huge number of items to remove
    // Sqlite has a limit of 1000 sql variables by default
    // Avoid the error message "too many SQL variables"
    // Old items are removed by batches of 500 items
    foreach (array_chunk($items_to_remove, 500) as $chunk) {
        $db->table('items')
            ->in('id', $chunk)
            ->eq('status', 'removed')
            ->eq('feed_id', $feed_id)
            ->remove();
    }
}
Пример #15
0
 /**
  * Download and check if a resource exists.
  *
  * A ClientException raised by the request is logged and swallowed, so the
  * client instance is returned in every case.
  *
  * @param string $url URL
  *
  * @return \PicoFeed\Client Client instance
  */
 public function download($url)
 {
     $http = Client::getInstance();
     $http->setConfig($this->config);

     Logger::setMessage(get_called_class() . ' Download => ' . $url);

     try {
         $http->execute($url);
     } catch (ClientException $error) {
         // Deliberately not rethrown: the failure is only recorded in the log
         Logger::setMessage(get_called_class() . ' Download Failed => ' . $error->getMessage());
     }

     return $http;
 }
Пример #16
0
 /**
  * Constructor.
  *
  * Falls back to a fresh Config when none is given, then passes the configured
  * timezone to the logger.
  *
  * @param \PicoFeed\Config\Config   $config   Config class instance (optional)
  */
 public function __construct(Config $config = null)
 {
     if (!$config) {
         $config = new Config();
     }

     $this->config = $config;
     Logger::setTimezone($this->config->getTimezone());
 }