Example #1
0
/**
 * Fetch one feed, refresh its stored metadata, store any items not yet in
 * the database and, when the admin 'purge' preference is set, delete old
 * untagged items.
 *
 * @param mixed $id Id of the feed to update.
 *
 * @return mixed 0 when $id is empty; otherwise array(new item count, message),
 *               where message is an HTML error string on failure and "" on
 *               success.
 */
function fof_update_feed($id)
{
    global $fof_item_prefilters, $FOF_ITEM_TABLE, $FOF_ITEM_TAG_TABLE;

    if (!$id) {
        // NOTE(review): every other exit returns an array; the bare 0 is kept
        // so existing callers see the same value as before.
        return 0;
    }

    $feed = fof_db_get_feed_by_id($id);
    if (empty($feed)) {
        // Without a feed row there is no URL to fetch and no feed_id to store under.
        fof_log("feed update failed: no such feed", "update");
        return array(0, "Error: <b>no such feed</b>");
    }

    $url = $feed['feed_url'];
    fof_log("Updating {$url}");
    fof_db_feed_mark_attempted_cache($id);

    $rss = fof_parse($url);
    $error = $rss->error();
    if ($error) {
        fof_log("feed update failed: " . $error, "update");
        // The parser error and the feed URL are not trusted: escape both
        // before embedding them in the HTML message.
        $safe_error = htmlspecialchars($error, ENT_QUOTES, 'UTF-8');
        $validator_url = "http://feedvalidator.org/check?url=" . urlencode($url);
        return array(0, "Error: <b>" . $safe_error . "</b> <a href=\"{$validator_url}\">try to validate it?</a>");
    }

    // Prefer the feed's own rel="self" link as the subscription URL.
    $sub = html_entity_decode($rss->subscribe_url(), ENT_QUOTES);
    $self_link = $rss->get_link(0, 'self');
    if ($self_link) {
        $sub = html_entity_decode($self_link, ENT_QUOTES);
    }
    fof_log("subscription url is {$sub}");

    // Re-fetch the favicon once the cached one is more than a week old.
    $image = $feed['feed_image'];
    $image_cache_date = $feed['feed_image_cache_date'];
    if ($image_cache_date < time() - 7 * 24 * 60 * 60) {
        $image = $rss->get_favicon();
        $image_cache_date = time();
    }

    $feed_title = $rss->get_title();
    if ($feed_title == "") {
        $feed_title = "[no title]";
    }
    fof_db_feed_update_metadata($id, $sub, $feed_title, $rss->get_link(), $rss->get_description(), $image, $image_cache_date);

    $feed_id = $feed['feed_id'];
    $n = 0;
    // Database ids of every item currently in the feed; initialised so the
    // purge step can count() it even when the feed is empty.
    $ids = array();
    $ndelete = 0;
    $prefilters = is_array($fof_item_prefilters) ? $fof_item_prefilters : array();

    $items = $rss->get_items();
    if ($items) {
        foreach ($items as $item) {
            $link = $item->get_permalink();
            $title = $item->get_title();
            $content = $item->get_content();
            $date = $item->get_date('U');
            if (!$date) {
                $date = time();
            }
            // Fall back to the permalink when the item carries no id.
            $item_id = $item->get_id();
            if (!$item_id) {
                $item_id = $link;
            }

            // A separate variable keeps the $id parameter intact.
            $item_db_id = fof_db_find_item($feed_id, $item_id);
            if ($item_db_id == NULL) {
                $n++;
                foreach ($prefilters as $filter) {
                    list($link, $title, $content) = $filter($item, $link, $title, $content);
                }
                $item_db_id = fof_db_add_item($feed_id, $item_id, $link, $title, $content, time(), $date, $date);
                fof_apply_tags($feed_id, $item_db_id);
                // An earlier attempt to detect items republished through
                // "planet" style feeds (matching the Atom 'source' self link
                // against subscribed feeds) was abandoned because too many
                // duplicates still got through; every new item is therefore
                // simply marked unread. Using the Atom GUID as a truly global
                // id might be a better approach.
                fof_mark_item_unread($feed_id, $item_db_id);
                fof_apply_plugin_tags($feed_id, $item_db_id, NULL);
            }
            $ids[] = $item_db_id;
        }
    }

    // Optionally purge old items: if 'purge' is set we delete items that have
    // no tags, are not currently in the feed or within sizeof(feed) items of
    // being in the feed, and were cached more than 'purge' days ago.
    $p = FoF_Prefs::instance();
    $admin_prefs = $p->admin_prefs;
    $purge = isset($admin_prefs['purge']) ? $admin_prefs['purge'] : "";
    if ($purge != "") {
        fof_log('purge is ' . $purge);
        $count = count($ids);
        fof_log('items in feed: ' . $count);
        if ($count != 0) {
            // Cast everything interpolated into SQL to int.
            $in = implode(", ", array_map('intval', $ids));
            $feed_id_sql = (int) $feed_id;
            // The "limit {$count}, ..." offset skips the newest $count rows
            // that are no longer in the feed.
            $sql = "select item_id, item_cached from {$FOF_ITEM_TABLE} where feed_id = {$feed_id_sql} and item_id not in ({$in}) order by item_cached desc limit {$count}, 1000000000";
            $result = fof_db_query($sql);

            $cutoff = time() - $purge * 24 * 60 * 60;
            $delete = array();
            while ($row = fof_db_get_row($result)) {
                if ($row['item_cached'] < $cutoff && !fof_item_has_tags($row['item_id'])) {
                    $delete[] = (int) $row['item_id'];
                }
            }

            $ndelete = count($delete);
            if ($ndelete != 0) {
                $in = implode(", ", $delete);
                fof_db_query("delete from {$FOF_ITEM_TABLE} where item_id in ({$in})");
                fof_db_query("delete from {$FOF_ITEM_TAG_TABLE} where item_id in ({$in})");
            }
        }
    }

    unset($rss);
    fof_db_feed_mark_cached($feed_id);

    $log = "feed update complete, {$n} new items, {$ndelete} items purged";
    if ($purge == "") {
        $log .= " (purging disabled)";
    }
    fof_log($log, "update");

    return array($n, "");
}
Example #2
0
/**
 * Fetch one feed, refresh its stored metadata and image, store or touch its
 * items, optionally schedule the next update from the observed item cadence,
 * and delete items selected by the 'purge' and 'match_similarity' admin
 * preferences.
 *
 * Admin preferences and the parsed title blacklist are read once and kept in
 * static variables for subsequent calls.
 *
 * @param mixed $id Id of the feed to update.
 *
 * @return array array(new item count, message); message is an HTML error
 *               string on failure and "" otherwise.
 */
function fof_update_feed($id)
{
    global $fof_item_prefilters;
    static $blacklist = null;
    static $admin_prefs = null;

    fof_log("fof_update_feed({$id})");

    if ($blacklist === null) {
        $p = FoF_Prefs::instance();
        $admin_prefs = $p->admin_prefs;
        // One blacklisted term per line. preg_split() needs a string subject,
        // so a missing preference becomes '' rather than NULL.
        $blacklist_text = isset($admin_prefs['blacklist']) ? (string) $admin_prefs['blacklist'] : '';
        $blacklist = preg_split('/(\\r\\n|\\r|\\n)/', $blacklist_text, -1, PREG_SPLIT_NO_EMPTY);
        if ($blacklist === false) {
            $blacklist = array();
        }
    }

    if (empty($id)) {
        fof_log("Empty feed ID", "update");
        return array(0, '');
    }

    $feed = fof_db_get_feed_by_id($id);
    if (empty($feed)) {
        fof_log("no such feed '{$id}'", "update");
        // The returned message is HTML, so the id is escaped.
        return array(0, "Error: <b>no such feed '" . htmlspecialchars((string) $id, ENT_QUOTES, 'UTF-8') . "'</b>");
    }

    fof_log("updating feed_id:{$id} url:'" . $feed['feed_url'] . "'", "update");
    fof_db_feed_mark_attempted_cache($id);

    $rss_error = '';
    if (($rss = fof_parse($feed['feed_url'])) === false || $rss->error()) {
        if ($rss !== false) {
            $rss_error = $rss->error();
        }
        if (empty($rss_error)) {
            $rss_error = 'unknown error';
        }
        fof_db_feed_update_attempt_status($id, $rss_error);
        fof_log("feed update failed feed_id:{$id} url:'" . $feed['feed_url'] . "': " . $rss_error, "update");
        // Feed URL and parser error are not trusted: escape them for HTML.
        $safe_url = htmlspecialchars((string) $feed['feed_url'], ENT_QUOTES, 'UTF-8');
        $safe_error = htmlspecialchars((string) $rss_error, ENT_QUOTES, 'UTF-8');
        return array(0, "Error: <b>failed to parse feed '" . $safe_url . "'</b>: {$safe_error}");
    }
    fof_db_feed_update_attempt_status($id, NULL);

    $feed_image = $feed['feed_image'];
    $feed_image_cache_date = $feed['feed_image_cache_date'];
    /* periodically update the feed's image */
    /* or if cached image file does not exist */
    if (($feed['feed_image_cache_date'] + FEED_IMAGE_CACHE_REFRESH_SECS < time())
        || (!empty($feed_image) && !file_exists($feed_image))) {
        /*
            Feed images tend to be larger and less-square than favicons, but
            are more likely to be directly related to the feed, so are being
            given the first chance at representing the feed.
            Perhaps the prioritization should be configurable by preference,
            or check the dimensions and prefer a favicon if feedimage is over
            some size?
        */
        $feed_image_url = $rss->get_image_url();
        if (!empty($feed_image_url) && ($new_feed_image = fof_cache_image_url($feed_image_url)) !== false) {
            /* Use the image specified by the feed, if we can get it. */
            $feed_image = $new_feed_image;
            $feed_image_cache_date = time();
        } else {
            if (($new_feed_image = fof_get_favicon($feed['feed_link'])) !== false) {
                /* Use the feed site's favicon, if we can. */
                $feed_image = $new_feed_image;
                $feed_image_cache_date = time();
            }
        }
    }

    $feed_title = $rss->get_title();
    $feed_link = $rss->get_link();
    $feed_description = $rss->get_description();
    /* set the feed's current information */
    fof_db_feed_update_metadata($id, $feed_title, $feed_link, $feed_description, $feed_image, $feed_image_cache_date);

    $feed_id = $feed['feed_id'];
    $n = 0;
    // Set up the dynamic updates here, so we can include would-be-purged items
    $purgedUpdTimes = array();
    $count_Added = 0;
    $items_in_feed = 0;
    $prefilters = is_array($fof_item_prefilters) ? $fof_item_prefilters : array();

    $items = $rss->get_items();
    if ($items) {
        foreach ($items as $item) {
            $items_in_feed++;

            $title = $item->get_title();
            // Skip items whose title contains a blacklisted term (case-insensitive).
            foreach ($blacklist as $bl) {
                if (stripos((string) $title, $bl) !== false) {
                    fof_log($feed_id . ": Item title \"{$title}\" contained blacklisted term \"{$bl}\"", 'update');
                    continue 2;
                }
            }
            if (empty($title)) {
                fof_log($feed_id . ': Item had no title', 'update');
                $title = '[no title]';
            }

            $link = $item->get_permalink();
            if (empty($link)) {
                // Some feeds don't furnish an item link...
                fof_log($feed_id . ': Item had no link; synthesizing', 'update');
                $link = $feed['feed_link'];
            }

            $content = $item->get_content();
            if (!$content) {
                fof_log($feed_id . ': Item has no content', 'update');
                $content = '';
            }

            $authors = $item->get_authors();
            $author = '';
            if (!empty($authors) && is_array($authors)) {
                foreach ($authors as $aobj) {
                    $author .= " " . $aobj->get_name() . " " . $aobj->get_email();
                }
            }

            // don't fetch entries older than the purge limit
            $date = $item->get_date('U');
            if (!$date || $date > time()) {
                // Item either didn't come with a date or it was nonsensical (to be fair, RFC822 is terrible), so use the current time instead
                $date = time();
                fof_log($feed_id . ": item {$link} had no date; synthesizing", 'update');
            } elseif (!empty($admin_prefs['purge']) && $date <= time() - $admin_prefs['purge'] * 24 * 3600) {
                fof_log($feed_id . ": item {$link} is older than cutoff", 'update');
                // Remember its time so it still counts toward the cadence estimate.
                $purgedUpdTimes[] = $date;
                continue;
            }

            foreach ($prefilters as $filter) {
                list($link, $title, $content) = $filter($item, $link, $title, $content);
            }

            /* check if item already known */
            $item_id = $item->get_id();
            // A separate variable keeps the $id parameter intact.
            $item_db_id = fof_db_find_item($feed_id, $item_id);
            if ($item_db_id == NULL) {
                $n++;
                $item_db_id = fof_db_add_item($feed_id, $item_id, $link, $title, $content, time(), $date, $date, $author);
                fof_apply_tags($feed_id, $item_db_id);
                $count_Added++;
                fof_mark_item_unread($feed_id, $item_db_id);
                fof_apply_plugin_tags($feed_id, $item_db_id, NULL);
            } else {
                fof_db_update_item($feed_id, $item_id, $link, time(), $author);
            }
        }
    }
    unset($rss);

    if (!empty($admin_prefs['dynupdates'])) {
        // Determine the average time between items, to determine the next update time
        $count = 0;
        $lastTime = 0;
        $totalDelta = 0.0;
        $totalDeltaSquare = 0.0;

        // Accumulate the times for the pre-purged items
        sort($purgedUpdTimes, SORT_NUMERIC);
        foreach ($purgedUpdTimes as $time) {
            if ($count > 0) {
                $delta = $time - $lastTime;
                $totalDelta += $delta;
                $totalDeltaSquare += $delta * $delta;
            }
            $lastTime = $time;
            $count++;
        }

        // Accumulate the times for the stored items
        // NOTE(review): presumably returned oldest-first; confirm against
        // fof_db_items_updated_list().
        $result = fof_db_items_updated_list($feed_id);
        while ($row = fof_db_get_row($result)) {
            if ($count > 0) {
                $delta = (double) ($row['item_updated'] - $lastTime);
                $totalDelta += $delta;
                $totalDeltaSquare += $delta * $delta;
            }
            $count++;
            $lastTime = $row['item_updated'];
        }

        // If there were no new items, use the time since the last one to grow the window
        if (!$count_Added) {
            $delta = time() - $lastTime;
            if ($delta > 0 && $count > 0) {
                $totalDelta += $delta;
                $totalDeltaSquare += $delta * $delta;
                $count++;
            }
        }

        $mean = 0;
        $stdev = 0;
        if ($count > 0) {
            $mean = $totalDelta / $count;
            if ($count > 1) {
                $stdev = sqrt(($count * $totalDeltaSquare - $totalDelta * $totalDelta) / ($count * ($count - 1)));
            }
        } else {
            // We don't have any items to go on, so let's just say it's a day
            $mean = 86400;
        }

        $now = time();
        $lastInterval = $now - $lastTime;

        // This algorithm is rife with fiddling, and I really need to generate metrics to test the efficacy
        // Expect the next item one mean interval after the last one, but
        // never schedule before now.
        $nextTime = max($lastTime + $mean, $now);
        if ($count_Added > 1) {
            // We missed an update, so make the interval shorter by how much we missed it by
            $nextTime -= $lastInterval;
        }
        // fudge factor
        $nextTime += $stdev / ($count_Added + 1);
        // Always check at least twice a day
        $nextTime = min($nextTime, $now + 86400 / 2);

        fof_log($feed['feed_title'] . ": Next feed update in " . ($nextTime - $now) . " seconds;" . " count={$count} t={$totalDelta} t2={$totalDeltaSquare}" . " mean={$mean} stdev={$stdev}");
        if ($count_Added > 0) {
            // In a perfect world, we want both of these numbers to be low
            fof_log("DYNUPDATE_ADD {$feed_id} count {$count_Added} overstep {$lastInterval}");
        } else {
            fof_log("DYNUPDATE_NONE {$feed_id} since {$lastInterval}");
        }
        fof_db_feed_cache_set($feed_id, (int) round($nextTime));
    }

    $delete = array();
    /*  If 'purge' preference is set, we delete any items that are not tagged
        by anything other than 'folded', are older than 'purge' days, and are
        not one of the most recent 'purge_grace' items in the feed.

        FIXME: behavior question: should auto-tagged feeds purge items with
        their auto-tags set?
         */
    if (!empty($admin_prefs['purge'])) {
        /*  Always keep at least as many items as feed provides, or as set by
            preferences.
             */
        $grace = $items_in_feed;
        if (!empty($admin_prefs['purge_grace'])) {
            $grace = max($grace, $admin_prefs['purge_grace']);
        }
        /* It's okay to purge 'folded' items. */
        $ignore_tags = array('folded');
        fof_log('purge is ' . $admin_prefs['purge']);
        $result = fof_db_items_purge_list($feed_id, $admin_prefs['purge'], $grace, $ignore_tags);
        // NOTE(review): this loop stops only on a strict false, while the
        // other result loops stop on any falsy row; confirm what
        // fof_db_get_row() returns at the end of a result set.
        while (($row = fof_db_get_row($result)) !== false) {
            $delete[] = $row['item_id'];
        }
    }

    /*  If 'match_similarity' preference is set, we delete any items with
        matching titles and similar content.
         */
    if (!empty($admin_prefs['match_similarity'])) {
        $threshold = $admin_prefs['match_similarity'];
        $result = fof_db_items_duplicate_list();
        while ($row = fof_db_get_row($result)) {
            $similarity = 0;
            // similar_text() writes the similarity percentage into $similarity.
            similar_text($row['c1'], $row['c2'], $similarity);
            if ($similarity > $threshold) {
                $delete[] = $row['item_id'];
            }
        }
    }

    fof_db_items_delete($delete);
    fof_db_feed_mark_cached($feed_id);

    $log = "feed update complete, {$n} new items, " . count($delete) . " items purged";
    if (empty($admin_prefs['purge'])) {
        $log .= " (purging disabled)";
    }
    fof_log($log, "update");

    return array($n, "");
}