Пример #1
0
 /**
  * Fetch and save the instance user's friends.
  *
  * Flow, as implemented below:
  *  - If $this->user is not set, load the instance owner through fetchUser().
  *  - Store the FollowDAO friend total for the 'instagram' network on the instance and log it
  *    next to the owner's friend_count.
  *  - Flag the friend archive as not loaded only when fewer friends are stored than
  *    friend_count reports; otherwise flag it as loaded.
  *  - When the archive is flagged as not loaded, call pageThroughFriends() with the instance's
  *    follows_next_cursor.
  *  - When the archive is flagged as loaded, call updateStaleFollows(true). The flag is re-read
  *    after paging, so both steps run in one call if pageThroughFriends() changes the flag
  *    (its body is not visible here).
  */
 public function fetchFriends()
 {
     if (!isset($this->user)) {
         // No owner cached on the crawler yet: fetch it (final argument requests a forced refresh).
         $this->user = self::fetchUser(
             $this->instance->network_user_id,
             'Owner info',
             $this->instance->network_username,
             null,
             null,
             true
         );
     }

     $this->instance->total_friends_in_system = DAOFactory::getDAO('FollowDAO')
         ->countTotalFriends($this->instance->network_user_id, 'instagram');
     $this->logger->logUserInfo(
         $this->instance->total_friends_in_system . " friends in system, "
         . $this->user->friend_count . " friends according to Instagram",
         __METHOD__ . ',' . __LINE__
     );

     // Only a shortfall in stored friends marks the archive as incomplete; an equal or larger
     // stored total counts as loaded.
     $this->instance->is_archive_loaded_friends =
         !($this->instance->total_friends_in_system < $this->user->friend_count);

     // Archive incomplete: continue paging from the saved cursor.
     if (!$this->instance->is_archive_loaded_friends) {
         $this->logger->logInfo("Friend archive  is not loaded, start paging", __METHOD__ . ',' . __LINE__);
         $this->pageThroughFriends($this->instance->follows_next_cursor);
     }

     // Deliberately a second, independent check rather than an else branch.
     if ($this->instance->is_archive_loaded_friends) {
         $this->logger->logInfo(
             "Friend archive loaded, start updating stale friendships",
             __METHOD__ . ',' . __LINE__
         );
         $this->updateStaleFollows(true);
     }
 }
Пример #2
0
    /**
     * Fetch and save the posts and replies on a Facebook page.
     *
     * Requests /{pid}/posts from the Graph API, parses the stream with parseStream(), stores every
     * parsed post through the PostDAO and updates every parsed user through the UserDAO. When a post
     * has an empty author_username but an author_user_id, the author is looked up with
     * fetchUserInfo() to fill in the name and avatar before the post is stored.
     *
     * @param int $pid Page ID
     */
    public function fetchPagePostsAndReplies($pid) {
        $stream = FacebookGraphAPIAccessor::apiRequest('/'.$pid.'/posts', $this->access_token);

        // The element count must be compared with 0; the comparison used to sit inside sizeof(),
        // which handed sizeof() a boolean instead of the array.
        if (isset($stream->data) && is_array($stream->data) && sizeof($stream->data) > 0) {
            $this->logger->logSuccess(sizeof($stream->data)." Facebook posts found for page ID $pid.",
            __METHOD__.','.__LINE__);

            $thinkup_data = $this->parseStream($stream, 'facebook page');
            $posts = $thinkup_data["posts"];

            $post_dao = DAOFactory::getDAO('PostDAO');
            $added_posts = 0;
            foreach ($posts as $post) {
                // Posts without an author name: fill in author details from a user lookup.
                if ($post['author_username']== "" && isset($post['author_user_id'])) {
                    $commenter_object = $this->fetchUserInfo($post['author_user_id'], 'facebook',
                    'Facebook page comments');
                    if (isset($commenter_object)) {
                        $post["author_username"] = $commenter_object->full_name;
                        $post["author_fullname"] = $commenter_object->full_name;
                        $post["author_avatar"] = $commenter_object->avatar;
                    }
                }

                // addPost()'s return value is summed into the running total of added posts.
                $added_posts = $added_posts + $post_dao->addPost($post);
                $this->logger->logInfo("Added post ID ".$post["post_id"]." on ".$post["network"].
                " for ".$post["author_username"].":".$post["post_text"], __METHOD__.','.__LINE__);
            }

            $added_users = 0;
            $users = $thinkup_data["users"];
            if (count($users) > 0) {
                // One DAO instance is enough for the whole batch of users.
                $user_dao = DAOFactory::getDAO('UserDAO');
                foreach ($users as $user) {
                    $user["post_count"] = $post_dao->getTotalPostsByUser($user['user_id'], $user['network']);
                    $found_in = 'Facebook page stream';
                    $user_object = new User($user, $found_in);
                    $user_dao->updateUser($user_object);
                    $added_users = $added_users + 1;
                }
            }
            if ($added_posts > 0 || $added_users > 0) {
                $this->logger->logUserSuccess($added_posts." post(s) added; ".$added_users." user(s) updated.",
                __METHOD__.','.__LINE__);
            } else {
                $this->logger->logUserInfo("No new page posts found.", __METHOD__.','.__LINE__);
            }
        } else {
            $this->logger->logInfo("No Facebook posts found for page ID $pid", __METHOD__.','.__LINE__);
        }
    }
Пример #3
0
 /**
  * Fetch the instance user's favorites that are newer than the last favorite stored.
  *
  * Starting from the instance's last_favorite_id, getFavorites() is called repeatedly. Each
  * returned tweet is tagged with the 'twitter' network and handed to the FavoritePostDAO's
  * addFavorite(); for newly added favorites the post text is passed to URLProcessor. The highest
  * post ID seen so far is the argument of the next getFavorites() call. The loop ends as soon as
  * a request does not come back with HTTP status 200 or returns no tweets.
  *
  * NOTE(review): the highest ID is tracked in a local variable only; this method does not write
  * it back to $this->instance.
  */
 public function fetchInstanceUserFavorites()
 {
     if (!isset($this->user)) {
         $this->fetchInstanceUserInfo();
     }
     $this->logger->logUserInfo("Checking for new favorites.", __METHOD__ . ',' . __LINE__);
     $newest_fav_id = $this->instance->last_favorite_id;
     $this->logger->logInfo(
         "Owner favs: " . $this->user->favorites_count . ", instance owner favs in system: "
         . $this->instance->owner_favs_in_system,
         __METHOD__ . ',' . __LINE__
     );

     while (true) {
         // The third element (payload) is not used by this method.
         list($tweets, $http_status, $payload) = $this->getFavorites($newest_fav_id);

         if ($http_status != 200) {
             // Any non-200 response ends the crawl.
             break;
         }
         if (sizeof($tweets) == 0) {
             // An empty page means there are no more favorites to process.
             $this->logger->logInfo(
                 "It appears that we have run out of favorites to process",
                 __METHOD__ . ',' . __LINE__
             );
             break;
         }

         $favorite_dao = DAOFactory::getDAO('FavoritePostDAO');
         $page_fav_total = 0;
         foreach ($tweets as $tweet) {
             $tweet['network'] = 'twitter';
             $is_new_favorite = $favorite_dao->addFavorite($this->user->user_id, $tweet) > 0;
             if ($is_new_favorite) {
                 URLProcessor::processPostURLs($tweet['post_text'], $tweet['post_id'], 'twitter', $this->logger);
                 $this->logger->logInfo("Found new fav: " . $tweet['post_id'], __METHOD__ . ',' . __LINE__);
                 $page_fav_total++;
                 $this->logger->logInfo("Fav count: " . $page_fav_total, __METHOD__ . ',' . __LINE__);
                 $this->logger->logInfo("Added favorite: " . $tweet['post_id'], __METHOD__ . ',' . __LINE__);
             }
             // Nothing to do for a favorite that was already stored: it may have been processed on
             // a previous run, or stored by a separate process (such as a UserStream process) that
             // also watches for favorites.

             // Remember the highest favorite ID encountered so far.
             if ($tweet['post_id'] > $newest_fav_id) {
                 $newest_fav_id = $tweet['post_id'];
             }
         }
     }
 }
Пример #4
0
 /**
  * Fetch and save the posts and replies for the crawler's instance, paging back through the
  * user's or page's feed.
  *
  * Friends are fetched first via fetchAndStoreFriends(). The first request goes to the instance's
  * Graph API /feed URL. While a response carries posts they are handed to processStream(), and
  * the next request is the response's paging->next URL with a 'since' parameter appended: the
  * timestamp of the oldest stored post minus one day, never below 0. Paging stops when there is
  * no next page, when no posts come back, or once max_crawl_time has been exceeded.
  *
  * @return void
  * @throws APIOAuthException When the API response carries an error of type OAuthException.
  */
 public function fetchPostsAndReplies()
 {
     $network_user_id = $this->instance->network_user_id;
     $network_name = $this->instance->network;

     // Friends first, then the feed.
     $this->fetchAndStoreFriends();

     $page_number = 1;
     $request_url = 'https://graph.facebook.com/' . $network_user_id . '/feed?access_token=' . $this->access_token;

     // Very busy pages can have thousands of likes/comments, so the crawl is time-boxed.
     $deadline = time() + $this->max_crawl_time;

     // 'since' is derived from the oldest post in the data store, moved back one day to be safe.
     $oldest_posts = DAOFactory::getDAO('PostDAO')
         ->getAllPosts($network_user_id, $network_name, 1, 1, true, 'pub_date', 'ASC');
     $oldest_pub_date = isset($oldest_posts[0]) ? $oldest_posts[0]->pub_date : 0;
     $since = strtotime($oldest_pub_date) - 60 * 60 * 24;
     if ($since < 0) {
         $since = 0;
     }

     do {
         $keep_paging = true;
         $stream = FacebookGraphAPIAccessor::rawApiRequest($request_url, true);
         $has_posts = isset($stream->data) && is_array($stream->data) && sizeof($stream->data) > 0;

         if ($has_posts) {
             $this->logger->logInfo(
                 sizeof($stream->data) . " Facebook posts found on page " . $page_number,
                 __METHOD__ . ',' . __LINE__
             );
             $this->processStream($stream, $network_name, $page_number);
             if (isset($stream->paging->next)) {
                 $request_url = $stream->paging->next . '&since=' . $since;
                 $page_number++;
             } else {
                 $keep_paging = false;
             }
         } elseif (isset($stream->error->type) && $stream->error->type == 'OAuthException') {
             throw new APIOAuthException($stream->error->message);
         } else {
             $this->logger->logInfo("No Facebook posts found for ID " . $network_user_id, __METHOD__ . ',' . __LINE__);
             $keep_paging = false;
         }

         // The time check runs after every request, whatever the outcome above.
         if (time() > $deadline) {
             $keep_paging = false;
             $this->logger->logUserInfo(
                 "Stopping this service user's crawl because it has exceeded max time of "
                 . ($this->max_crawl_time / 60) . " minute(s). ",
                 __METHOD__ . ',' . __LINE__
             );
         }
     } while ($keep_paging);
 }
Пример #5
0
 /**
  * Fetch Bit.ly click stats (and titles) for short links that are due for an update.
  *
  * For each known bitly base URL (bit.ly, bitly.com, j.mp) the links returned by
  * short_link_dao->getLinksToUpdate() are looked up through the BitlyAPIAccessor. A non-empty
  * "clicks" value is saved as the link's click count, together with the title when one is
  * returned; otherwise a non-empty "error" value is saved as an expansion error. Nothing is done
  * when $this->link_limit is 0.
  *
  * @param str $api_key bitly api key
  * @param str $bit_login bitly login name
  */
 public function acquireBitlyClickStats($api_key, $bit_login)
 {
     $this->logger->setUsername(null);
     $api_accessor = new BitlyAPIAccessor($api_key, $bit_login);
     $bitly_urls = array('http://bit.ly/', 'http://bitly.com/', 'http://j.mp/');
     foreach ($bitly_urls as $bitly_url) {
         if ($this->link_limit != 0) {
             // Links the DAO reports as needing a click-count refresh for this base URL.
             $bitly_links_to_update = $this->short_link_dao->getLinksToUpdate($bitly_url);
             if (count($bitly_links_to_update) > 0) {
                 $this->logger->logUserInfo(count($bitly_links_to_update) . " {$bitly_url}" . " links to acquire click stats for.", __METHOD__ . ',' . __LINE__);
             } else {
                 // The location argument is one concatenated string, "method,line", exactly as in
                 // every other logger call here; a stray comma used to split it into two arguments.
                 $this->logger->logUserInfo("There are no " . $bitly_url . " links to fetch click stats for.", __METHOD__ . ',' . __LINE__);
             }
             $total_links = 0;
             $total_errors = 0;
             // Counts links handled either way (stats saved or error saved); used for progress logging.
             $total_updated = 0;
             foreach ($bitly_links_to_update as $link) {
                 $this->logger->logInfo("Getting bit.ly click stats for " . ($total_updated + 1) . " of " . count($bitly_links_to_update) . " " . $bitly_url . " links (" . $link->short_url . ")", __METHOD__ . ',' . __LINE__);
                 $link_data = $api_accessor->getBitlyLinkData($link->short_url);
                 if ($link_data["clicks"] != '') {
                     // Save the click total
                     $this->short_link_dao->saveClickCount($link->short_url, $link_data["clicks"]);
                     // Save title to links table
                     if ($link_data["title"] != '') {
                         $this->link_dao->updateTitle($link->link_id, $link_data["title"]);
                     }
                     $total_links = $total_links + 1;
                     $total_updated = $total_updated + 1;
                 } elseif ($link_data["error"] != '') {
                     $this->link_dao->saveExpansionError($link->short_url, $link_data["error"]);
                     $total_errors = $total_errors + 1;
                     $total_updated = $total_updated + 1;
                 }
             }
             $this->logger->logUserSuccess($total_links . " " . $bitly_url . " link click stats acquired (" . $total_errors . " errors)", __METHOD__ . ',' . __LINE__);
         }
     }
 }
Пример #6
0
 /**
  * This method, and the two supporting private methods 'maintFavsFetch' and 'archivingFavsFetch', provide the
  * primary crawler functionality for adding the user's favorites to the database.
  * For a given user, the process starts in 'archiving mode', by
  * working forwards from the last (oldest) page of tweets to the newest.  This archiving crawl
  * is only done once.  The crawler tries to do this all in one go, but if it exhausts the available API count,
  * it will continue where it left off in the next run.
  * Then, when page 1 is reached in archiving mode, the crawler goes into 'maintenance mode' and works
  * backwards from then on.  It first pages back until
  * it has reached the last fav it previously processed.  Then it searches back N more pages to catch any older
  * tweets that were fav'd out of chronological order, where N is determined by favs_older_pages option.
  * The bookkeeping for these two crawler stages is maintained in the tu_instances entry for the user.
  *
  * Recently, the Twitter favorites API has developed some bugs that need to be worked around.  The comments below
  * provide more detail, but in a nutshell, these methods can not currently use information from Twitter to
  * calculate loop termination (so a bit more work may be done than necessary), and do not currently remove un-fav'd
  * tweets from the database.  Hopefully these API issues will be fixed by Twitter in future.
  *
  * On exit the instance's last_favorite_id, last_page_fetched_favorites and favorites_profile fields are
  * updated from the values reached by the fetch loop.
  *
  * @return bool Always true, including when no API calls are available and nothing is fetched.
  */
 public function fetchInstanceFavorites()
 {
     // first, check that we have the resources to do work
     if (!($this->api->available && $this->api->available_api_calls_for_crawler)) {
         $this->logger->logInfo("terminating fetchInstanceFavorites-- no API calls available", __METHOD__ . ',' . __LINE__);
         return true;
     }
     $status_message = "";
     //@TODO Can we get this from API?
     // number of favs per page retrieved from the API call
     $page_size = 20;
     $this->logger->logUserInfo("Checking for new favorites.", __METHOD__ . ',' . __LINE__);
     $last_favorites_count = $this->instance->favorites_profile;
     $this->logger->logInfo("last favs count: {$last_favorites_count}", __METHOD__ . ',' . __LINE__);
     $last_page_fetched_favorites = $this->instance->last_page_fetched_favorites;
     $last_fav_id = $this->instance->last_favorite_id;
     $curr_favs_count = $this->user->favorites_count;
     $this->logger->logInfo("curr favs count: {$curr_favs_count}", __METHOD__ . ',' . __LINE__);
     $last_page_of_favs = round($this->api->archive_limit / $page_size);
     // under normal circs the latter clause below should never hold, but due to a previously-existing
     // bug that could set a negative last_page_fetched_favorites value in the db in some cases,
     // it is necessary for recovery.
     if ($last_page_fetched_favorites == "" || $last_page_fetched_favorites < 0) {
         $last_page_fetched_favorites = 0;
     }
     $this->logger->logInfo("got last_page_fetched_favorites: {$last_page_fetched_favorites}", __METHOD__ . ',' . __LINE__);
     if ($last_fav_id == "") {
         $last_fav_id = 0;
     }
     // the owner favs count, from twitter, is currently unreliable and may be less than the actual number of
     // favs, by a large margin.  So, we still go ahead and calculate the number of 'missing' tweets based on
     // this info, but currently do not use it for fetch loop termination.
     $this->logger->logInfo("owner favs: " . $this->user->favorites_count . ", instance owner favs in system: " . $this->instance->owner_favs_in_system, __METHOD__ . ',' . __LINE__);
     $favs_missing = $this->user->favorites_count - $this->instance->owner_favs_in_system;
     $this->logger->logInfo("favs missing: {$favs_missing}", __METHOD__ . ',' . __LINE__);
     // figure out if we're in 'archiving' or 'maintenance' mode, via # of last_page_fetched_favorites
     // default is archiving/first-fetch
     $mode = 0;
     if ($last_page_fetched_favorites == 1) {
         // we are in maint. mode
         $mode = 1;
         $new_favs_to_add = $favs_missing;
         $this->logger->logInfo("new favs to add/missing: {$new_favs_to_add}", __METHOD__ . ',' . __LINE__);
         $mpage = 1;
         $starting_fav_id = $last_fav_id;
     } else {
         // we are in archiving mode.
         $new_favs_to_add = $curr_favs_count - $last_favorites_count;
         // twitter profile information is not always consistent, so ensure that this value is not negative.
         // This must be an assignment: a comparison here leaves the negative value in place and
         // produces a negative extra-page count below.
         if ($new_favs_to_add < 0) {
             $new_favs_to_add = 0;
         }
         $this->logger->logInfo("new favs to add: {$new_favs_to_add}", __METHOD__ . ',' . __LINE__);
         // figure out start page based on where we left off last time, and how many favs added since then
         $extra_pages = ceil($new_favs_to_add / $page_size);
         $this->logger->logInfo("extra pages: {$extra_pages}", __METHOD__ . ',' . __LINE__);
         $finished_first_fetch = false;
         if ($last_page_fetched_favorites == 0) {
             // if at initial starting fetch (first time favs ever crawled)
             if ($extra_pages == 0) {
                 // always check at least one page on initial fetch
                 $extra_pages = 1;
             }
             $last_page_fetched_favs_start = $extra_pages + 1;
         } else {
             $last_page_fetched_favs_start = $last_page_fetched_favorites + $extra_pages;
         }
         // never start beyond the last page the archive limit allows
         if ($last_page_fetched_favs_start > $last_page_of_favs) {
             $last_page_fetched_favs_start = $last_page_of_favs + 1;
         }
     }
     $status_message = "total last favs count: {$last_favorites_count}" . ", last page fetched: {$last_page_fetched_favorites}, last fav id: {$last_fav_id}";
     $this->logger->logInfo($status_message, __METHOD__ . ',' . __LINE__);
     $this->logger->logInfo("current favs count: {$curr_favs_count}" . ", new favs to add: {$new_favs_to_add}, last page of favs: {$last_page_of_favs}, mode: {$mode}", __METHOD__ . ',' . __LINE__);
     $continue = true;
     $fcount = 0;
     $older_favs_smode = false;
     $stop_page = 0;
     $status_message = "in fetchInstanceFavorites: API available: " . $this->api->available . ", avail for crawler: " . $this->api->available_api_calls_for_crawler;
     $this->logger->logInfo($status_message, __METHOD__ . ',' . __LINE__);
     while ($this->api->available && $this->api->available_api_calls_for_crawler > 0 && $continue) {
         try {
             if ($mode != 0) {
                 // in maintenance, not archiving mode
                 list($fcount, $mpage, $older_favs_smode, $stop_page, $new_favs_to_add, $last_fav_id, $last_page_fetched_favorites, $continue) = $this->maintFavsFetch($starting_fav_id, $fcount, $mpage, $older_favs_smode, $stop_page, $new_favs_to_add, $last_fav_id, $last_page_fetched_favorites, $continue);
             } else {
                 // mode 0 -- archiving mode
                 if (!$finished_first_fetch) {
                     // first pass of this run: start from the computed start page
                     list($fcount, $last_fav_id, $last_page_fetched_favorites, $continue) = $this->archivingFavsFetch($fcount, $last_fav_id, $last_page_fetched_favs_start, $continue);
                     $finished_first_fetch = true;
                 } else {
                     // later passes continue from the page returned by the previous pass
                     list($fcount, $last_fav_id, $last_page_fetched_favorites, $continue) = $this->archivingFavsFetch($fcount, $last_fav_id, $last_page_fetched_favorites, $continue);
                 }
             }
         } catch (APICallLimitExceededException $e) {
             // out of API calls: stop here and save the bookkeeping reached so far
             break;
         }
     }
     // end while
     // update necessary instance fields
     $this->logger->logInfo("new_favs_to_add: {$new_favs_to_add}, fcount: {$fcount}", __METHOD__ . ',' . __LINE__);
     $this->logger->logInfo("new 'last fav id': {$last_fav_id}", __METHOD__ . ',' . __LINE__);
     $this->instance->last_favorite_id = $last_fav_id;
     $this->instance->last_page_fetched_favorites = $last_page_fetched_favorites;
     $this->instance->favorites_profile = $curr_favs_count;
     $this->logger->logUserSuccess("Saved {$fcount} new favorites.", __METHOD__ . ',' . __LINE__);
     return true;
 }
Пример #7
0
 /**
  * Fetch and save the posts and replies for the crawler's instance. This function will loop back through the
  * user's or page's archive of posts.
  *
  * The crawl starts at {id}/feed. Each page with data is handed to processStream(). While pages add
  * posts, the response's paging->next URL is followed. The first time a page adds no posts, the
  * crawl restarts at {id}/feed with a 'since' parameter derived from the oldest stored post
  * ("digging into the archives"); after that, paging->next is followed again until there is no next
  * page. The crawl also stops when a response carries no data or max_crawl_time is exceeded.
  *
  * @return void
  * @throws APIOAuthException When the API response carries an error of type OAuthException.
  */
 public function fetchPostsAndReplies()
 {
     $id = $this->instance->network_user_id;
     $network = $this->instance->network;
     $fetch_next_page = true;
     $current_page_number = 1;
     $next_api_request = $id . '/feed';
     $fields = self::$feed_fields;
     //Cap crawl time for very busy pages with thousands of likes/comments
     $fetch_stop_time = time() + $this->max_crawl_time;
     $api_request_params = null;
     // false: $next_api_request is an API path; true: it is a complete paging URL
     $use_full_api_url = false;
     $dig_into_archives = false;
     while ($fetch_next_page) {
         if (!$use_full_api_url) {
             $stream = FacebookGraphAPIAccessor::apiRequest($next_api_request, $this->access_token, $api_request_params, $fields);
             // Request params apply to a single request only
             $api_request_params = null;
         } else {
             //Use full paging URL
             $stream = FacebookGraphAPIAccessor::apiRequestFullURL($next_api_request, $this->access_token);
         }
         if (isset($stream->data) && is_array($stream->data) && sizeof($stream->data) > 0) {
             $this->logger->logInfo(sizeof($stream->data) . " Facebook posts found on page " . $current_page_number, __METHOD__ . ',' . __LINE__);
             $total_added_posts = $this->processStream($stream, $network, $current_page_number);
             if ($total_added_posts == 0) {
                 //No new posts were found, try going back into the archives
                 if (!$dig_into_archives) {
                     $dig_into_archives = true;
                     //Determine 'since', datetime of oldest post in datastore
                     $post_dao = DAOFactory::getDAO('PostDAO');
                     $since_post = $post_dao->getAllPosts($id, $network, 1, 1, true, 'pub_date', 'ASC');
                     // The data store may hold no posts at all, so the oldest pub_date is read once,
                     // behind the isset() guard, and reused for the log line below.
                     $oldest_pub_date = isset($since_post[0]) ? $since_post[0]->pub_date : 0;
                     $since = strtotime($oldest_pub_date);
                     $this->logger->logInfo("No Facebook posts found for {$id} here, digging into archives since " . $oldest_pub_date . " strtotime " . $since, __METHOD__ . ',' . __LINE__);
                     $api_request_params = array('since' => $since);
                     $use_full_api_url = false;
                     $next_api_request = $id . '/feed';
                 } else {
                     if (isset($stream->paging->next)) {
                         $next_api_request = $stream->paging->next;
                         $use_full_api_url = true;
                         //DEBUG
                         $this->logger->logInfo("Dug into archives, next page API request is " . $next_api_request, __METHOD__ . ',' . __LINE__);
                         $current_page_number++;
                     } else {
                         $fetch_next_page = false;
                     }
                 }
             } else {
                 if (isset($stream->paging->next)) {
                     $next_api_request = $stream->paging->next;
                     $use_full_api_url = true;
                     //DEBUG
                     $this->logger->logInfo("Next page API request is " . $next_api_request, __METHOD__ . ',' . __LINE__);
                     $current_page_number++;
                 } else {
                     $fetch_next_page = false;
                 }
             }
         } elseif (isset($stream->error->type) && $stream->error->type == 'OAuthException') {
             throw new APIOAuthException($stream->error->message);
         } else {
             $this->logger->logInfo("No Facebook posts found for ID {$id}", __METHOD__ . ',' . __LINE__);
             $fetch_next_page = false;
         }
         if (time() > $fetch_stop_time) {
             $fetch_next_page = false;
             $this->logger->logUserInfo("Stopping this service user's crawl because it has exceeded max time of " . $this->max_crawl_time / 60 . " minute(s). ", __METHOD__ . ',' . __LINE__);
         }
     }
 }
Пример #8
0
 /**
  * Fetch and save the posts and replies for the crawler's instance. This function will loop back through the
  * user's or page's archive of posts.
  *
  * Before fetching media, friends are stored via storeFriends() when the plugin option
  * 'last_crawled_friends' is missing or older than two days (172800 seconds), and the option is
  * set to the current time. Media is then requested 20 items at a time; when the instance already
  * has posts in the system, only media from one week before the last crawler run onwards is
  * requested. Each non-empty page is handed to processPosts(), and paging continues with the
  * 'max_id' taken from the page's getNext() until there is no next page, a page is empty, or
  * max_crawl_time is exceeded.
  *
  * @return void
  */
 public function fetchPostsAndReplies()
 {
     $plugin_dao = DAOFactory::getDAO('PluginDAO');
     $plugin_id = $plugin_dao->getPluginId('instagram');
     $namespace = OptionDAO::PLUGIN_OPTIONS . '-' . $plugin_id;
     $id = $this->instance->network_user_id;
     $option_dao = DAOFactory::getDAO('OptionDAO');
     $network = $this->instance->network;
     //Checks if last friends update is over 2 days ago and runs storeFriends if it is.
     $friends_last_updated = $option_dao->getOptionByName($namespace, 'last_crawled_friends');
     $friends_last_updated_check = microtime(true) - 172800;
     if ($friends_last_updated == NULL) {
         $this->storeFriends();
         $option_dao->insertOption($namespace, 'last_crawled_friends', microtime(true));
     } elseif ($friends_last_updated->option_value < $friends_last_updated_check) {
         $this->storeFriends();
         $option_dao->updateOptionByName($namespace, 'last_crawled_friends', microtime(true));
     }
     $fetch_next_page = true;
     $current_page_number = 1;
     $api_param = array('count' => 20);
     if ($this->instance->total_posts_in_system != 0) {
         // Only ask for media from one week before the last crawl onwards. The last-crawl value is
         // parsed first and the offset applied to that timestamp; passing it to date() as the
         // format string returned it unchanged, so the week was never subtracted.
         // NOTE(review): assumes crawler_last_run is a strtotime()-parsable datetime string - confirm.
         $last_crawl_timestamp = strtotime($this->instance->crawler_last_run);
         if ($last_crawl_timestamp !== false) {
             $unix_less_week = strtotime('-1 week', $last_crawl_timestamp);
             $api_param = array('min_timestamp' => $unix_less_week, 'count' => 20);
         }
         // An unparsable last-crawl value falls back to the unfiltered request above.
     }
     $this->logger->logUserInfo("About to request media", __METHOD__ . ',' . __LINE__);
     $posts = InstagramAPIAccessor::apiRequest('media', $id, $this->access_token, $api_param);
     $this->logger->logUserInfo("Media requested", __METHOD__ . ',' . __LINE__);
     //Cap crawl time for very busy pages with thousands of likes/comments
     $fetch_stop_time = time() + $this->max_crawl_time;
     //Determine 'since', datetime of oldest post in datastore
     // NOTE(review): $since is computed here but not used by the paging loop below.
     $post_dao = DAOFactory::getDAO('PostDAO');
     $since_post = $post_dao->getAllPosts($id, $network, 1, 1, true, 'pub_date', 'ASC');
     $since = isset($since_post[0]) ? $since_post[0]->pub_date : 0;
     // last post minus one day, just to be safe
     $since = strtotime($since) - 60 * 60 * 24;
     if ($since < 0) {
         $since = 0;
     }
     while ($fetch_next_page) {
         if ($posts->count() > 0) {
             // Report the size of the page just fetched; there is no $stream variable in this method.
             $this->logger->logInfo($posts->count() . " Instagram posts found on page " . $current_page_number, __METHOD__ . ',' . __LINE__);
             $this->processPosts($posts, $network, $current_page_number);
             if ($posts->getNext() != null) {
                 // Continue from the cursor the current page points to
                 $api_param['max_id'] = $posts->getNext();
                 $posts = InstagramAPIAccessor::apiRequest('media', $id, $this->access_token, $api_param);
                 $current_page_number++;
             } else {
                 $fetch_next_page = false;
             }
         } else {
             $this->logger->logInfo("No Instagram posts found for ID {$id}", __METHOD__ . ',' . __LINE__);
             $fetch_next_page = false;
         }
         if (time() > $fetch_stop_time) {
             $fetch_next_page = false;
             $this->logger->logUserInfo("Stopping this service user's crawl because it has exceeded max time of " . $this->max_crawl_time / 60 . " minute(s). ", __METHOD__ . ',' . __LINE__);
         }
     }
 }
Пример #9
0
 /**
  *  Collects and stores information about the users videos from the YouTube APIs
  *  Currently collects and stores:
  *   - Basic video information such as title, author, description and location the video was shot in (if available)
  *  - Replies to the video
  *      -- This uses the YouTube V2 API due to the V3 API currently not supporting replies
  *   - All time counts for likes, dislikes, views, average view duration, average view percentage, favorites added,
  *   favorites removed, shares, subscribers gained and subscribers lost
  *     -- The totals for these are stored in the videos table, a history of these totals is stored in the
  *     count_history table under a type of [metric]_all_time and date of todays date
  *    -- A record of these metrics for indivdual days is also saved in the count_history table under a type of
  *    [metric] and date of the day the metric represents usually two days ago due to a delay in the availability
  *      of data from the Analytics API
  * @return null
  */
 public function fetchInstanceUserVideos()
 {
     $video_dao = DAOFactory::getDAO('VideoDAO');
     $user_dao = DAOFactory::getDAO('UserDAO');
     $post_dao = DAOFactory::getDAO('PostDAO');
     $count_history_dao = DAOFactory::getDAO('CountHistoryDAO');
     $instance_dao = DAOFactory::getDAO('InstanceDAO');
     // Get the users upload playlist ID
     $fields_for_ids = array('part' => 'contentDetails,statistics', 'mine' => 'true');
     $various_ids = $this->youtube_api_accessor->apiRequest('channels', $this->access_token, $fields_for_ids);
     $upload_id = $various_ids->items[0]->contentDetails->relatedPlaylists->uploads;
     // Also get their channel ID as we'll need it later on
     $channel_id = $various_ids->items[0]->id;
     // There are some required attributes about the author that YouTube doesn't return for the videos so we need
     // to query the database for them
     $author_details = $user_dao->getDetails($this->instance->network_user_id, 'youtube');
     $user_id = $this->instance->network_user_id;
     // Update the users subscriber count
     $subscriber_count = $various_ids->items[0]->statistics->subscriberCount;
     $author_details->follower_count = $subscriber_count;
     $user_dao->updateUser($author_details);
     $count_history_dao->insert($user_id, 'youtube', $subscriber_count, null, 'subscriber_count');
     // Calculate the time at which we should stop fetching videos
     $end_time = time() + $this->max_crawl_time;
     // Keep track of if we finished the crawl early due to timing out
     $had_to_finish_early = false;
     // Check if we already loaded all the old posts for this user
     $archive_loaded = $instance->is_archive_loaded_posts;
     // If the archive isn't loaded yet keep track of how many times we've tried to load it
     if (!$archive_loaded) {
         $attempts = $count_history_dao->getLatestCountByNetworkUserIDAndType($user_id, 'youtube', 'youtube_archive_attempts');
         if ($attempts == null) {
             // If this is the first crawler run
             $attempts['count'] = 0;
         }
         $attempts['count']++;
         $count_history_dao->insert($user_id, 'youtube', $attempts['count'], null, 'youtube_archive_attempts', null);
     }
     // Now page through their videos collecting the data
     $videos_fields = array('part' => 'snippet', 'maxResults' => '25', 'playlistId' => $upload_id, 'pageToken' => null);
     // We may get multiple pages
     do {
         // This is a page of IDs of videos the user has uploaded
         $user_videos = $this->youtube_api_accessor->apiRequest('playlistItems', $this->access_token, $videos_fields);
         // For each video store the relevant details about it
         foreach ($user_videos->items as $video) {
             // If we've hit the max crawl time stop
             if (time() >= $end_time) {
                 $this->logger->logUserInfo("Stopping this service users crawl because it has exceeded max time of " . $this->max_crawl_time / 60 . " minute(s). ", __METHOD__ . ',' . __LINE__);
                 $had_to_finish_early = true;
                 break 2;
             }
             $video_id = $video->snippet->resourceId->videoId;
             // Get the title, description, likes, dislikes, views, and details about where
             // the video was taken from the data API
             $video_fields = array('id' => $video_id, 'part' => 'statistics,id,snippet,recordingDetails,status');
             $video_details = $this->youtube_api_accessor->apiRequest('videos', $this->access_token, $video_fields);
             $item = $video_details->items[0];
             // Check we haven't used up our quota
             if (isset($video_details->error)) {
                 $this->logger->logError('Error querying YouTube Data API V3 ', __METHOD__ . ',' . __LINE__);
                 break;
             }
             $video_attributes['post_text'] = $item->snippet->title;
             $video_attributes['description'] = $item->snippet->description;
             $video_attributes['likes'] = $item->statistics->likeCount;
             $video_attributes['dislikes'] = $item->statistics->dislikeCount;
             $video_attributes['views'] = $item->statistics->viewCount;
             // Keep track of these all time counts
             $count_history_dao->insert($user_id, 'youtube', $video_attributes['likes'], $video_id, 'likes_all_time');
             $count_history_dao->insert($user_id, 'youtube', $video_attributes['dislikes'], $video_id, 'dislikes_all_time');
             $count_history_dao->insert($user_id, 'youtube', $video_attributes['views'], $video_id, 'views_all_time');
             $video_attributes['pub_date'] = $item->snippet->publishedAt;
             $video_attributes['post_id'] = $item->id;
             $video_attributes['location'] = $item->recordingDetails->locationDescription;
             $video_attributes['place'] = $item->recordingDetails->locationDescription;
             if (isset($item->recordingDetails->latitude)) {
                 $video_attributes['geo'] = $item->recordingDetails->latitude . "," . $item->recordingDetails->longitude;
             }
             $video_attributes['is_protected'] = self::determinePrivacyStatus($item->status->privacyStatus);
             $today = date('Y-m-d');
             $upload_date = substr($item->snippet->publishedAt, 0, 10);
             // Get the favourites added, favourites removed, shares, subscribers gained, subscribers lost
             // estimated minuites watched, average view duration, average view percentage
             $analytics_fields = array('ids' => 'channel==' . $channel_id, 'start-date' => $upload_date, 'end-date' => $today, 'metrics' => 'favoritesAdded,favoritesRemoved,shares,subscribersGained,subscribersLost,' . 'estimatedMinutesWatched,averageViewDuration,averageViewPercentage,views,likes,dislikes', 'filters' => 'video==' . $video_id);
             $video_analytics_details = $this->youtube_analytics_api_accessor->apiRequest('reports', $this->access_token, $analytics_fields);
             // Check we haven't used up our quota
             if (isset($video_analytics_details->error)) {
                 $this->logger->logError('Error querying YouTube Analytics API', __METHOD__ . ',' . __LINE__);
                 break;
             }
             $analytics_item = $video_analytics_details->rows[0];
             // If the video is new we may not get any of these values back, but they can't be null
             if (isset($analytics_item)) {
                 $video_attributes['favorites_added'] = $analytics_item[0];
                 $video_attributes['favorites_removed'] = $analytics_item[1];
                 $video_attributes['shares'] = $analytics_item[2];
                 $video_attributes['subscribers_gained'] = $analytics_item[3];
                 $video_attributes['subscribers_lost'] = $analytics_item[4];
                 $video_attributes['minutes_watched'] = $analytics_item[5];
                 $video_attributes['average_view_duration'] = $analytics_item[6];
                 $video_attributes['average_view_percentage'] = $analytics_item[7];
                 // Keep track of these all time counts
                 $count_history_dao->insert($user_id, 'youtube', $analytics_item[0], $video_id, 'favorites_added_all_time');
                 $count_history_dao->insert($user_id, 'youtube', $analytics_item[1], $video_id, 'favorites_removed_all_time');
                 $count_history_dao->insert($user_id, 'youtube', $analytics_item[2], $video_id, 'shares_all_time');
                 $count_history_dao->insert($user_id, 'youtube', $analytics_item[3], $video_id, 'subscribers_gained_all_time');
                 $count_history_dao->insert($user_id, 'youtube', $analytics_item[4], $video_id, 'subscribers_lost_all_time');
                 $count_history_dao->insert($user_id, 'youtube', $analytics_item[5], $video_id, 'minutes_watched_all_time');
                 $count_history_dao->insert($user_id, 'youtube', $analytics_item[6], $video_id, 'average_view_duration_all_time');
                 $count_history_dao->insert($user_id, 'youtube', $analytics_item[7], $video_id, 'average_view_percentage_all_time');
             } else {
                 // So set them the 0
                 $video_attributes['favorites_added'] = 0;
                 $video_attributes['favorites_removed'] = 0;
                 $video_attributes['shares'] = 0;
                 $video_attributes['subscribers_gained'] = 0;
                 $video_attributes['subscribers_lost'] = 0;
                 $video_attributes['minutes_watched'] = 0;
                 $video_attributes['average_view_duration'] = 0;
                 $video_attributes['average_view_percentage'] = 0;
             }
             $video_attributes['author_user_id'] = $this->instance->network_user_id;
             $video_attributes['author_username'] = $this->instance->network_username;
             $video_attributes['author_fullname'] = $author_details->full_name;
             $video_attributes['author_avatar'] = $author_details->avatar;
             $video_attributes['source'] = '';
             $video_attributes['network'] = 'youtube';
             $video_dao->addVideo($video_attributes);
             // Now collect per day count data for 2 days ago (testing has shown analytics data is delayed by 2 days)
             $two_days_ago = date('Y-m-d', strtotime("-2 day", strtotime($today)));
             $analytics_fields['start-date'] = $two_days_ago;
             $analytics_fields['end-date'] = $two_days_ago;
             $analytics_today_details = $this->youtube_analytics_api_accessor->apiRequest('reports', $this->access_token, $analytics_fields);
             // Check we haven't used up our quota
             if (isset($analytics_today_details->error)) {
                 $this->logger->logError('Error querying YouTube Analytics API', __METHOD__ . ',' . __LINE__);
                 break;
             }
             $todays_analytics = $analytics_today_details->rows[0];
             // Check we got data and if not skip this part
             if (isset($todays_analytics)) {
                 $count_history_dao->insert($user_id, 'youtube', $todays_analytics[0], $video_id, 'favorites_added', $two_days_ago);
                 $count_history_dao->insert($user_id, 'youtube', $todays_analytics[1], $video_id, 'favorites_removed', $two_days_ago);
                 $count_history_dao->insert($user_id, 'youtube', $todays_analytics[2], $video_id, 'shares', $two_days_ago);
                 $count_history_dao->insert($user_id, 'youtube', $todays_analytics[3], $video_id, 'subscribers_gained', $two_days_ago);
                 $count_history_dao->insert($user_id, 'youtube', $todays_analytics[4], $video_id, 'subscribers_lost', $two_days_ago);
                 $count_history_dao->insert($user_id, 'youtube', $todays_analytics[5], $video_id, 'minutes_watched', $two_days_ago);
                 $count_history_dao->insert($user_id, 'youtube', $todays_analytics[6], $video_id, 'average_view_duration', $two_days_ago);
                 $count_history_dao->insert($user_id, 'youtube', $todays_analytics[7], $video_id, 'average_view_percentage', $two_days_ago);
                 $count_history_dao->insert($user_id, 'youtube', $todays_analytics[8], $video_id, 'views', $two_days_ago);
                 $count_history_dao->insert($user_id, 'youtube', $todays_analytics[9], $video_id, 'likes', $two_days_ago);
                 $count_history_dao->insert($user_id, 'youtube', $todays_analytics[10], $video_id, 'dislikes', $two_days_ago);
             }
             // Check to see how many comments we already have for this video and if there are no new ones skip
             // comment collection as it takes a long time.
             $video_in_db = $video_dao->getVideoByID($video_id, 'youtube');
             $comments_in_db = $video_in_db->reply_count_cache;
             if (!isset($comments_in_db)) {
                 $comments_in_db = 0;
             }
             $api_comments = $item->statistics->commentCount;
             $comments_collected = 0;
             // if this video has any new comments capture those
             if ($api_comments > 0 && $api_comments > $comments_in_db) {
                 // Request the first page of comments for this video
                 $comments_fields = array('alt' => 'json');
                 if (isset($this->developer_key)) {
                     $comments_fields['key'] = $this->developer_key;
                 }
                 $comments = $this->youtube_api_v2_accessor->apiRequest('videos/' . $video_id . '/comments', $comments_fields);
                 // Check we haven't used up our quota
                 if (isset($comments->errors)) {
                     $this->logger->logError('Error querying YouTube Data API V2 ', __METHOD__ . ',' . __LINE__);
                     break;
                 }
                 do {
                     // Iterate through each comment and store the details
                     foreach ($comments->feed->entry as $comment) {
                         // We may have only needed to collect a few new comments so abort if we have everything
                         if ($api_comments == $comments_in_db) {
                             break 2;
                         }
                         // If the user has specified a limit on the number of comments per video to collect each
                         // crawl check we haven't exceeded it
                         if (isset($this->maximum_comments) && $comments_collected >= $this->maximum_comments) {
                             $this->logger->logUserInfo("Stopping collection of comments for video due to reaching " . "limit of " . $this->maximum_comments . " comments.", __METHOD__ . ',' . __LINE__);
                             break 2;
                         }
                         // We may spend a long time collecting comments so also check here if we've exceed the max
                         // time specified by the user
                         if (time() >= $end_time) {
                             $this->logger->logUserInfo("Stopping this service users crawl because it has exceeded " . "max time of " . $this->max_crawl_time / 60 . " minute(s). ", __METHOD__ . ',' . __LINE__);
                             $had_to_finish_early = true;
                             break 4;
                         }
                         // The id is returned in the XML as part of a long URL, we only want the last part of that
                         // URL
                         $id_string = explode('/', $comment->id->{'$t'});
                         // This will be the last element of id_string
                         $comment_store['post_id'] = $id_string[sizeof($id_string) - 1];
                         // The post text is the comment they made
                         // Remove byte order markers from the comment text from:
                         // http://stackoverflow.com/questions/3255993/how-do-i-remove-i-from-the-beginning
                         // -of-a-file#comment9330944_3256183
                         $comment_store['post_text'] = preg_replace('/\\x{EF}\\x{BB}\\x{BF}/', '', $comment->content->{'$t'});
                         // The author username is the users G+ displayname which we need to query for
                         // To get the G+ ID of this commentor we need to vist their youtube profile page, the ID
                         // needed to get to this users page is the last element of the author URI
                         $user_id_string = explode('/', $comment->author[0]->uri->{'$t'});
                         $name = $this->youtube_api_v2_accessor->apiRequest('users/' . $user_id_string[sizeof($user_id_string) - 1], $comments_fields);
                         $gplus_id = $name->entry->{'yt$googlePlusUserId'}->{'$t'};
                         // // Now we have their G+ ID we can get their details from the G+ API
                         $gplus_fields = array('fields' => 'displayName,id,image,tagline,verified');
                         $user_details = $this->google_plus_api_accessor->apiRequest('people/' . $gplus_id, $this->access_token, $gplus_fields);
                         // Sometimes G+ says the ID is invalid or the user doesn't have a G+ ID
                         if ($user_details->error->code == '404' || $gplus_id == '') {
                             // Use V2 of the YouTube api to get their details
                             $comment_store['author_username'] = $name->entry->{'yt$username'}->{'$t'};
                             $comment_store['author_fullname'] = $name->entry->author[0]->name->{'$t'};
                             $comment_store["author_avatar"] = $name->entry->{'media$thumbnail'}->url;
                             // In this case the user id is their YouTube user ID
                             $comment_store['author_user_id'] = $user_id_string[sizeof($user_id_string) - 1];
                             self::fetchUserFromYouTube($user_id_string[sizeof($user_id_string) - 1], 'youtube_crawler');
                             // If we still didn't get these details we can't store this comment
                             if ($comment_store['author_username'] == null || $comment_store['author_fullname'] == null || $comment_store["author_avatar"] == null) {
                                 continue;
                             }
                         } elseif (isset($user_details->error)) {
                             //Check we haven't exceed the G+ API quota
                             $this->logger->logError('Error querying Google Plus API ', __METHOD__ . ',' . __LINE__);
                             break;
                         } else {
                             $comment_store['author_username'] = $user_details->displayName;
                             $comment_store['author_fullname'] = $user_details->displayName;
                             $comment_store["author_avatar"] = $user_details->image->url;
                             // The author user id is their G+ ID
                             $comment_store['author_user_id'] = $gplus_id;
                             // Make sure we have this commentor in the database
                             self::fetchUser($gplus_id, 'youtube crawler');
                         }
                         // The date they posted the comment
                         $comment_store['pub_date'] = substr($comment->published->{'$t'}, 0, 10) . " " . substr($comment->published->{'$t'}, 11, 8);
                         // Source of the comment
                         $comment_store['source'] = "";
                         // Comments can not be private
                         $comment_store['is_protected'] = false;
                         // Set the network to youtube
                         $comment_store['network'] = 'youtube';
                         // The ID of the author of the video
                         $comment_store['in_reply_to_user_id'] = $this->instance->network_user_id;
                         // The ID of the video this comment is a reply to
                         $comment_store['in_reply_to_post_id'] = $video_id;
                         $insert_id = $post_dao->addPost($comment_store);
                         // If the insert id is null and were not going back to collect the whole archive
                         // we've already captured comments from this point so move on
                         if ($insert_id == null && $archive_loaded) {
                             break 2;
                         }
                         $comments_in_db++;
                         $comments_collected++;
                     }
                     $test = self::determineIfMoreCommentsExist($comments, $video_id);
                     // If there is another page of comments make a request for them
                     if ($test['next']) {
                         $comments = $this->youtube_api_v2_accessor->basicApiRequest($test['url']);
                         // Check we haven't used up our quota
                         if (isset($comments->errors)) {
                             $this->logger->logError('Error querying YouTube Data API V2 ', __METHOD__ . ',' . __LINE__);
                             break;
                         } elseif ($comments == null) {
                             // If the comments come back as null its because we've been making too many requests too
                             // quickly The YouTube api doesn't return valid JSON telling us this though so
                             // json_decode returns null so back off for 30 seconds and then try again
                             $error_message = "Querying the YouTube API too often waiting for 30 seconds, to ";
                             $error_message .= "prevent this delay add a developer key.";
                             $this->logger->logError($error_message, __METHOD__ . ',' . __LINE__);
                             sleep(30);
                             $comments = $this->youtube_api_v2_accessor->basicApiRequest($test['url']);
                         }
                     }
                 } while ($test['next']);
             }
             // If we have another page of videos then get the token for the page
             if (isset($user_videos->nextPageToken)) {
                 $videos_fields['pageToken'] = $user_videos->nextPageToken;
             }
         }
         // If we have another page of videos and haven't loaded all this users video yet keep going
         // if we have loaded all this users videos then stop after 1 page (50 videos)
     } while (isset($user_videos->nextPageToken) && !$archive_loaded);
     // If we didn't have to finish the crawl early due to timing out we have collected all this users videos or
     // we have tried more than 20 times stop trying to go back and load the post archive
     if (!$had_to_finish_early || $attempts >= 20) {
         $instance_dao->setPostArchiveLoaded($user_id, 'youtube');
     }
 }
Пример #10
0
 /**
  * Convert parsed JSON of a profile or page's posts into ThinkUp posts, users, links and likes, and store them.
  *
  * For each entry in $stream->data this method:
  *  - stores the post and any link attached to it, unless the post is already in storage;
  *  - stores the post's comments, following the paging links when the stream only holds part of the thread;
  *  - stores the users who liked the post and the likes themselves, again following paging links.
  * Comment and like processing is skipped for a post already in storage whose cached counts are at least as
  * high as the counts reported in the stream. That decision is made independently for every post.
  *
  * @param Object $stream Decoded stream; $stream->data is iterated as the list of posts
  * @param str $network The network for the posts; 'facebook' posts are stored as protected, others as public
  */
 private function processStream($stream, $network)
 {
     $thinkup_posts = array();
     $total_added_posts = 0;
     $thinkup_users = array();
     $total_added_users = 0;
     $thinkup_links = array();
     $total_links_added = 0;
     $thinkup_likes = array();
     $total_added_likes = 0;
     // Author profiles fetched so far, keyed by the author's network user ID
     $profiles = array();
     $post_dao = DAOFactory::getDAO('PostDAO');
     foreach ($stream->data as $p) {
         $post_id = explode("_", $p->id);
         $post_id = $post_id[1];
         // A stream can contain posts from several authors, so resolve the profile per post author rather
         // than reusing the first one. A failed (null) lookup is not cached and is retried on the next post.
         if (!isset($profiles[$p->from->id])) {
             $profiles[$p->from->id] = $this->fetchUserInfo($p->from->id, $network, 'Post stream');
         }
         $profile = $profiles[$p->from->id];
         // ID that comments and likes on this post are attributed to; fall back to the stream's author ID
         // when the profile could not be fetched so we never dereference null
         $post_author_id = isset($profile->user_id) ? $profile->user_id : $p->from->id;
         //assume profile comments are private and page posts are public
         $is_protected = $network == 'facebook' ? 1 : 0;
         //get likes count; the stream reports it either as a bare integer or as an object with a count
         $likes_count = 0;
         if (isset($p->likes)) {
             if (is_int($p->likes)) {
                 $likes_count = $p->likes;
             } elseif (isset($p->likes->count) && is_int($p->likes->count)) {
                 $likes_count = $p->likes->count;
             }
         }
         //Figure out if we have to process likes and comments. These flags must start out true for every
         //post, otherwise one up-to-date post would switch off processing for the rest of the stream.
         $must_process_likes = true;
         $must_process_comments = true;
         $post_in_storage = $post_dao->getPost($post_id, $network);
         if (isset($post_in_storage)) {
             if ($post_in_storage->favlike_count_cache >= $likes_count) {
                 $must_process_likes = false;
                 $this->logger->logInfo("Already have " . $likes_count . " likes for post ID " . $post_id . "; Skipping like processing this crawler run", __METHOD__ . ',' . __LINE__);
             }
             if (isset($p->comments->count) && $post_in_storage->reply_count_cache >= $p->comments->count) {
                 $must_process_comments = false;
                 $this->logger->logInfo("Already have " . $p->comments->count . " comments for post ID " . $post_id . "; Skipping comments processing", __METHOD__ . ',' . __LINE__);
             }
         }
         //store the post itself (and its link) only if it is new and we know who wrote it
         if (isset($profile) && !isset($post_in_storage)) {
             $posts_to_process = array("post_id" => $post_id, "author_username" => $profile->username, "author_fullname" => $profile->username, "author_avatar" => $profile->avatar, "author_user_id" => $p->from->id, "post_text" => isset($p->message) ? $p->message : '', "pub_date" => $p->created_time, "favlike_count_cache" => $likes_count, "in_reply_to_user_id" => '', "in_reply_to_post_id" => '', "source" => '', 'network' => $network, 'is_protected' => $is_protected, 'location' => $profile->location);
             array_push($thinkup_posts, $posts_to_process);
             $total_added_posts = $total_added_posts + $this->storePostsAndAuthors($thinkup_posts, "Owner stream");
             //free up memory
             $thinkup_posts = array();
             if (isset($p->source) || isset($p->link)) {
                 // there's a link to store
                 $link_url = isset($p->source) ? $p->source : $p->link;
                 $link = new Link(array("url" => $link_url, "expanded_url" => $link_url, "image_src" => isset($p->picture) ? $p->picture : '', "caption" => isset($p->caption) ? $p->caption : '', "description" => isset($p->description) ? $p->description : '', "title" => isset($p->name) ? $p->name : '', "network" => $network, "post_id" => $post_id));
                 array_push($thinkup_links, $link);
             }
             $total_links_added = $total_links_added + $this->storeLinks($thinkup_links);
             //free up memory
             $thinkup_links = array();
         }
         //process comments
         if ($must_process_comments && isset($p->comments)) {
             $comments_captured = 0;
             //comments delivered inline with the post
             if (isset($p->comments->data) && is_array($p->comments->data)) {
                 foreach ($p->comments->data as $c) {
                     if (isset($c->from)) {
                         //the comment's own ID is the last underscore-separated segment
                         $comment_id = explode("_", $c->id);
                         $comment_id = $comment_id[sizeof($comment_id) - 1];
                         //Get posts
                         $posts_to_process = array("post_id" => $comment_id, "author_username" => $c->from->name, "author_fullname" => $c->from->name, "author_avatar" => 'https://graph.facebook.com/' . $c->from->id . '/picture', "author_user_id" => $c->from->id, "post_text" => $c->message, "pub_date" => $c->created_time, "in_reply_to_user_id" => $post_author_id, "in_reply_to_post_id" => $post_id, "source" => '', 'network' => $network, 'is_protected' => $is_protected, 'location' => '');
                         array_push($thinkup_posts, $posts_to_process);
                         $comments_captured = $comments_captured + 1;
                     }
                 }
             }
             $total_added_posts = $total_added_posts + $this->storePostsAndAuthors($thinkup_posts, "Post stream comments");
             //free up memory
             $thinkup_posts = array();
             // collapsed comment thread: the stream reports more comments than it delivered, so page
             // through the post's comments connection
             if (isset($p->comments->count) && $p->comments->count > $comments_captured) {
                 $api_call = 'https://graph.facebook.com/' . $p->from->id . '_' . $post_id . '/comments?access_token=' . $this->access_token;
                 do {
                     $comments_stream = FacebookGraphAPIAccessor::rawApiRequest($api_call);
                     if (!isset($comments_stream->data) || !is_array($comments_stream->data)) {
                         // no comments (pun intended)
                         break;
                     }
                     foreach ($comments_stream->data as $c) {
                         if (isset($c->from)) {
                             $comment_id = explode("_", $c->id);
                             $comment_id = $comment_id[sizeof($comment_id) - 1];
                             //Get posts
                             $posts_to_process = array("post_id" => $comment_id, "author_username" => $c->from->name, "author_fullname" => $c->from->name, "author_avatar" => 'https://graph.facebook.com/' . $c->from->id . '/picture', "author_user_id" => $c->from->id, "post_text" => $c->message, "pub_date" => $c->created_time, "in_reply_to_user_id" => $post_author_id, "in_reply_to_post_id" => $post_id, "source" => '', 'network' => $network, 'is_protected' => $is_protected, 'location' => '');
                             array_push($thinkup_posts, $posts_to_process);
                         }
                     }
                     $total_added_posts = $total_added_posts + $this->storePostsAndAuthors($thinkup_posts, "Posts stream comments collapsed");
                     //free up memory
                     $thinkup_posts = array();
                     $has_next_page = isset($comments_stream->paging->next);
                     if ($has_next_page) {
                         $api_call = str_replace('\\u00257C', '|', $comments_stream->paging->next);
                     }
                 } while ($has_next_page);
             }
         }
         //process "likes"
         if ($must_process_likes && isset($p->likes)) {
             $likes_captured = 0;
             //likes delivered inline with the post
             if (isset($p->likes->data) && is_array($p->likes->data)) {
                 foreach ($p->likes->data as $l) {
                     if (isset($l->name) && isset($l->id)) {
                         //Get users
                         $ttu = array("user_name" => $l->name, "full_name" => $l->name, "user_id" => $l->id, "avatar" => 'https://graph.facebook.com/' . $l->id . '/picture', "location" => '', "description" => '', "url" => '', "is_protected" => 1, "follower_count" => 0, "post_count" => 0, "joined" => '', "found_in" => "Likes", "network" => 'facebook');
                         //Users are always set to network=facebook
                         array_push($thinkup_users, $ttu);
                         $fav_to_add = array("favoriter_id" => $l->id, "network" => $network, "author_user_id" => $post_author_id, "post_id" => $post_id);
                         array_push($thinkup_likes, $fav_to_add);
                         $likes_captured = $likes_captured + 1;
                     }
                 }
             }
             $total_added_users = $total_added_users + $this->storeUsers($thinkup_users, "Likes");
             $total_added_likes = $total_added_likes + $this->storeLikes($thinkup_likes);
             //free up memory
             $thinkup_users = array();
             $thinkup_likes = array();
             // collapsed likes: the stream reports more likes than it delivered, so page through the
             // post's likes connection
             if (isset($p->likes->count) && $p->likes->count > $likes_captured) {
                 $api_call = 'https://graph.facebook.com/' . $p->from->id . '_' . $post_id . '/likes?access_token=' . $this->access_token;
                 do {
                     $likes_stream = FacebookGraphAPIAccessor::rawApiRequest($api_call);
                     if (!isset($likes_stream->data) || !is_array($likes_stream->data)) {
                         // no likes
                         break;
                     }
                     foreach ($likes_stream->data as $l) {
                         if (isset($l->name) && isset($l->id)) {
                             //Get users
                             $ttu = array("user_name" => $l->name, "full_name" => $l->name, "user_id" => $l->id, "avatar" => 'https://graph.facebook.com/' . $l->id . '/picture', "location" => '', "description" => '', "url" => '', "is_protected" => 1, "follower_count" => 0, "post_count" => 0, "joined" => '', "found_in" => "Likes", "network" => 'facebook');
                             //Users are always set to network=facebook
                             array_push($thinkup_users, $ttu);
                             $fav_to_add = array("favoriter_id" => $l->id, "network" => $network, "author_user_id" => $post_author_id, "post_id" => $post_id);
                             array_push($thinkup_likes, $fav_to_add);
                             $likes_captured = $likes_captured + 1;
                         }
                     }
                     $total_added_users = $total_added_users + $this->storeUsers($thinkup_users, "Likes");
                     $total_added_likes = $total_added_likes + $this->storeLikes($thinkup_likes);
                     //free up memory
                     $thinkup_users = array();
                     $thinkup_likes = array();
                     $has_next_page = isset($likes_stream->paging->next);
                     if ($has_next_page) {
                         $api_call = str_replace('\\u00257C', '|', $likes_stream->paging->next);
                     }
                 } while ($has_next_page);
             }
         }
     }
     //report what this stream contributed
     if ($total_added_posts > 0) {
         $this->logger->logUserSuccess("Collected {$total_added_posts} posts", __METHOD__ . ',' . __LINE__);
     } else {
         $this->logger->logUserInfo("No new posts found.", __METHOD__ . ',' . __LINE__);
     }
     if ($total_links_added > 0) {
         $this->logger->logUserSuccess("Collected {$total_links_added} new links", __METHOD__ . ',' . __LINE__);
     }
     if ($total_added_users > 0) {
         $this->logger->logUserSuccess("Collected {$total_added_users} users", __METHOD__ . ',' . __LINE__);
     } else {
         $this->logger->logUserInfo("No new users found.", __METHOD__ . ',' . __LINE__);
     }
     if ($total_added_likes > 0) {
         $this->logger->logUserSuccess("Collected {$total_added_likes} likes", __METHOD__ . ',' . __LINE__);
     } else {
         $this->logger->logUserInfo("No new likes found.", __METHOD__ . ',' . __LINE__);
     }
 }
Пример #11
0
    /**
     * Crawl the instance user's favorites and record the crawl progress on the instance.
     *
     * This method, and the two supporting private methods 'maintFavsFetch' and 'archivingFavsFetch', provide the
     * primary crawler functionality for adding the user's favorites to the database. The crawl has two stages:
     *
     * - Archiving mode (the starting mode): work forwards from the last (oldest) page of favorites to the newest.
     *   This archiving crawl is only done once. The crawler tries to do it all in one go, but if it exhausts the
     *   available API calls, it continues where it left off in the next run.
     * - Maintenance mode (entered once page 1 has been reached in archiving mode): page back until the last fav
     *   previously processed is reached, then search back N more pages to catch any older tweets that were fav'd
     *   out of chronological order, where N is determined by the favs_older_pages option.
     *
     * The bookkeeping for these two stages is kept in the tu_instances entry for the user; this method writes
     * last_favorite_id, last_page_fetched_favorites and favorites_profile back to $this->instance when it is done.
     *
     * The Twitter favorites API has bugs that need to be worked around. The comments below provide more detail,
     * but in a nutshell, these methods can not currently use information from Twitter to calculate loop
     * termination (so a bit more work may be done than necessary), and do not currently remove un-fav'd tweets
     * from the database.
     *
     * @return bool Always true, including when no API calls are available and nothing is fetched.
     */
    public function fetchInstanceFavorites() {
        // First, check that we have the resources to do work. This is deliberately the same condition the fetch
        // loop below tests: a run that cannot make a single call exits here, before the instance bookkeeping
        // (in particular favorites_profile, which the next archiving run uses to compute its start page) is
        // overwritten without any page having been fetched.
        if (!($this->api->available && $this->api->available_api_calls_for_crawler > 0)) {
            $this->logger->logInfo("terminating fetchInstanceFavorites-- no API calls available",
            __METHOD__.','.__LINE__);
            return true;
        }

        //@TODO Can we get this from API?
        $page_size = 20; // number of favs per page retrieved from the API call

        $this->logger->logUserInfo("Checking for new favorites.", __METHOD__.','.__LINE__);

        // State saved on the instance by the previous crawl, and the current count reported for the user
        $last_favorites_count = $this->instance->favorites_profile;
        $this->logger->logInfo("last favs count: $last_favorites_count", __METHOD__.','.__LINE__);
        $last_page_fetched_favorites = $this->instance->last_page_fetched_favorites;
        $last_fav_id = $this->instance->last_favorite_id;
        $curr_favs_count = $this->user->favorites_count;
        $this->logger->logInfo("curr favs count: $curr_favs_count", __METHOD__.','.__LINE__);

        // Highest page number worth requesting, derived from the API's archive limit
        $last_page_of_favs = round($this->api->archive_limit / $page_size);

        // An empty value means favorites have not been crawled before; normalize to 0
        if ($last_page_fetched_favorites == "") {
            $last_page_fetched_favorites = 0;
        }
        $this->logger->logInfo("got last_page_fetched_favorites: $last_page_fetched_favorites",
        __METHOD__.','.__LINE__);
        if ($last_fav_id == "") {
            $last_fav_id = 0;
        }

        // the owner favs count, from twitter, is currently unreliable and may be less than the actual number of
        // favs, by a large margin.  So, we still go ahead and calculate the number of 'missing' tweets based on
        // this info, but currently do not use it for fetch loop termination.
        $this->logger->logInfo("owner favs: " . $this->user->favorites_count . ", instance owner favs in system: ".
        $this->instance->owner_favs_in_system, __METHOD__.','.__LINE__);
        $favs_missing = $this->user->favorites_count - $this->instance->owner_favs_in_system;
        $this->logger->logInfo("favs missing: $favs_missing", __METHOD__.','.__LINE__);

        // figure out if we're in 'archiving' or 'maintenance' mode, via # of last_page_fetched_favorites
        $mode = 0; // default is archiving/first-fetch
        if ($last_page_fetched_favorites == 1) {
            $mode = 1; // we are in maint. mode
            $new_favs_to_add = $favs_missing;
            $this->logger->logInfo("new favs to add/missing: $new_favs_to_add", __METHOD__.','.__LINE__);
            $mpage = 1;
            $starting_fav_id = $last_fav_id;
        } else {
            // we are in archiving mode.
            $new_favs_to_add = $curr_favs_count - $last_favorites_count;
            $this->logger->logInfo("new favs to add: $new_favs_to_add", __METHOD__.','.__LINE__);

            // figure out start page based on where we left off last time, and how many favs added since then
            $extra_pages = ceil($new_favs_to_add / $page_size);
            $this->logger->logInfo("extra pages: $extra_pages", __METHOD__.','.__LINE__);
            $finished_first_fetch = false;
            if ($last_page_fetched_favorites == 0) {
                // if at initial starting fetch (first time favs ever crawled)
                $last_page_fetched_favs_start = $extra_pages + 1;
            } else {
                $last_page_fetched_favs_start = $last_page_fetched_favorites + $extra_pages;
            }
            // never start beyond one page past the last page of favs
            if ($last_page_fetched_favs_start > $last_page_of_favs) {
                $last_page_fetched_favs_start = $last_page_of_favs + 1;
            }
        }

        $status_message = "total last favs count: $last_favorites_count" .
           ", last page fetched: $last_page_fetched_favorites, last fav id: $last_fav_id";
        $this->logger->logInfo($status_message, __METHOD__.','.__LINE__);
        $this->logger->logInfo("current favs count: $curr_favs_count" .
               ", new favs to add: $new_favs_to_add, last page of favs: $last_page_of_favs, mode: $mode",
        __METHOD__.','.__LINE__);

        // Loop state threaded through the fetch helpers, which hand back updated values on every call
        $continue = true;
        $fcount = 0;
        $older_favs_smode = false;
        $stop_page = 0;

        $status_message = "in fetchInstanceFavorites: API available: ".$this->api->available.", avail for crawler: ".
        $this->api->available_api_calls_for_crawler;
        $this->logger->logInfo($status_message, __METHOD__.','.__LINE__);

        // Keep fetching until the API budget is used up or a helper clears $continue
        while ($this->api->available && $this->api->available_api_calls_for_crawler > 0 && $continue) {
            if ($mode != 0) { // in maintenance, not archiving mode
                list($fcount, $mpage, $older_favs_smode, $stop_page, $new_favs_to_add, $last_fav_id,
                $last_page_fetched_favorites, $continue) =
                $this->maintFavsFetch ($starting_fav_id, $fcount, $mpage, $older_favs_smode, $stop_page,
                $new_favs_to_add, $last_fav_id, $last_page_fetched_favorites, $continue);
            } else { // mode 0 -- archiving mode
                if (!$finished_first_fetch) {
                    // The first archiving call of this run starts from the computed start page ...
                    $this->logger->logInfo("in 'first_archiving_fetch' clause", __METHOD__.','.__LINE__);
                    list($fcount, $last_fav_id, $last_page_fetched_favorites, $continue) =
                    $this->archivingFavsFetch($fcount, $last_fav_id, $last_page_fetched_favs_start, $continue);
                    $finished_first_fetch = true;
                } else {
                    // ... every later call continues from the page the previous call returned
                    list($fcount, $last_fav_id, $last_page_fetched_favorites, $continue) =
                    $this->archivingFavsFetch($fcount, $last_fav_id, $last_page_fetched_favorites, $continue);
                }
            }
        } // end while

        // update necessary instance fields
        $this->logger->logInfo("new_favs_to_add: $new_favs_to_add, fcount: $fcount", __METHOD__.','.__LINE__);
        $this->logger->logInfo("new 'last fav id': $last_fav_id", __METHOD__.','.__LINE__);

        $this->instance->last_favorite_id = $last_fav_id;
        $this->instance->last_page_fetched_favorites = $last_page_fetched_favorites;
        $this->instance->favorites_profile = $curr_favs_count;
        $this->logger->logUserSuccess("Saved $fcount new favorites.", __METHOD__.','.__LINE__);
        return true;
    }