/** * Fetch and save the instance user's friends. * * if is_archive_loaded * if ($this->instance->total_friends_in_system !== $this->user->friend_count) { * is_archive_loaded = false; * * if !is_archive_loaded * if follows_next_cursor is set * pageThroughFriends(follows_next_cursor) * else * pageThroughFriends() * * if is_archive_loaded * updateStaleFollows() * */ public function fetchFriends() { if (!isset($this->user)) { //Force-refresh instance user in data store $this->user = self::fetchUser($this->instance->network_user_id, 'Owner info', $this->instance->network_username, null, null, true); } $follow_dao = DAOFactory::getDAO('FollowDAO'); $this->instance->total_friends_in_system = $follow_dao->countTotalFriends($this->instance->network_user_id, 'instagram'); $this->logger->logUserInfo($this->instance->total_friends_in_system . " friends in system, " . $this->user->friend_count . " friends according to Instagram", __METHOD__ . ',' . __LINE__); if ($this->instance->total_friends_in_system < $this->user->friend_count) { $this->instance->is_archive_loaded_friends = false; } else { $this->instance->is_archive_loaded_friends = true; } //If archive is not loaded, page through friends if (!$this->instance->is_archive_loaded_friends) { $this->logger->logInfo("Friend archive is not loaded, start paging", __METHOD__ . ',' . __LINE__); $this->pageThroughFriends($this->instance->follows_next_cursor); } //If archive is loaded, updateStaleFollows if ($this->instance->is_archive_loaded_friends) { $this->logger->logInfo("Friend archive loaded, start updating stale friendships", __METHOD__ . ',' . __LINE__); $this->updateStaleFollows(true); } }
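The archive-loaded decision above reduces to a single comparison (the original elseif and else branches assigned the same value). A minimal standalone sketch of that check, with a hypothetical helper name:

<?php
/**
 * Hypothetical helper: the friend archive counts as loaded once the datastore holds
 * at least as many friends as Instagram reports for the instance user.
 */
function isFriendArchiveLoaded($total_friends_in_system, $friend_count_on_network) {
    return $total_friends_in_system >= $friend_count_on_network;
}

// Example: 480 friends stored locally vs. 500 reported by Instagram -> keep paging.
var_dump(isFriendArchiveLoaded(480, 500)); // bool(false)
var_dump(isFriendArchiveLoaded(500, 500)); // bool(true)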
/** * Fetch and save the posts and replies on a Facebook page. * @param int $pid Page ID */ public function fetchPagePostsAndReplies($pid) { $stream = FacebookGraphAPIAccessor::apiRequest('/'.$pid.'/posts', $this->access_token); if (isset($stream->data) && is_array($stream->data) && sizeof($stream->data) > 0) { $this->logger->logSuccess(sizeof($stream->data)." Facebook posts found for page ID $pid.", __METHOD__.','.__LINE__); $thinkup_data = $this->parseStream($stream, 'facebook page'); $posts = $thinkup_data["posts"]; $post_dao = DAOFactory::getDAO('PostDAO'); $added_posts = 0; foreach ($posts as $post) { if ($post['author_username']== "" && isset($post['author_user_id'])) { $commenter_object = $this->fetchUserInfo($post['author_user_id'], 'facebook', 'Facebook page comments'); if (isset($commenter_object)) { $post["author_username"] = $commenter_object->full_name; $post["author_fullname"] = $commenter_object->full_name; $post["author_avatar"] = $commenter_object->avatar; } } $added_posts = $added_posts + $post_dao->addPost($post); $this->logger->logInfo("Added post ID ".$post["post_id"]." on ".$post["network"]. " for ".$post["author_username"].":".$post["post_text"], __METHOD__.','.__LINE__); } $added_users = 0; $users = $thinkup_data["users"]; if (count($users) > 0) { foreach ($users as $user) { $user["post_count"] = $post_dao->getTotalPostsByUser($user['user_id'], $user['network']); $found_in = 'Facebook page stream'; $user_object = new User($user, $found_in); $user_dao = DAOFactory::getDAO('UserDAO'); $user_dao->updateUser($user_object); $added_users = $added_users + 1; } } if ($added_posts > 0 || $added_users > 0) { $this->logger->logUserSuccess($added_posts." post(s) added; ".$added_users." user(s) updated.", __METHOD__.','.__LINE__); } else { $this->logger->logUserInfo("No new page posts found.", __METHOD__.','.__LINE__); } } else { $this->logger->logInfo("No Facebook posts found for page ID $pid", __METHOD__.','.__LINE__); } }
/** * Fetch instance user's favorites since the last favorite stored. */ public function fetchInstanceUserFavorites() { if (!isset($this->user)) { $this->fetchInstanceUserInfo(); } $this->logger->logUserInfo("Checking for new favorites.", __METHOD__ . ',' . __LINE__); $last_fav_id = $this->instance->last_favorite_id; $this->logger->logInfo("Owner favs: " . $this->user->favorites_count . ", instance owner favs in system: " . $this->instance->owner_favs_in_system, __METHOD__ . ',' . __LINE__); $continue = true; while ($continue) { list($tweets, $http_status, $payload) = $this->getFavorites($last_fav_id); if ($http_status == 200) { if (sizeof($tweets) == 0) { // then done -- this should happen when we have run out of favs $this->logger->logInfo("It appears that we have run out of favorites to process", __METHOD__ . ',' . __LINE__); $continue = false; } else { $post_dao = DAOFactory::getDAO('FavoritePostDAO'); $fav_count = 0; foreach ($tweets as $tweet) { $tweet['network'] = 'twitter'; if ($post_dao->addFavorite($this->user->user_id, $tweet) > 0) { URLProcessor::processPostURLs($tweet['post_text'], $tweet['post_id'], 'twitter', $this->logger); $this->logger->logInfo("Found new fav: " . $tweet['post_id'], __METHOD__ . ',' . __LINE__); $fav_count++; $this->logger->logInfo("Fav count: {$fav_count}", __METHOD__ . ',' . __LINE__); $this->logger->logInfo("Added favorite: " . $tweet['post_id'], __METHOD__ . ',' . __LINE__); } else { // fav was already stored, so take no action. This could happen both because some // of the favs on the given page were processed last time, or because a separate process, // such as a UserStream process, is also watching for and storing favs. //$status_message = "Have already stored fav ". $tweet['post_id']; //$this->logger->logDebug($status_message, __METHOD__.','.__LINE__); } // keep track of the highest fav id we've encountered if ($tweet['post_id'] > $last_fav_id) { $last_fav_id = $tweet['post_id']; } } // end foreach } } else { $continue = false; } } }
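The loop above relies on a simple watermark: paging stops once a page of favorites comes back empty, and the highest post_id seen becomes the point the next crawl resumes from. A minimal sketch of that bookkeeping, with a hypothetical helper name:

<?php
/**
 * Hypothetical helper: fold one page of favorited tweets into the running watermark,
 * returning the highest post ID seen so far (the next crawl resumes from it).
 */
function advanceFavWatermark(array $tweets, $last_fav_id) {
    foreach ($tweets as $tweet) {
        if ($tweet['post_id'] > $last_fav_id) {
            $last_fav_id = $tweet['post_id'];
        }
    }
    return $last_fav_id;
}

// Example: resuming from ID 100, a page containing IDs 104 and 102 moves the watermark to 104.
echo advanceFavWatermark(array(array('post_id' => 104), array('post_id' => 102)), 100); // 104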
/** * Fetch and save the posts and replies for the crawler's instance. This function will loop back through the * user's or pages archive of posts. * @return void * @throws APIOAuthException */ public function fetchPostsAndReplies() { $id = $this->instance->network_user_id; $network = $this->instance->network; // fetch user's friends $this->fetchAndStoreFriends(); $fetch_next_page = true; $current_page_number = 1; $next_api_request = 'https://graph.facebook.com/' . $id . '/feed?access_token=' . $this->access_token; //Cap crawl time for very busy pages with thousands of likes/comments $fetch_stop_time = time() + $this->max_crawl_time; //Determine 'since', datetime of oldest post in datastore $post_dao = DAOFactory::getDAO('PostDAO'); $since_post = $post_dao->getAllPosts($id, $network, 1, 1, true, 'pub_date', 'ASC'); $since = isset($since_post[0]) ? $since_post[0]->pub_date : 0; $since = strtotime($since) - 60 * 60 * 24; // last post minus one day, just to be safe $since < 0 ? $since = 0 : ($since = $since); while ($fetch_next_page) { $stream = FacebookGraphAPIAccessor::rawApiRequest($next_api_request, true); if (isset($stream->data) && is_array($stream->data) && sizeof($stream->data) > 0) { $this->logger->logInfo(sizeof($stream->data) . " Facebook posts found on page " . $current_page_number, __METHOD__ . ',' . __LINE__); $this->processStream($stream, $network, $current_page_number); if (isset($stream->paging->next)) { $next_api_request = $stream->paging->next . '&since=' . $since; $current_page_number++; } else { $fetch_next_page = false; } } elseif (isset($stream->error->type) && $stream->error->type == 'OAuthException') { throw new APIOAuthException($stream->error->message); } else { $this->logger->logInfo("No Facebook posts found for ID {$id}", __METHOD__ . ',' . __LINE__); $fetch_next_page = false; } if (time() > $fetch_stop_time) { $fetch_next_page = false; $this->logger->logUserInfo("Stopping this service user's crawl because it has exceeded max time of " . $this->max_crawl_time / 60 . " minute(s). ", __METHOD__ . ',' . __LINE__); } } }
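The 'since' value used above is just the oldest stored post's pub_date backed off by one day and floored at zero. A small sketch of that computation (helper name hypothetical):

<?php
/**
 * Hypothetical helper mirroring the 'since' computation above: oldest stored pub_date
 * minus a one-day safety margin, never below zero (zero means no lower bound).
 */
function computeSince($oldest_pub_date = null) {
    if ($oldest_pub_date === null) {
        return 0;
    }
    $since = strtotime($oldest_pub_date) - (60 * 60 * 24);
    return ($since < 0) ? 0 : $since;
}

// Example usage: appended to the Graph API paging URL as &since=...
$next_api_request = 'https://graph.facebook.com/12345/feed?access_token=TOKEN' .
    '&since=' . computeSince('2013-04-01 12:00:00');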
/** * Expand Bit.ly links and recheck click count on any links less than 2 days old. * * @param str $api_key Bit.ly API key * @param str $bit_login Bit.ly login name */ public function acquireBitlyClickStats($api_key, $bit_login) { $this->logger->setUsername(null); $api_accessor = new BitlyAPIAccessor($api_key, $bit_login); $bitly_urls = array('http://bit.ly/', 'http://bitly.com/', 'http://j.mp/'); foreach ($bitly_urls as $bitly_url) { if ($this->link_limit != 0) { //all short links first seen in the last 48 hours $bitly_links_to_update = $this->short_link_dao->getLinksToUpdate($bitly_url); if (count($bitly_links_to_update) > 0) { $this->logger->logUserInfo(count($bitly_links_to_update) . " {$bitly_url}" . " links to acquire click stats for.", __METHOD__ . ',' . __LINE__); } else { $this->logger->logUserInfo("There are no " . $bitly_url . " links to fetch click stats for.", __METHOD__ . ',' . __LINE__); } $total_links = 0; $total_errors = 0; $total_updated = 0; foreach ($bitly_links_to_update as $link) { $this->logger->logInfo("Getting bit.ly click stats for " . ($total_updated + 1) . " of " . count($bitly_links_to_update) . " " . $bitly_url . " links (" . $link->short_url . ")", __METHOD__ . ',' . __LINE__); $link_data = $api_accessor->getBitlyLinkData($link->short_url); if ($link_data["clicks"] != '') { //save click total here $this->short_link_dao->saveClickCount($link->short_url, $link_data["clicks"]); // Save title to links table if ($link_data["title"] != '') { $this->link_dao->updateTitle($link->link_id, $link_data["title"]); } $total_links = $total_links + 1; $total_updated = $total_updated + 1; } elseif ($link_data["error"] != '') { $this->link_dao->saveExpansionError($link->short_url, $link_data["error"]); $total_errors = $total_errors + 1; $total_updated = $total_updated + 1; } } $this->logger->logUserSuccess($total_links . " " . $bitly_url . " link click stats acquired (" . $total_errors . " errors)", __METHOD__ . ',' . __LINE__); } } }
/** * This method, and the two supporting private methods 'maintFavsFetch' and 'archivingFavsFetch', provide the * primary crawler functionality for adding the user's favorites to the database. * For a given user, the process starts in 'archiving mode', by * working forwards from the last (oldest) page of tweets to the newest. This archiving crawl * is only done once. The crawler tries to do this all in one go, but if it exhausts the available API count, * it will continue where it left off in the next run. * Then, when page 1 is reached in archiving mode, the crawler goes into 'maintenance mode' and works * backwards from then on. It first pages back until * it has reached the last fav it previously processed. Then it searches back N more pages to catch any older * tweets that were fav'd out of chronological order, where N is determined by favs_older_pages option. * The bookkeeping for these two crawler stages is maintained in the in tu_instances entry for the user. * * Recently, the Twitter favorites API has developed some bugs that need to be worked around. The comments below * provide more detail, but in a nutshell, these methods can not currently use information from Twitter to * calculate loop termination (so a bit more work may be done than necessary), and do not currently remove un-fav'd * tweets from the database. Hopefully these API issues will be fixed by Twitter in future. */ public function fetchInstanceFavorites() { // first, check that we have the resources to do work if (!($this->api->available && $this->api->available_api_calls_for_crawler)) { $this->logger->logInfo("terminating fetchInstanceFavorites-- no API calls available", __METHOD__ . ',' . __LINE__); return true; } $status_message = ""; //@TODO Can we get this from API? $page_size = 20; // number of favs per page retrieved from the API call $this->logger->logUserInfo("Checking for new favorites.", __METHOD__ . ',' . __LINE__); $last_favorites_count = $this->instance->favorites_profile; $this->logger->logInfo("last favs count: {$last_favorites_count}", __METHOD__ . ',' . __LINE__); $last_page_fetched_favorites = $this->instance->last_page_fetched_favorites; $last_fav_id = $this->instance->last_favorite_id; $curr_favs_count = $this->user->favorites_count; $this->logger->logInfo("curr favs count: {$curr_favs_count}", __METHOD__ . ',' . __LINE__); $last_page_of_favs = round($this->api->archive_limit / $page_size); // under normal circs the latter clause below should never hold, but due to a previously-existing // bug that could set a negative last_page_fetched_favorites value in the db in some cases, // it is necessary for recovery. if ($last_page_fetched_favorites == "" || $last_page_fetched_favorites < 0) { $last_page_fetched_favorites = 0; } $this->logger->logInfo("got last_page_fetched_favorites: {$last_page_fetched_favorites}", __METHOD__ . ',' . __LINE__); if ($last_fav_id == "") { $last_fav_id = 0; } // the owner favs count, from twitter, is currently unreliable and may be less than the actual number of // favs, by a large margin. So, we still go ahead and calculate the number of 'missing' tweets based on // this info, but currently do not use it for fetch loop termination. $this->logger->logInfo("owner favs: " . $this->user->favorites_count . ", instance owner favs in system: " . $this->instance->owner_favs_in_system, __METHOD__ . ',' . __LINE__); $favs_missing = $this->user->favorites_count - $this->instance->owner_favs_in_system; $this->logger->logInfo("favs missing: {$favs_missing}", __METHOD__ . ',' . 
__LINE__); // figure out if we're in 'archiving' or 'maintenance' mode, via # of last_page_fetched_favorites $mode = 0; // default is archiving/first-fetch if ($last_page_fetched_favorites == 1) { $mode = 1; // we are in maint. mode $new_favs_to_add = $favs_missing; $this->logger->logInfo("new favs to add/missing: {$new_favs_to_add}", __METHOD__ . ',' . __LINE__); $mpage = 1; $starting_fav_id = $last_fav_id; } else { // we are in archiving mode. $new_favs_to_add = $curr_favs_count - $last_favorites_count; // twitter profile information is not always consistent, so ensure that this value is not negative if ($new_favs_to_add < 0) { $new_favs_to_add = 0; } $this->logger->logInfo("new favs to add: {$new_favs_to_add}", __METHOD__ . ',' . __LINE__); // figure out start page based on where we left off last time, and how many favs added since then $extra_pages = ceil($new_favs_to_add / $page_size); $this->logger->logInfo("extra pages: {$extra_pages}", __METHOD__ . ',' . __LINE__); $finished_first_fetch = false; if ($last_page_fetched_favorites == 0) { // if at initial starting fetch (first time favs ever crawled) if ($extra_pages == 0) { $extra_pages = 1; // always check at least one page on initial fetch } $last_page_fetched_favs_start = $extra_pages + 1; } else { $last_page_fetched_favs_start = $last_page_fetched_favorites + $extra_pages; } if ($last_page_fetched_favs_start > $last_page_of_favs) { $last_page_fetched_favs_start = $last_page_of_favs + 1; } } $status_message = "total last favs count: {$last_favorites_count}" . ", last page fetched: {$last_page_fetched_favorites}, last fav id: {$last_fav_id}"; $this->logger->logInfo($status_message, __METHOD__ . ',' . __LINE__); $this->logger->logInfo("current favs count: {$curr_favs_count}" . ", new favs to add: {$new_favs_to_add}, last page of favs: {$last_page_of_favs}, mode: {$mode}", __METHOD__ . ',' . __LINE__); $continue = true; $fcount = 0; $older_favs_smode = false; $stop_page = 0; $status_message = "in fetchInstanceFavorites: API available: " . $this->api->available . ", avail for crawler: " . $this->api->available_api_calls_for_crawler; $this->logger->logInfo($status_message, __METHOD__ . ',' . __LINE__); while ($this->api->available && $this->api->available_api_calls_for_crawler > 0 && $continue) { try { if ($mode != 0) { // in maintenance, not archiving mode list($fcount, $mpage, $older_favs_smode, $stop_page, $new_favs_to_add, $last_fav_id, $last_page_fetched_favorites, $continue) = $this->maintFavsFetch($starting_fav_id, $fcount, $mpage, $older_favs_smode, $stop_page, $new_favs_to_add, $last_fav_id, $last_page_fetched_favorites, $continue); // } } else { // mode 0 -- archiving mode if (!$finished_first_fetch) { list($fcount, $last_fav_id, $last_page_fetched_favorites, $continue) = $this->archivingFavsFetch($fcount, $last_fav_id, $last_page_fetched_favs_start, $continue); $finished_first_fetch = true; } else { list($fcount, $last_fav_id, $last_page_fetched_favorites, $continue) = $this->archivingFavsFetch($fcount, $last_fav_id, $last_page_fetched_favorites, $continue); } } } catch (APICallLimitExceededException $e) { break; } } // end while // update necessary instance fields $this->logger->logInfo("new_favs_to_add: {$new_favs_to_add}, fcount: {$fcount}", __METHOD__ . ',' . __LINE__); $this->logger->logInfo("new 'last fav id': {$last_fav_id}", __METHOD__ . ',' .
__LINE__); $this->instance->last_favorite_id = $last_fav_id; $this->instance->last_page_fetched_favorites = $last_page_fetched_favorites; $this->instance->favorites_profile = $curr_favs_count; $this->logger->logUserSuccess("Saved {$fcount} new favorites.", __METHOD__ . ',' . __LINE__); return true; }
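In archiving mode the start page is derived arithmetically from how many favorites appeared since the last crawl. A worked sketch of that calculation (helper name hypothetical), using the same 20-favorite page size as above:

<?php
/**
 * Hypothetical helper: compute the page to resume archiving favorites from.
 * Mirrors the arithmetic above: extra pages = ceil(new favs / page size), added to
 * the last page fetched, capped just past the page implied by the API archive limit.
 */
function favsStartPage($new_favs_to_add, $last_page_fetched, $archive_limit, $page_size = 20) {
    $last_page_of_favs = round($archive_limit / $page_size);
    $extra_pages = ceil(max(0, $new_favs_to_add) / $page_size);
    if ($last_page_fetched == 0 && $extra_pages == 0) {
        $extra_pages = 1; // always check at least one page on the initial fetch
    }
    $start = ($last_page_fetched == 0) ? $extra_pages + 1 : $last_page_fetched + $extra_pages;
    return min($start, $last_page_of_favs + 1);
}

// Example: 45 new favorites since last crawl, last page fetched was 12, 3200-tweet archive limit.
echo favsStartPage(45, 12, 3200); // 15 (12 + ceil(45 / 20))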
/** * Fetch and save the posts and replies for the crawler's instance. This function will loop back through the * user's or pages archive of posts. * @return void * @throws APIOAuthException */ public function fetchPostsAndReplies() { $id = $this->instance->network_user_id; $network = $this->instance->network; $fetch_next_page = true; $current_page_number = 1; $next_api_request = $id . '/feed'; $fields = self::$feed_fields; //Cap crawl time for very busy pages with thousands of likes/comments $fetch_stop_time = time() + $this->max_crawl_time; $api_request_params = null; $use_full_api_url = false; $dig_into_archives = false; while ($fetch_next_page) { if (!$use_full_api_url) { $stream = FacebookGraphAPIAccessor::apiRequest($next_api_request, $this->access_token, $api_request_params, $fields); $api_request_params = null; } else { //Use full paging URL $stream = FacebookGraphAPIAccessor::apiRequestFullURL($next_api_request, $this->access_token); } if (isset($stream->data) && is_array($stream->data) && sizeof($stream->data) > 0) { $this->logger->logInfo(sizeof($stream->data) . " Facebook posts found on page " . $current_page_number, __METHOD__ . ',' . __LINE__); $total_added_posts = $this->processStream($stream, $network, $current_page_number); if ($total_added_posts == 0) { //No new posts were found, try going back into the archives if (!$dig_into_archives) { $dig_into_archives = true; //Determine 'since', datetime of oldest post in datastore $post_dao = DAOFactory::getDAO('PostDAO'); $since_post = $post_dao->getAllPosts($id, $network, 1, 1, true, 'pub_date', 'ASC'); $since = isset($since_post[0]) ? $since_post[0]->pub_date : 0; $since = strtotime($since); $this->logger->logInfo("No Facebook posts found for {$id} here, digging into archives since " . $since_post[0]->pub_date . " strtotime " . $since, __METHOD__ . ',' . __LINE__); $api_request_params = array('since' => $since); $use_full_api_url = false; $next_api_request = $id . '/feed'; } else { if (isset($stream->paging->next)) { $next_api_request = $stream->paging->next; $use_full_api_url = true; //DEBUG $this->logger->logInfo("Dug into archives, next page API request is " . $next_api_request, __METHOD__ . ',' . __LINE__); $current_page_number++; } else { $fetch_next_page = false; } } } else { if (isset($stream->paging->next)) { $next_api_request = $stream->paging->next; $use_full_api_url = true; //DEBUG $this->logger->logInfo("Next page API request is " . $next_api_request, __METHOD__ . ',' . __LINE__); $current_page_number++; } else { $fetch_next_page = false; } } } elseif (isset($stream->error->type) && $stream->error->type == 'OAuthException') { throw new APIOAuthException($stream->error->message); } else { $this->logger->logInfo("No Facebook posts found for ID {$id}", __METHOD__ . ',' . __LINE__); $fetch_next_page = false; } if (time() > $fetch_stop_time) { $fetch_next_page = false; $this->logger->logUserInfo("Stopping this service user's crawl because it has exceeded max time of " . $this->max_crawl_time / 60 . " minute(s). ", __METHOD__ . ',' . __LINE__); } } }
/** * Fetch and save the posts and replies for the crawler's instance. This function will loop back through the * user's or page's archive of posts. */ public function fetchPostsAndReplies() { $plugin_dao = DAOFactory::getDAO('PluginDAO'); $plugin_id = $plugin_dao->getPluginId('instagram'); $namespace = OptionDAO::PLUGIN_OPTIONS . '-' . $plugin_id; $id = $this->instance->network_user_id; $option_dao = DAOFactory::getDAO('OptionDAO'); $network = $this->instance->network; //Checks if last friends update is over 2 days ago and runs storeFriends if it is. $friends_last_updated = $option_dao->getOptionByName($namespace, 'last_crawled_friends'); $friends_last_updated_check = microtime(true) - 172800; if ($friends_last_updated == NULL) { $this->storeFriends(); $option_dao->insertOption($namespace, 'last_crawled_friends', microtime(true)); } elseif ($friends_last_updated->option_value < $friends_last_updated_check) { $this->storeFriends(); $option_dao->updateOptionByName($namespace, 'last_crawled_friends', microtime(true)); } $fetch_next_page = true; $current_page_number = 1; $api_param = array(); if ($this->instance->total_posts_in_system != 0) { $last_crawl = $this->instance->crawler_last_run; $unix_less_week = strtotime("-1 week", strtotime($last_crawl)); $api_param = array('min_timestamp' => $unix_less_week, 'count' => 20); } else { $api_param = array('count' => 20); } $this->logger->logUserInfo("About to request media", __METHOD__ . ',' . __LINE__); $posts = InstagramAPIAccessor::apiRequest('media', $id, $this->access_token, $api_param); $this->logger->logUserInfo("Media requested", __METHOD__ . ',' . __LINE__); //Cap crawl time for very busy pages with thousands of likes/comments $fetch_stop_time = time() + $this->max_crawl_time; //Determine 'since', datetime of oldest post in datastore $post_dao = DAOFactory::getDAO('PostDAO'); $since_post = $post_dao->getAllPosts($id, $network, 1, 1, true, 'pub_date', 'ASC'); $since = isset($since_post[0]) ? $since_post[0]->pub_date : 0; $since = strtotime($since) - 60 * 60 * 24; // last post minus one day, just to be safe if ($since < 0) { $since = 0; } while ($fetch_next_page) { if ($posts->count() > 0) { $this->logger->logInfo($posts->count() . " Instagram posts found on page " . $current_page_number, __METHOD__ . ',' . __LINE__); $this->processPosts($posts, $network, $current_page_number); if ($posts->getNext() != null) { $api_param['max_id'] = $posts->getNext(); $posts = InstagramAPIAccessor::apiRequest('media', $id, $this->access_token, $api_param); $current_page_number++; } else { $fetch_next_page = false; } } else { $this->logger->logInfo("No Instagram posts found for ID {$id}", __METHOD__ . ',' . __LINE__); $fetch_next_page = false; } if (time() > $fetch_stop_time) { $fetch_next_page = false; $this->logger->logUserInfo("Stopping this service user's crawl because it has exceeded max time of " . $this->max_crawl_time / 60 . " minute(s). ", __METHOD__ . ',' . __LINE__); } } }
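The one-week look-back above only needs a relative strtotime() offset anchored at the instance's last crawl time. A tiny sketch of that computation (helper name hypothetical):

<?php
/**
 * Hypothetical helper: Unix timestamp one week before the last crawl, used as
 * Instagram's min_timestamp parameter so only recent media is requested.
 */
function minTimestampOneWeekBefore($crawler_last_run) {
    return strtotime('-1 week', strtotime($crawler_last_run));
}

// Example usage:
$api_param = array('min_timestamp' => minTimestampOneWeekBefore('2013-07-15 08:30:00'), 'count' => 20);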
/** * Collects and stores information about the user's videos from the YouTube APIs. * Currently collects and stores: * - Basic video information such as title, author, description and location the video was shot in (if available) * - Replies to the video * -- This uses the YouTube V2 API due to the V3 API currently not supporting replies * - All time counts for likes, dislikes, views, average view duration, average view percentage, favorites added, * favorites removed, shares, subscribers gained and subscribers lost * -- The totals for these are stored in the videos table, a history of these totals is stored in the * count_history table under a type of [metric]_all_time and today's date * -- A record of these metrics for individual days is also saved in the count_history table under a type of * [metric] and date of the day the metric represents, usually two days ago due to a delay in the availability * of data from the Analytics API * @return null */ public function fetchInstanceUserVideos() { $video_dao = DAOFactory::getDAO('VideoDAO'); $user_dao = DAOFactory::getDAO('UserDAO'); $post_dao = DAOFactory::getDAO('PostDAO'); $count_history_dao = DAOFactory::getDAO('CountHistoryDAO'); $instance_dao = DAOFactory::getDAO('InstanceDAO'); // Get the user's upload playlist ID $fields_for_ids = array('part' => 'contentDetails,statistics', 'mine' => 'true'); $various_ids = $this->youtube_api_accessor->apiRequest('channels', $this->access_token, $fields_for_ids); $upload_id = $various_ids->items[0]->contentDetails->relatedPlaylists->uploads; // Also get their channel ID as we'll need it later on $channel_id = $various_ids->items[0]->id; // There are some required attributes about the author that YouTube doesn't return for the videos so we need // to query the database for them $author_details = $user_dao->getDetails($this->instance->network_user_id, 'youtube'); $user_id = $this->instance->network_user_id; // Update the user's subscriber count $subscriber_count = $various_ids->items[0]->statistics->subscriberCount; $author_details->follower_count = $subscriber_count; $user_dao->updateUser($author_details); $count_history_dao->insert($user_id, 'youtube', $subscriber_count, null, 'subscriber_count'); // Calculate the time at which we should stop fetching videos $end_time = time() + $this->max_crawl_time; // Keep track of if we finished the crawl early due to timing out $had_to_finish_early = false; // Check if we already loaded all the old posts for this user $archive_loaded = $this->instance->is_archive_loaded_posts; // If the archive isn't loaded yet keep track of how many times we've tried to load it if (!$archive_loaded) { $attempts = $count_history_dao->getLatestCountByNetworkUserIDAndType($user_id, 'youtube', 'youtube_archive_attempts'); if ($attempts == null) { // If this is the first crawler run $attempts['count'] = 0; } $attempts['count']++; $count_history_dao->insert($user_id, 'youtube', $attempts['count'], null, 'youtube_archive_attempts', null); } // Now page through their videos collecting the data $videos_fields = array('part' => 'snippet', 'maxResults' => '25', 'playlistId' => $upload_id, 'pageToken' => null); // We may get multiple pages do { // This is a page of IDs of videos the user has uploaded $user_videos = $this->youtube_api_accessor->apiRequest('playlistItems', $this->access_token, $videos_fields); // For each video store the relevant details about it foreach ($user_videos->items as $video) { // If we've hit the max crawl time stop if (time() >= $end_time) {
$this->logger->logUserInfo("Stopping this service users crawl because it has exceeded max time of " . $this->max_crawl_time / 60 . " minute(s). ", __METHOD__ . ',' . __LINE__); $had_to_finish_early = true; break 2; } $video_id = $video->snippet->resourceId->videoId; // Get the title, description, likes, dislikes, views, and details about where // the video was taken from the data API $video_fields = array('id' => $video_id, 'part' => 'statistics,id,snippet,recordingDetails,status'); $video_details = $this->youtube_api_accessor->apiRequest('videos', $this->access_token, $video_fields); $item = $video_details->items[0]; // Check we haven't used up our quota if (isset($video_details->error)) { $this->logger->logError('Error querying YouTube Data API V3 ', __METHOD__ . ',' . __LINE__); break; } $video_attributes['post_text'] = $item->snippet->title; $video_attributes['description'] = $item->snippet->description; $video_attributes['likes'] = $item->statistics->likeCount; $video_attributes['dislikes'] = $item->statistics->dislikeCount; $video_attributes['views'] = $item->statistics->viewCount; // Keep track of these all time counts $count_history_dao->insert($user_id, 'youtube', $video_attributes['likes'], $video_id, 'likes_all_time'); $count_history_dao->insert($user_id, 'youtube', $video_attributes['dislikes'], $video_id, 'dislikes_all_time'); $count_history_dao->insert($user_id, 'youtube', $video_attributes['views'], $video_id, 'views_all_time'); $video_attributes['pub_date'] = $item->snippet->publishedAt; $video_attributes['post_id'] = $item->id; $video_attributes['location'] = $item->recordingDetails->locationDescription; $video_attributes['place'] = $item->recordingDetails->locationDescription; if (isset($item->recordingDetails->latitude)) { $video_attributes['geo'] = $item->recordingDetails->latitude . "," . $item->recordingDetails->longitude; } $video_attributes['is_protected'] = self::determinePrivacyStatus($item->status->privacyStatus); $today = date('Y-m-d'); $upload_date = substr($item->snippet->publishedAt, 0, 10); // Get the favourites added, favourites removed, shares, subscribers gained, subscribers lost // estimated minuites watched, average view duration, average view percentage $analytics_fields = array('ids' => 'channel==' . $channel_id, 'start-date' => $upload_date, 'end-date' => $today, 'metrics' => 'favoritesAdded,favoritesRemoved,shares,subscribersGained,subscribersLost,' . 'estimatedMinutesWatched,averageViewDuration,averageViewPercentage,views,likes,dislikes', 'filters' => 'video==' . $video_id); $video_analytics_details = $this->youtube_analytics_api_accessor->apiRequest('reports', $this->access_token, $analytics_fields); // Check we haven't used up our quota if (isset($video_analytics_details->error)) { $this->logger->logError('Error querying YouTube Analytics API', __METHOD__ . ',' . 
__LINE__); break; } $analytics_item = $video_analytics_details->rows[0]; // If the video is new we may not get any of these values back, but they can't be null if (isset($analytics_item)) { $video_attributes['favorites_added'] = $analytics_item[0]; $video_attributes['favorites_removed'] = $analytics_item[1]; $video_attributes['shares'] = $analytics_item[2]; $video_attributes['subscribers_gained'] = $analytics_item[3]; $video_attributes['subscribers_lost'] = $analytics_item[4]; $video_attributes['minutes_watched'] = $analytics_item[5]; $video_attributes['average_view_duration'] = $analytics_item[6]; $video_attributes['average_view_percentage'] = $analytics_item[7]; // Keep track of these all time counts $count_history_dao->insert($user_id, 'youtube', $analytics_item[0], $video_id, 'favorites_added_all_time'); $count_history_dao->insert($user_id, 'youtube', $analytics_item[1], $video_id, 'favorites_removed_all_time'); $count_history_dao->insert($user_id, 'youtube', $analytics_item[2], $video_id, 'shares_all_time'); $count_history_dao->insert($user_id, 'youtube', $analytics_item[3], $video_id, 'subscribers_gained_all_time'); $count_history_dao->insert($user_id, 'youtube', $analytics_item[4], $video_id, 'subscribers_lost_all_time'); $count_history_dao->insert($user_id, 'youtube', $analytics_item[5], $video_id, 'minutes_watched_all_time'); $count_history_dao->insert($user_id, 'youtube', $analytics_item[6], $video_id, 'average_view_duration_all_time'); $count_history_dao->insert($user_id, 'youtube', $analytics_item[7], $video_id, 'average_view_percentage_all_time'); } else { // So set them the 0 $video_attributes['favorites_added'] = 0; $video_attributes['favorites_removed'] = 0; $video_attributes['shares'] = 0; $video_attributes['subscribers_gained'] = 0; $video_attributes['subscribers_lost'] = 0; $video_attributes['minutes_watched'] = 0; $video_attributes['average_view_duration'] = 0; $video_attributes['average_view_percentage'] = 0; } $video_attributes['author_user_id'] = $this->instance->network_user_id; $video_attributes['author_username'] = $this->instance->network_username; $video_attributes['author_fullname'] = $author_details->full_name; $video_attributes['author_avatar'] = $author_details->avatar; $video_attributes['source'] = ''; $video_attributes['network'] = 'youtube'; $video_dao->addVideo($video_attributes); // Now collect per day count data for 2 days ago (testing has shown analytics data is delayed by 2 days) $two_days_ago = date('Y-m-d', strtotime("-2 day", strtotime($today))); $analytics_fields['start-date'] = $two_days_ago; $analytics_fields['end-date'] = $two_days_ago; $analytics_today_details = $this->youtube_analytics_api_accessor->apiRequest('reports', $this->access_token, $analytics_fields); // Check we haven't used up our quota if (isset($analytics_today_details->error)) { $this->logger->logError('Error querying YouTube Analytics API', __METHOD__ . ',' . 
__LINE__); break; } $todays_analytics = $analytics_today_details->rows[0]; // Check we got data and if not skip this part if (isset($todays_analytics)) { $count_history_dao->insert($user_id, 'youtube', $todays_analytics[0], $video_id, 'favorites_added', $two_days_ago); $count_history_dao->insert($user_id, 'youtube', $todays_analytics[1], $video_id, 'favorites_removed', $two_days_ago); $count_history_dao->insert($user_id, 'youtube', $todays_analytics[2], $video_id, 'shares', $two_days_ago); $count_history_dao->insert($user_id, 'youtube', $todays_analytics[3], $video_id, 'subscribers_gained', $two_days_ago); $count_history_dao->insert($user_id, 'youtube', $todays_analytics[4], $video_id, 'subscribers_lost', $two_days_ago); $count_history_dao->insert($user_id, 'youtube', $todays_analytics[5], $video_id, 'minutes_watched', $two_days_ago); $count_history_dao->insert($user_id, 'youtube', $todays_analytics[6], $video_id, 'average_view_duration', $two_days_ago); $count_history_dao->insert($user_id, 'youtube', $todays_analytics[7], $video_id, 'average_view_percentage', $two_days_ago); $count_history_dao->insert($user_id, 'youtube', $todays_analytics[8], $video_id, 'views', $two_days_ago); $count_history_dao->insert($user_id, 'youtube', $todays_analytics[9], $video_id, 'likes', $two_days_ago); $count_history_dao->insert($user_id, 'youtube', $todays_analytics[10], $video_id, 'dislikes', $two_days_ago); } // Check to see how many comments we already have for this video and if there are no new ones skip // comment collection as it takes a long time. $video_in_db = $video_dao->getVideoByID($video_id, 'youtube'); $comments_in_db = $video_in_db->reply_count_cache; if (!isset($comments_in_db)) { $comments_in_db = 0; } $api_comments = $item->statistics->commentCount; $comments_collected = 0; // if this video has any new comments capture those if ($api_comments > 0 && $api_comments > $comments_in_db) { // Request the first page of comments for this video $comments_fields = array('alt' => 'json'); if (isset($this->developer_key)) { $comments_fields['key'] = $this->developer_key; } $comments = $this->youtube_api_v2_accessor->apiRequest('videos/' . $video_id . '/comments', $comments_fields); // Check we haven't used up our quota if (isset($comments->errors)) { $this->logger->logError('Error querying YouTube Data API V2 ', __METHOD__ . ',' . __LINE__); break; } do { // Iterate through each comment and store the details foreach ($comments->feed->entry as $comment) { // We may have only needed to collect a few new comments so abort if we have everything if ($api_comments == $comments_in_db) { break 2; } // If the user has specified a limit on the number of comments per video to collect each // crawl check we haven't exceeded it if (isset($this->maximum_comments) && $comments_collected >= $this->maximum_comments) { $this->logger->logUserInfo("Stopping collection of comments for video due to reaching " . "limit of " . $this->maximum_comments . " comments.", __METHOD__ . ',' . __LINE__); break 2; } // We may spend a long time collecting comments so also check here if we've exceed the max // time specified by the user if (time() >= $end_time) { $this->logger->logUserInfo("Stopping this service users crawl because it has exceeded " . "max time of " . $this->max_crawl_time / 60 . " minute(s). ", __METHOD__ . ',' . 
__LINE__); $had_to_finish_early = true; break 4; } // The id is returned in the XML as part of a long URL, we only want the last part of that // URL $id_string = explode('/', $comment->id->{'$t'}); // This will be the last element of id_string $comment_store['post_id'] = $id_string[sizeof($id_string) - 1]; // The post text is the comment they made // Remove byte order markers from the comment text from: // http://stackoverflow.com/questions/3255993/how-do-i-remove-i-from-the-beginning // -of-a-file#comment9330944_3256183 $comment_store['post_text'] = preg_replace('/\\x{EF}\\x{BB}\\x{BF}/', '', $comment->content->{'$t'}); // The author username is the users G+ displayname which we need to query for // To get the G+ ID of this commentor we need to vist their youtube profile page, the ID // needed to get to this users page is the last element of the author URI $user_id_string = explode('/', $comment->author[0]->uri->{'$t'}); $name = $this->youtube_api_v2_accessor->apiRequest('users/' . $user_id_string[sizeof($user_id_string) - 1], $comments_fields); $gplus_id = $name->entry->{'yt$googlePlusUserId'}->{'$t'}; // // Now we have their G+ ID we can get their details from the G+ API $gplus_fields = array('fields' => 'displayName,id,image,tagline,verified'); $user_details = $this->google_plus_api_accessor->apiRequest('people/' . $gplus_id, $this->access_token, $gplus_fields); // Sometimes G+ says the ID is invalid or the user doesn't have a G+ ID if ($user_details->error->code == '404' || $gplus_id == '') { // Use V2 of the YouTube api to get their details $comment_store['author_username'] = $name->entry->{'yt$username'}->{'$t'}; $comment_store['author_fullname'] = $name->entry->author[0]->name->{'$t'}; $comment_store["author_avatar"] = $name->entry->{'media$thumbnail'}->url; // In this case the user id is their YouTube user ID $comment_store['author_user_id'] = $user_id_string[sizeof($user_id_string) - 1]; self::fetchUserFromYouTube($user_id_string[sizeof($user_id_string) - 1], 'youtube_crawler'); // If we still didn't get these details we can't store this comment if ($comment_store['author_username'] == null || $comment_store['author_fullname'] == null || $comment_store["author_avatar"] == null) { continue; } } elseif (isset($user_details->error)) { //Check we haven't exceed the G+ API quota $this->logger->logError('Error querying Google Plus API ', __METHOD__ . ',' . __LINE__); break; } else { $comment_store['author_username'] = $user_details->displayName; $comment_store['author_fullname'] = $user_details->displayName; $comment_store["author_avatar"] = $user_details->image->url; // The author user id is their G+ ID $comment_store['author_user_id'] = $gplus_id; // Make sure we have this commentor in the database self::fetchUser($gplus_id, 'youtube crawler'); } // The date they posted the comment $comment_store['pub_date'] = substr($comment->published->{'$t'}, 0, 10) . " " . 
substr($comment->published->{'$t'}, 11, 8); // Source of the comment $comment_store['source'] = ""; // Comments can not be private $comment_store['is_protected'] = false; // Set the network to youtube $comment_store['network'] = 'youtube'; // The ID of the author of the video $comment_store['in_reply_to_user_id'] = $this->instance->network_user_id; // The ID of the video this comment is a reply to $comment_store['in_reply_to_post_id'] = $video_id; $insert_id = $post_dao->addPost($comment_store); // If the insert id is null and were not going back to collect the whole archive // we've already captured comments from this point so move on if ($insert_id == null && $archive_loaded) { break 2; } $comments_in_db++; $comments_collected++; } $test = self::determineIfMoreCommentsExist($comments, $video_id); // If there is another page of comments make a request for them if ($test['next']) { $comments = $this->youtube_api_v2_accessor->basicApiRequest($test['url']); // Check we haven't used up our quota if (isset($comments->errors)) { $this->logger->logError('Error querying YouTube Data API V2 ', __METHOD__ . ',' . __LINE__); break; } elseif ($comments == null) { // If the comments come back as null its because we've been making too many requests too // quickly The YouTube api doesn't return valid JSON telling us this though so // json_decode returns null so back off for 30 seconds and then try again $error_message = "Querying the YouTube API too often waiting for 30 seconds, to "; $error_message .= "prevent this delay add a developer key."; $this->logger->logError($error_message, __METHOD__ . ',' . __LINE__); sleep(30); $comments = $this->youtube_api_v2_accessor->basicApiRequest($test['url']); } } } while ($test['next']); } // If we have another page of videos then get the token for the page if (isset($user_videos->nextPageToken)) { $videos_fields['pageToken'] = $user_videos->nextPageToken; } } // If we have another page of videos and haven't loaded all this users video yet keep going // if we have loaded all this users videos then stop after 1 page (50 videos) } while (isset($user_videos->nextPageToken) && !$archive_loaded); // If we didn't have to finish the crawl early due to timing out we have collected all this users videos or // we have tried more than 20 times stop trying to go back and load the post archive if (!$had_to_finish_early || $attempts >= 20) { $instance_dao->setPostArchiveLoaded($user_id, 'youtube'); } }
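Two small string-handling steps in the comment loop above are easy to miss: the YouTube V2 comment and author IDs arrive as URLs whose last path segment is the actual ID, and comment text can begin with a UTF-8 byte order mark. Minimal sketches of both (function names hypothetical):

<?php
// Hypothetical helper: pull the trailing ID out of a URL-style YouTube V2 identifier.
function lastPathSegment($id_url) {
    $parts = explode('/', $id_url);
    return $parts[sizeof($parts) - 1];
}

// Hypothetical helper: strip a leading UTF-8 byte order mark before storing comment text.
function stripByteOrderMark($text) {
    return preg_replace('/\x{EF}\x{BB}\x{BF}/', '', $text);
}

// Example usage:
echo lastPathSegment('https://gdata.youtube.com/feeds/api/videos/VIDEO_ID/comments/COMMENT_ID'); // COMMENT_ID
echo stripByteOrderMark("\xEF\xBB\xBFGreat video!"); // Great video!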
/** * Convert parsed JSON of a profile or page's posts into ThinkUp posts and users * @param Object $stream * @param str $source The network for the post; by default 'facebook' */ private function processStream($stream, $network) { $thinkup_posts = array(); $total_added_posts = 0; $thinkup_users = array(); $total_added_users = 0; $thinkup_links = array(); $total_links_added = 0; $thinkup_likes = array(); $total_added_likes = 0; $profile = null; $post_dao = DAOFactory::getDAO('PostDAO'); $must_process_likes = true; $must_process_comments = true; foreach ($stream->data as $p) { $post_id = explode("_", $p->id); $post_id = $post_id[1]; if ($profile == null) { $profile = $this->fetchUserInfo($p->from->id, $network, 'Post stream'); } //assume profile comments are private and page posts are public $is_protected = $network == 'facebook' ? 1 : 0; //get likes count $likes_count = 0; if (isset($p->likes)) { if (is_int($p->likes)) { $likes_count = $p->likes; } elseif (isset($p->likes->count) && is_int($p->likes->count)) { $likes_count = $p->likes->count; } } //Figure out if we have to process likes and comments $post_in_storage = $post_dao->getPost($post_id, $network); if (isset($post_in_storage)) { if ($post_in_storage->favlike_count_cache >= $likes_count) { $must_process_likes = false; $this->logger->logInfo("Already have " . $likes_count . " likes for post ID " . $post_id . "; Skipping like processing this crawler run", __METHOD__ . ',' . __LINE__); } if (isset($p->comments->count)) { if ($post_in_storage->reply_count_cache >= $p->comments->count) { $must_process_comments = false; $this->logger->logInfo("Already have " . $p->comments->count . " comments for post ID " . $post_id . "; Skipping comments processing", __METHOD__ . ',' . __LINE__); } } } if (isset($profile) && !isset($post_in_storage)) { $posts_to_process = array("post_id" => $post_id, "author_username" => $profile->username, "author_fullname" => $profile->username, "author_avatar" => $profile->avatar, "author_user_id" => $p->from->id, "post_text" => isset($p->message) ? $p->message : '', "pub_date" => $p->created_time, "favlike_count_cache" => $likes_count, "in_reply_to_user_id" => '', "in_reply_to_post_id" => '', "source" => '', 'network' => $network, 'is_protected' => $is_protected, 'location' => $profile->location); array_push($thinkup_posts, $posts_to_process); $total_added_posts = $total_added_posts + $this->storePostsAndAuthors($thinkup_posts, "Owner stream"); //free up memory $thinkup_posts = array(); if (isset($p->source) || isset($p->link)) { // there's a link to store $link_url = isset($p->source) ? $p->source : $p->link; $link = new Link(array("url" => $link_url, "expanded_url" => $link_url, "image_src" => isset($p->picture) ? $p->picture : '', "caption" => isset($p->caption) ? $p->caption : '', "description" => isset($p->description) ? $p->description : '', "title" => isset($p->name) ? $p->name : '', "network" => $network, "post_id" => $post_id)); array_push($thinkup_links, $link); } $total_links_addded = $total_links_added + $this->storeLinks($thinkup_links); if ($total_links_added > 0) { $this->logger->logUserSuccess("Collected {$total_links_added} new links", __METHOD__ . ',' . __LINE__); } //free up memory $thinkup_links = array(); } if ($must_process_comments) { if (isset($p->comments)) { $comments_captured = 0; if (isset($p->comments->data)) { $post_comments = $p->comments->data; $post_comments_count = isset($post_comments) ? 
sizeof($post_comments) : 0; if (is_array($post_comments) && sizeof($post_comments) > 0) { foreach ($post_comments as $c) { if (isset($c->from)) { $comment_id = explode("_", $c->id); $comment_id = $comment_id[2]; //Get posts $posts_to_process = array("post_id" => $comment_id, "author_username" => $c->from->name, "author_fullname" => $c->from->name, "author_avatar" => 'https://graph.facebook.com/' . $c->from->id . '/picture', "author_user_id" => $c->from->id, "post_text" => $c->message, "pub_date" => $c->created_time, "in_reply_to_user_id" => $profile->user_id, "in_reply_to_post_id" => $post_id, "source" => '', 'network' => $network, 'is_protected' => $is_protected, 'location' => ''); array_push($thinkup_posts, $posts_to_process); $comments_captured = $comments_captured + 1; } } } } $total_added_posts = $total_added_posts + $this->storePostsAndAuthors($thinkup_posts, "Post stream comments"); //free up memory $thinkup_posts = array(); // collapsed comment thread if (isset($p->comments->count) && $p->comments->count > $comments_captured) { $api_call = 'https://graph.facebook.com/' . $p->from->id . '_' . $post_id . '/comments?access_token=' . $this->access_token; do { $comments_stream = FacebookGraphAPIAccessor::rawApiRequest($api_call); if (isset($comments_stream) && is_array($comments_stream->data)) { foreach ($comments_stream->data as $c) { if (isset($c->from)) { $comment_id = explode("_", $c->id); $comment_id = $comment_id[sizeof($comment_id) - 1]; //Get posts $posts_to_process = array("post_id" => $comment_id, "author_username" => $c->from->name, "author_fullname" => $c->from->name, "author_avatar" => 'https://graph.facebook.com/' . $c->from->id . '/picture', "author_user_id" => $c->from->id, "post_text" => $c->message, "pub_date" => $c->created_time, "in_reply_to_user_id" => $profile->user_id, "in_reply_to_post_id" => $post_id, "source" => '', 'network' => $network, 'is_protected' => $is_protected, 'location' => ''); array_push($thinkup_posts, $posts_to_process); } } $total_added_posts = $total_added_posts + $this->storePostsAndAuthors($thinkup_posts, "Posts stream comments collapsed"); //free up memory $thinkup_posts = array(); if (isset($comments_stream->paging->next)) { $api_call = str_replace('\\u00257C', '|', $comments_stream->paging->next); } } else { // no comments (pun intended) break; } } while (isset($comments_stream->paging->next)); } } } //process "likes" if ($must_process_likes) { if (isset($p->likes)) { $likes_captured = 0; if (isset($p->likes->data)) { $post_likes = $p->likes->data; $post_likes_count = isset($post_likes) ? sizeof($post_likes) : 0; if (is_array($post_likes) && sizeof($post_likes) > 0) { foreach ($post_likes as $l) { if (isset($l->name) && isset($l->id)) { //Get users $ttu = array("user_name" => $l->name, "full_name" => $l->name, "user_id" => $l->id, "avatar" => 'https://graph.facebook.com/' . $l->id . 
'/picture', "location" => '', "description" => '', "url" => '', "is_protected" => 1, "follower_count" => 0, "post_count" => 0, "joined" => '', "found_in" => "Likes", "network" => 'facebook'); //Users are always set to network=facebook array_push($thinkup_users, $ttu); $fav_to_add = array("favoriter_id" => $l->id, "network" => $network, "author_user_id" => $profile->user_id, "post_id" => $post_id); array_push($thinkup_likes, $fav_to_add); $likes_captured = $likes_captured + 1; } } } } $total_added_users = $total_added_users + $this->storeUsers($thinkup_users, "Likes"); $total_added_likes = $total_added_likes + $this->storeLikes($thinkup_likes); //free up memory $thinkup_users = array(); $thinkup_likes = array(); // collapsed likes if (isset($p->likes->count) && $p->likes->count > $likes_captured) { $api_call = 'https://graph.facebook.com/' . $p->from->id . '_' . $post_id . '/likes?access_token=' . $this->access_token; do { $likes_stream = FacebookGraphAPIAccessor::rawApiRequest($api_call); if (isset($likes_stream) && is_array($likes_stream->data)) { foreach ($likes_stream->data as $l) { if (isset($l->name) && isset($l->id)) { //Get users $ttu = array("user_name" => $l->name, "full_name" => $l->name, "user_id" => $l->id, "avatar" => 'https://graph.facebook.com/' . $l->id . '/picture', "location" => '', "description" => '', "url" => '', "is_protected" => 1, "follower_count" => 0, "post_count" => 0, "joined" => '', "found_in" => "Likes", "network" => 'facebook'); //Users are always set to network=facebook array_push($thinkup_users, $ttu); $fav_to_add = array("favoriter_id" => $l->id, "network" => $network, "author_user_id" => $p->from->id, "post_id" => $post_id); array_push($thinkup_likes, $fav_to_add); $likes_captured = $likes_captured + 1; } } $total_added_users = $total_added_users + $this->storeUsers($thinkup_users, "Likes"); $total_added_likes = $total_added_likes + $this->storeLikes($thinkup_likes); //free up memory $thinkup_users = array(); $thinkup_likes = array(); if (isset($likes_stream->paging->next)) { $api_call = str_replace('\\u00257C', '|', $likes_stream->paging->next); } } else { // no likes break; } } while (isset($likes_stream->paging->next)); } } //free up memory $thinkup_users = array(); $thinkup_likes = array(); } } if ($total_added_posts > 0) { $this->logger->logUserSuccess("Collected {$total_added_posts} posts", __METHOD__ . ',' . __LINE__); } else { $this->logger->logUserInfo("No new posts found.", __METHOD__ . ',' . __LINE__); } if ($total_added_users > 0) { $this->logger->logUserSuccess("Collected {$total_added_users} users", __METHOD__ . ',' . __LINE__); } else { $this->logger->logUserInfo("No new users found.", __METHOD__ . ',' . __LINE__); } if ($total_added_likes > 0) { $this->logger->logUserSuccess("Collected {$total_added_likes} likes", __METHOD__ . ',' . __LINE__); } else { $this->logger->logUserInfo("No new likes found.", __METHOD__ . ',' . __LINE__); } }
/** * This method, and the two supporting private methods 'maintFavsFetch' and 'archivingFavsFetch', provide the * primary crawler functionality for adding the user's favorites to the database. * For a given user, the process starts in 'archiving mode', by * working forwards from the last (oldest) page of tweets to the newest. This archiving crawl * is only done once. The crawler tries to do this all in one go, but if it exhausts the available API count, * it will continue where it left off in the next run. * Then, when page 1 is reached in archiving mode, the crawler goes into 'maintenance mode' and works * backwards from then on. It first pages back until * it has reached the last fav it previously processed. Then it searches back N more pages to catch any older * tweets that were fav'd out of chronological order, where N is determined by favs_older_pages option. * The bookkeeping for these two crawler stages is maintained in the in tu_instances entry for the user. * * Recently, the Twitter favorites API has developed some bugs that need to be worked around. The comments below * provide more detail, but in a nutshell, these methods can not currently use information from Twitter to * calculate loop termination (so a bit more work may be done than necessary), and do not currently remove un-fav'd * tweets from the database. Hopefully these API issues will be fixed by Twitter in future. */ public function fetchInstanceFavorites() { // first, check that we have the resources to do work if (!($this->api->available && $this->api->available_api_calls_for_crawler)) { $this->logger->logInfo("terminating fetchInstanceFavorites-- no API calls available", __METHOD__.','.__LINE__); return true; } $status_message = ""; //@TODO Can we get this from API? $page_size = 20; // number of favs per page retrieved from the API call $this->logger->logUserInfo("Checking for new favorites.", __METHOD__.','.__LINE__); $last_favorites_count = $this->instance->favorites_profile; $this->logger->logInfo("last favs count: $last_favorites_count", __METHOD__.','.__LINE__); $last_page_fetched_favorites = $this->instance->last_page_fetched_favorites; $last_fav_id = $this->instance->last_favorite_id; $curr_favs_count = $this->user->favorites_count; $this->logger->logInfo("curr favs count: $curr_favs_count", __METHOD__.','.__LINE__); $last_page_of_favs = round($this->api->archive_limit / $page_size); if ($last_page_fetched_favorites == "") { $last_page_fetched_favorites = 0; } $this->logger->logInfo("got last_page_fetched_favorites: $last_page_fetched_favorites", __METHOD__.','.__LINE__); if ($last_fav_id == "") { $last_fav_id = 0; } // the owner favs count, from twitter, is currently unreliable and may be less than the actual number of // favs, by a large margin. So, we still go ahead and calculate the number of 'missing' tweets based on // this info, but currently do not use it for fetch loop termination. $this->logger->logInfo("owner favs: " . $this->user->favorites_count . ", instance owner favs in system: ". $this->instance->owner_favs_in_system, __METHOD__.','.__LINE__); $favs_missing = $this->user->favorites_count - $this->instance->owner_favs_in_system; $this->logger->logInfo("favs missing: $favs_missing", __METHOD__.','.__LINE__); // figure out if we're in 'archiving' or 'maintenance' mode, via # of last_page_fetched_favorites $mode = 0; // default is archving/first-fetch if ($last_page_fetched_favorites == 1) { $mode = 1; // we are in maint. 
mode $new_favs_to_add = $favs_missing; $this->logger->logInfo("new favs to add/missing: $new_favs_to_add", __METHOD__.','.__LINE__); $mpage = 1; $starting_fav_id = $last_fav_id; } else { // we are in archiving mode. $new_favs_to_add = $curr_favs_count - $last_favorites_count; $this->logger->logInfo("new favs to add: $new_favs_to_add", __METHOD__.','.__LINE__); // figure out start page based on where we left off last time, and how many favs added since then $extra_pages = ceil($new_favs_to_add / $page_size); $this->logger->logInfo("extra pages: $extra_pages", __METHOD__.','.__LINE__); $finished_first_fetch = false; if ($last_page_fetched_favorites == 0) { // if at initial starting fetch (first time favs ever crawled) $last_page_fetched_favs_start = $extra_pages + 1; } else { $last_page_fetched_favs_start = $last_page_fetched_favorites + $extra_pages; } if ($last_page_fetched_favs_start > $last_page_of_favs) { $last_page_fetched_favs_start = $last_page_of_favs + 1; } } $status_message = "total last favs count: $last_favorites_count" . ", last page fetched: $last_page_fetched_favorites, last fav id: $last_fav_id"; $this->logger->logInfo($status_message, __METHOD__.','.__LINE__); $this->logger->logInfo("current favs count: $curr_favs_count" . ", new favs to add: $new_favs_to_add, last page of favs: $last_page_of_favs, mode: $mode", __METHOD__.','.__LINE__); $continue = true; $fcount = 0; $older_favs_smode = false; $stop_page = 0; $status_message = "in fetchInstanceFavorites: API available: ".$this->api->available.", avail for crawler: ". $this->api->available_api_calls_for_crawler; $this->logger->logInfo($status_message, __METHOD__.','.__LINE__); while ($this->api->available && $this->api->available_api_calls_for_crawler > 0 && $continue) { if ($mode != 0) { // in maintenance, not archiving mode list($fcount, $mpage, $older_favs_smode, $stop_page, $new_favs_to_add, $last_fav_id, $last_page_fetched_favorites, $continue) = $this->maintFavsFetch ($starting_fav_id, $fcount, $mpage, $older_favs_smode, $stop_page, $new_favs_to_add, $last_fav_id, $last_page_fetched_favorites, $continue); // } } else { // mode 0 -- archiving mode if (!$finished_first_fetch) { $this->logger->logInfo("in 'first_archiving_fetch' clause", __METHOD__.','.__LINE__); list($fcount, $last_fav_id, $last_page_fetched_favorites, $continue) = $this->archivingFavsFetch($fcount, $last_fav_id, $last_page_fetched_favs_start, $continue); $finished_first_fetch = true; } else { list($fcount, $last_fav_id, $last_page_fetched_favorites, $continue) = $this->archivingFavsFetch($fcount, $last_fav_id, $last_page_fetched_favorites, $continue); } } } // end while // update necessary instance fields $this->logger->logInfo("new_favs_to_add: $new_favs_to_add, fcount: $fcount", __METHOD__.','.__LINE__); $this->logger->logInfo("new 'last fav id': $last_fav_id", __METHOD__.','.__LINE__); $this->instance->last_favorite_id = $last_fav_id; $this->instance->last_page_fetched_favorites =$last_page_fetched_favorites; $this->instance->favorites_profile = $curr_favs_count; $this->logger->logUserSuccess("Saved $fcount new favorites.", __METHOD__.','.__LINE__); return true; }