/**
 * Crawl each Twitter instance returned for the logged-in owner.
 *
 * For every instance this builds an OAuth API accessor from the stored tokens and the
 * 'twitter' plugin options, runs the TwitterCrawler fetch/cleanup steps, fetches tweets for
 * each saved hashtag, caches the dashboard modules, saves the instance and reports the version.
 *
 * Any Exception raised while crawling one instance (including missing OAuth tokens) is logged
 * as a user error and does not stop the remaining instances from being processed.
 *
 * @return void
 */
public function crawl() {
    $config = Config::getInstance();
    $logger = Logger::getInstance();
    $instance_dao = DAOFactory::getDAO('TwitterInstanceDAO');
    $owner_instance_dao = DAOFactory::getDAO('OwnerInstanceDAO');
    $owner_dao = DAOFactory::getDAO('OwnerDAO');
    $instance_hashtag_dao = DAOFactory::getDAO('InstanceHashtagDAO');

    // get oauth values
    $plugin_option_dao = DAOFactory::getDAO('PluginOptionDAO');
    $options = $plugin_option_dao->getOptionsHash('twitter', true);

    $current_owner = $owner_dao->getByEmail(Session::getLoggedInUser());
    $instances = $instance_dao->getActiveInstancesStalestFirstForOwnerByNetworkNoAuthError($current_owner,
        'twitter');

    foreach ($instances as $instance) {
        // Reset per iteration: without this, an instance that fails before its own crawler is
        // constructed would be saved below using the previous instance's crawler and post count.
        $twitter_crawler = null;

        $logger->setUsername($instance->network_username);
        $logger->logUserSuccess("Starting to collect data for " . $instance->network_username . " on Twitter.",
            __METHOD__ . ',' . __LINE__);

        $tokens = $owner_instance_dao->getOAuthTokens($instance->id);
        $num_twitter_errors = isset($options['num_twitter_errors'])
            ? $options['num_twitter_errors']->option_value : null;

        $dashboard_module_cacher = new DashboardModuleCacher($instance);
        try {
            if (isset($tokens['oauth_access_token']) && $tokens['oauth_access_token'] != ''
                && isset($tokens['oauth_access_token_secret']) && $tokens['oauth_access_token_secret'] != '') {
                // Fall back to 3200 when no archive limit option is stored.
                $archive_limit = isset($options['archive_limit']->option_value)
                    ? $options['archive_limit']->option_value : 3200;

                $api = new CrawlerTwitterAPIAccessorOAuth($tokens['oauth_access_token'],
                    $tokens['oauth_access_token_secret'], $options['oauth_consumer_key']->option_value,
                    $options['oauth_consumer_secret']->option_value, $archive_limit, $num_twitter_errors);

                $twitter_crawler = new TwitterCrawler($instance, $api);

                $instance_dao->updateLastRun($instance->id);

                $twitter_crawler->fetchInstanceUserTweets();
                $twitter_crawler->fetchInstanceUserMentions();
                $twitter_crawler->fetchInstanceUserFriends();
                $twitter_crawler->fetchInstanceUserFollowers();
                $twitter_crawler->fetchInstanceUserGroups();
                $twitter_crawler->fetchRetweetsOfInstanceUser();
                $twitter_crawler->fetchInstanceUserFavorites();
                $twitter_crawler->updateStaleGroupMemberships();
                $twitter_crawler->fetchStrayRepliedToTweets();
                $twitter_crawler->fetchUserFriendsByIDs();
                $twitter_crawler->fetchUnloadedFriendDetails();
                $twitter_crawler->fetchUnloadedFollowerDetails();
                $twitter_crawler->cleanUpFollows();
                $twitter_crawler->updateFriendsProfiles();

                // Retrieve search results for saved keyword/hashtags
                $instances_hashtags = $instance_hashtag_dao->getByInstance($instance->id);
                foreach ($instances_hashtags as $instance_hashtag) {
                    $twitter_crawler->fetchInstanceHashtagTweets($instance_hashtag);
                }
            } else {
                throw new Exception('Missing Twitter OAuth tokens.');
            }
        } catch (Exception $e) {
            $logger->logUserError(get_class($e) . " while crawling " . $instance->network_username
                . " on Twitter: " . $e->getMessage(), __METHOD__ . ',' . __LINE__);
        }

        // Runs whether or not the crawl above succeeded.
        $dashboard_module_cacher->cacheDashboardModules();

        // Save instance (only when this iteration's crawler exists and exposes a user)
        if (isset($twitter_crawler->user)) {
            $instance_dao->save($instance, $twitter_crawler->user->post_count, $logger);
        }
        Reporter::reportVersion($instance);
        $logger->logUserSuccess("Finished collecting data for " . $instance->network_username . " on Twitter.",
            __METHOD__ . ',' . __LINE__);
    }
}
/**
 * Crawl all active Twitter instances, stalest first.
 *
 * Instances without stored OAuth tokens are crawled with the placeholder 'NOAUTH' credentials
 * and only run the steps that are not restricted to the authenticated user. An instance is
 * skipped entirely when the API accessor reports no available calls. The logger is closed
 * once all instances have been processed.
 *
 * @return void
 */
public function crawl() {
    global $db;
    global $conn;

    $config = Config::getInstance();
    $logger = Logger::getInstance();
    $instance_dao = DAOFactory::getDAO('InstanceDAO');
    $owner_instance_dao = new OwnerInstanceDAO($db, $logger);

    $instances = $instance_dao->getAllActiveInstancesStalestFirstByNetwork('twitter');
    foreach ($instances as $instance) {
        $logger->setUsername($instance->network_username);
        $tokens = $owner_instance_dao->getOAuthTokens($instance->id);

        // Authenticated only when both the token and its secret are present and non-empty.
        $has_tokens = isset($tokens['oauth_access_token']) && $tokens['oauth_access_token'] != ''
            && isset($tokens['oauth_access_token_secret']) && $tokens['oauth_access_token_secret'] != '';
        $noauth = !$has_tokens;

        $access_token = $noauth ? 'NOAUTH' : $tokens['oauth_access_token'];
        $access_token_secret = $noauth ? 'NOAUTH' : $tokens['oauth_access_token_secret'];

        $api = new CrawlerTwitterAPIAccessorOAuth($access_token, $access_token_secret,
            $config->getValue('oauth_consumer_key'), $config->getValue('oauth_consumer_secret'), $instance,
            $config->getValue('archive_limit'));

        $crawler = new TwitterCrawler($instance, $api, $db);

        $api->init();

        // Nothing to do for this instance when no API calls are available.
        if (!($api->available_api_calls_for_crawler > 0)) {
            continue;
        }

        $instance_dao->updateLastRun($instance->id);

        // No auth req'd
        $crawler->fetchInstanceUserInfo();

        // No auth for public Twitter users
        $crawler->fetchInstanceUserTweets();

        if ($has_tokens) {
            // Auth req'd, for calling user only
            $crawler->fetchInstanceUserMentions();
            $crawler->fetchRetweetsOfInstanceUser();
            $crawler->fetchInstanceUserFriends();
            $crawler->fetchInstanceUserFollowers();
        }

        $crawler->fetchStrayRepliedToTweets();
        $crawler->fetchUnloadedFollowerDetails();
        $crawler->fetchFriendTweetsAndFriends();

        // TODO: Get direct messages
        // TODO: Gather favorites data

        if ($noauth) {
            // No auth req'd
            $crawler->fetchSearchResults($instance->network_username);
        }

        $crawler->cleanUpFollows();

        // Save instance
        $instance_dao->save($crawler->instance, $crawler->owner_object->post_count, $logger, $api);
    }

    // Close logging
    $logger->close();
}
/**
 * With the 'fetchUserTimelineForRetweet' caller limit set to 0 remaining calls, fetching
 * retweets of the instance user is expected to leave the seeded post's retweet_count_cache at 0.
 */
public function testFetchRetweetsOfInstanceuserBudget() {
    self::setUpInstanceUserGinaTrapani();

    // set up crawl limit budget
    $budget = array(
        'fetchUserTimelineForRetweet' => array('count' => 2, 'remaining' => 0)
    );
    $this->api->setCallerLimits($budget);

    $crawler = new TwitterCrawler($this->instance, $this->api);
    $crawler->fetchInstanceUserInfo();

    // Seed the post whose retweets would be fetched.
    // NOTE(review): the builder is kept in a variable; presumably the fixture's lifetime is tied
    // to it - confirm against FixtureBuilder before removing.
    $post_fixture = array(
        'post_id' => '14947487415',
        'author_user_id' => '930061',
        'author_username' => 'ginatrapani',
        'author_fullname' => 'Gina Trapani',
        'post_text' => '"Wearing your new conference tee shirt does NOT count as dressing up."',
        'pub_date' => '-1d',
        'reply_count_cache' => 1,
        'old_retweet_count_cache' => 0,
        'retweet_count_cache' => 0,
        'retweet_count_api' => 0
    );
    $builder = FixtureBuilder::build('posts', $post_fixture);

    $dao = DAOFactory::getDAO('PostDAO');
    $crawler->fetchRetweetsOfInstanceUser();

    $stored_post = $dao->getPost('14947487415', 'twitter');
    $this->assertEqual($stored_post->retweet_count_cache, 0, '0 new-style retweets from cache count');
}
/**
 * Fetching retweets of the instance user loads 3 retweets for the seeded post, and a second
 * crawler run does not add duplicates. Also checks the single retweet loaded for a second post.
 */
public function testFetchRetweetsOfInstanceuser() {
    $seeded_post_id = 14947487415;
    $second_post_id = 12722783896;

    self::setUpInstanceUserGinaTrapani();
    $crawler = new TwitterCrawler($this->instance, $this->api);
    $crawler->fetchInstanceUserInfo();

    // First, load the retweeted tweet into the db.
    // The 'new-style' retweet count comes from the retweet_count field in the xml, which is
    // parsed into 'retweet_count_cache' in the post vals. It will not necessarily match the
    // number of retweets in the database any more (but does in this test case).
    $builder = FixtureBuilder::build('posts', array(
        'post_id' => $seeded_post_id,
        'author_user_id' => 930061,
        'author_username' => 'ginatrapani',
        'author_fullname' => 'Gina Trapani',
        'post_text' => '"Wearing your new conference tee shirt does NOT count as dressing up."',
        'pub_date' => '-1d',
        'reply_count_cache' => 1,
        'old_retweet_count_cache' => 0,
        'retweet_count_cache' => 3
    ));

    $post_dao = DAOFactory::getDAO('PostDAO');
    $crawler->fetchRetweetsOfInstanceUser();

    $post = $post_dao->getPost($seeded_post_id, 'twitter');
    $this->assertEqual($post->retweet_count_cache, 3, '3 new-style retweets detected');
    $retweets = $post_dao->getRetweetsOfPost($seeded_post_id, 'twitter', true);
    $this->assertEqual(count($retweets), 3, '3 retweets loaded');

    // Make sure duplicate posts aren't going into the db on the next crawler run.
    self::setUpInstanceUserGinaTrapani();
    $crawler = new TwitterCrawler($this->instance, $this->api);
    $crawler->fetchInstanceUserInfo();
    $crawler->fetchRetweetsOfInstanceUser();

    $post = $post_dao->getPost($seeded_post_id, 'twitter');
    $this->assertEqual($post->retweet_count_cache, 3, '3 new-style retweets detected');
    $retweets = $post_dao->getRetweetsOfPost($seeded_post_id, 'twitter', true);
    $this->assertEqual(count($retweets), 3, '3 retweets loaded');

    $post = $post_dao->getPost($second_post_id, 'twitter');
    $second_post_retweets = $post_dao->getRetweetsOfPost($second_post_id, 'twitter', true);
    $this->assertEqual(count($second_post_retweets), 1, '1 retweet loaded');
    $this->assertEqual($second_post_retweets[0]->in_rt_of_user_id, 930061);
}
/**
 * Crawl the active Twitter instances the logged-in owner has access to, stalest first.
 *
 * Instances without stored OAuth tokens are crawled with the placeholder 'NOAUTH' credentials
 * and skip the steps restricted to the authenticated user. The fetch steps only run when the
 * API accessor reports available calls after init().
 *
 * @return void
 */
public function crawl() {
    $config = Config::getInstance();
    $logger = Logger::getInstance();
    $instance_dao = DAOFactory::getDAO('TwitterInstanceDAO');
    $owner_instance_dao = DAOFactory::getDAO('OwnerInstanceDAO');
    $owner_dao = DAOFactory::getDAO('OwnerDAO');

    // get oauth values
    $plugin_option_dao = DAOFactory::getDAO('PluginOptionDAO');
    $options = $plugin_option_dao->getOptionsHash('twitter', true);

    $current_owner = $owner_dao->getByEmail(Session::getLoggedInUser());
    $instances = $instance_dao->getAllActiveInstancesStalestFirstByNetwork('twitter');

    foreach ($instances as $instance) {
        // Owner doesn't have access to this instance; let's not crawl it.
        if (!$owner_instance_dao->doesOwnerHaveAccess($current_owner, $instance)) {
            continue;
        }

        $logger->setUsername($instance->network_username);
        $logger->logUserSuccess("Starting to collect data for " . $instance->network_username . " on Twitter.",
            __METHOD__ . ',' . __LINE__);

        $tokens = $owner_instance_dao->getOAuthTokens($instance->id);

        // Optional plugin settings, with their fallbacks.
        $num_twitter_errors = null;
        if (isset($options['num_twitter_errors'])) {
            $num_twitter_errors = $options['num_twitter_errors']->option_value;
        }
        $max_api_calls_per_crawl = 350;
        if (isset($options['max_api_calls_per_crawl'])) {
            $max_api_calls_per_crawl = $options['max_api_calls_per_crawl']->option_value;
        }
        $api_calls_to_leave_unmade_per_minute = 2.0;
        if (isset($options['api_calls_to_leave_unmade_per_minute'])) {
            $api_calls_to_leave_unmade_per_minute = $options['api_calls_to_leave_unmade_per_minute']->option_value;
        }

        // Authenticated only when both the token and its secret are present and non-empty.
        $has_tokens = isset($tokens['oauth_access_token']) && $tokens['oauth_access_token'] != ''
            && isset($tokens['oauth_access_token_secret']) && $tokens['oauth_access_token_secret'] != '';
        $noauth = !$has_tokens;

        $access_token = $noauth ? 'NOAUTH' : $tokens['oauth_access_token'];
        $access_token_secret = $noauth ? 'NOAUTH' : $tokens['oauth_access_token_secret'];

        $api = new CrawlerTwitterAPIAccessorOAuth($access_token, $access_token_secret,
            $options['oauth_consumer_key']->option_value, $options['oauth_consumer_secret']->option_value,
            $api_calls_to_leave_unmade_per_minute, $options['archive_limit']->option_value,
            $num_twitter_errors, $max_api_calls_per_crawl);

        $crawler = new TwitterCrawler($instance, $api);

        $api->init();

        // Nothing more to do for this instance when no API calls are available.
        if (!($api->available_api_calls_for_crawler > 0)) {
            continue;
        }

        $instance_dao->updateLastRun($instance->id);

        // The fetchInstanceUserInfo() step is disabled in this revision.

        // No auth for public Twitter users
        $crawler->fetchInstanceUserTweets();

        if ($has_tokens) {
            // Auth req'd, for calling user only
            $crawler->fetchInstanceUserMentions();
            $crawler->fetchInstanceUserFriends();
            $crawler->fetchInstanceFavorites();
            $crawler->fetchInstanceUserFollowers();
            $crawler->fetchRetweetsOfInstanceUser();
            $crawler->cleanUpMissedFavsUnFavs();
        }

        $crawler->fetchStrayRepliedToTweets();
        $crawler->fetchUnloadedFollowerDetails();
        $crawler->fetchFriendTweetsAndFriends();

        if ($noauth) {
            // No auth req'd
            $crawler->fetchSearchResults($instance->network_username);
        }

        $crawler->cleanUpFollows();

        // Save instance
        if (isset($crawler->user)) {
            $instance_dao->save($instance, $crawler->user->post_count, $logger);
        }
        $logger->logUserSuccess("Finished collecting data for " . $instance->network_username . " on Twitter.",
            __METHOD__ . ',' . __LINE__);
    }
}
/**
 * Fetching retweets of the instance user loads 3 retweets for a post inserted directly into
 * tu_posts, and a second crawler run does not add duplicates.
 */
public function testFetchRetweetsOfInstanceuser() {
    self::setUpInstanceUserGinaTrapani();
    $tc = new TwitterCrawler($this->instance, $this->api);
    $tc->fetchInstanceUserInfo();

    // First, load the retweeted tweet into the db.
    // The post text itself contains double quotes; they must be escaped inside this
    // double-quoted PHP string or the literal ends early and the file fails to parse.
    // reply_count_cache gets an arbitrary value; no assertion below reads it.
    $q = "INSERT INTO tu_posts (post_id, author_user_id, author_username, author_fullname, author_avatar,\n"
        . " post_text, source, pub_date, reply_count_cache, retweet_count_cache) VALUES (14947487415, 930061, \n"
        . " 'ginatrapani', 'Gina Trapani', 'avatar.jpg', \n"
        . " '\"Wearing your new conference tee shirt does NOT count as dressing up.\"', 'web', \n"
        . " '2006-01-01 00:00:00', " . rand(0, 4) . ", 0);";
    $this->db->exec($q);

    $pdao = DAOFactory::getDAO('PostDAO');
    $tc->fetchRetweetsOfInstanceUser();

    $post = $pdao->getPost(14947487415.0, 'twitter');
    $this->assertEqual($post->retweet_count_cache, 3, '3 retweets loaded');
    $retweets = $pdao->getRetweetsOfPost(14947487415.0, 'twitter', true);
    $this->assertEqual(sizeof($retweets), 3, '3 retweets loaded');

    // Make sure duplicate posts aren't going into the db on the next crawler run.
    self::setUpInstanceUserGinaTrapani();
    $tc = new TwitterCrawler($this->instance, $this->api);
    $tc->fetchInstanceUserInfo();
    $tc->fetchRetweetsOfInstanceUser();

    $post = $pdao->getPost(14947487415.0, 'twitter');
    $this->assertEqual($post->retweet_count_cache, 3, '3 retweets loaded');
    $retweets = $pdao->getRetweetsOfPost(14947487415.0, 'twitter', true);
    $this->assertEqual(sizeof($retweets), 3, '3 retweets loaded');
}
/**
 * Fetching retweets of the instance user updates the seeded post's cached and API retweet
 * counts, loads 3 retweets, and does not add duplicates on a second crawler run. Also checks
 * the single retweet loaded for a second post.
 */
public function testFetchRetweetsOfInstanceuser() {
    $seeded_post_id = 14947487415.0;
    $second_post_id = 12722783896.0;

    self::setUpInstanceUserGinaTrapani();
    $crawler = new TwitterCrawler($this->instance, $this->api);
    $crawler->fetchInstanceUserInfo();

    // First, load the retweeted tweet into the db.
    // The 'new-style' retweet count comes from the retweet_count field in the xml, which is
    // parsed into 'retweet_count_cache' in the post vals. It will not necessarily match the
    // number of retweets in the database any more (but does in this test case).
    $post_fixture = array(
        'post_id' => $seeded_post_id,
        'author_user_id' => 930061,
        'author_username' => 'ginatrapani',
        'author_fullname' => 'Gina Trapani',
        'post_text' => '"Wearing your new conference tee shirt does NOT count as dressing up."',
        'pub_date' => '-1d',
        'reply_count_cache' => 1,
        'old_retweet_count_cache' => 0,
        'retweet_count_cache' => 0,
        'retweet_count_api' => 0
    );
    $builder = FixtureBuilder::build('posts', $post_fixture);

    $post_dao = DAOFactory::getDAO('PostDAO');
    $crawler->fetchRetweetsOfInstanceUser();

    $post = $post_dao->getPost($seeded_post_id, 'twitter');
    $this->assertEqual($post->retweet_count_cache, 3, '3 new-style retweets from cache count');

    // While processing the retweets of the post, if they contain a <retweeted_status> element
    // pointing to the original post, and that original post information includes a retweet
    // count, the original post in the db is updated with that count. In this test data that
    // count is 2, 'behind' the database info.
    $this->assertEqual($post->retweet_count_api, 2, '2 new-style retweets count from API');

    // Should not have processed any old-style retweets here.
    $this->assertEqual($post->old_retweet_count_cache, 0, '0 old-style retweets count from API');

    $retweets = $post_dao->getRetweetsOfPost($seeded_post_id, 'twitter', true);
    $this->assertEqual(count($retweets), 3, '3 retweets loaded');

    // Make sure duplicate posts aren't going into the db on the next crawler run.
    self::setUpInstanceUserGinaTrapani();
    $crawler = new TwitterCrawler($this->instance, $this->api);
    $crawler->fetchInstanceUserInfo();
    $crawler->fetchRetweetsOfInstanceUser();

    $post = $post_dao->getPost($seeded_post_id, 'twitter');
    $this->assertEqual($post->retweet_count_cache, 3, '3 new-style retweets detected');
    $this->assertEqual($post->retweet_count_api, 2, '2 new-style retweets count from API');
    $retweets = $post_dao->getRetweetsOfPost($seeded_post_id, 'twitter', true);
    $this->assertEqual(count($retweets), 3, '3 retweets loaded');

    $post = $post_dao->getPost($second_post_id, 'twitter');
    $second_post_retweets = $post_dao->getRetweetsOfPost($second_post_id, 'twitter', true);
    $this->assertEqual(count($second_post_retweets), 1, '1 retweet loaded');
    $this->assertEqual($second_post_retweets[0]->in_rt_of_user_id, 930061);
}
/**
 * Fetching retweets of the instance user, with test data read from the
 * 'testoftwittercrawler/ginatrapani/' folder, sets the seeded post's cached and API retweet
 * counts to 1 and loads no retweet posts, on the first run and on a repeated run.
 */
public function testFetchRetweetsOfInstanceUser() {
    $this->debug(__METHOD__);

    $seeded_post_id = '300000912989118466';
    $other_post_id = '300000311127457792';
    $data_folder = 'testoftwittercrawler/ginatrapani/';

    self::setUpInstanceUserGinaTrapani();
    $crawler = new TwitterCrawler($this->instance, $this->api);
    $crawler->api->to->setDataPathFolder($data_folder);

    // First, load the retweeted tweet into the db.
    // The 'new-style' retweet count comes from the retweet_count field in the xml, which is
    // parsed into 'retweet_count_cache' in the post vals. It will not necessarily match the
    // number of retweets in the database any more (but does in this test case).
    $post_fixture = array(
        'post_id' => $seeded_post_id,
        'author_user_id' => '930061',
        'author_username' => 'ginatrapani',
        'author_fullname' => 'Gina Trapani',
        'post_text' => '@jjg unsurprisingly Dykes Lumber in Brooklyn has a thriving t-shirt business',
        'pub_date' => '-1d',
        'reply_count_cache' => 1,
        'old_retweet_count_cache' => 0,
        'retweet_count_cache' => 0,
        'retweet_count_api' => 0
    );
    $builder = FixtureBuilder::build('posts', $post_fixture);

    $dao = DAOFactory::getDAO('PostDAO');
    $crawler->fetchRetweetsOfInstanceUser();

    $post = $dao->getPost($seeded_post_id, 'twitter');
    $this->assertEqual($post->retweet_count_cache, 1, '1 new-style retweet from count cache');

    // While processing the retweets of the post, if they contain a <retweeted_status> element
    // pointing to the original post, and that original post information includes a retweet
    // count, the original post in the db is updated with that count. The assertion below
    // expects that count to be 1.
    $this->assertEqual($post->retweet_count_api, 1, '1 new-style retweet count from API');

    // Should not have processed any old-style retweets here.
    $this->assertEqual($post->old_retweet_count_cache, 0, '0 old-style retweets count from API');

    // Note: this first lookup queries the other post id, not the seeded one.
    $retweets = $dao->getRetweetsOfPost($other_post_id, 'twitter', true);
    $this->assertEqual(count($retweets), 0, '0 retweets loaded');

    // Make sure duplicate posts aren't going into the db on the next crawler run.
    self::setUpInstanceUserGinaTrapani();
    $crawler = new TwitterCrawler($this->instance, $this->api);
    $crawler->api->to->setDataPathFolder($data_folder);
    $crawler->fetchInstanceUserInfo();
    $crawler->fetchRetweetsOfInstanceUser();

    $post = $dao->getPost($seeded_post_id, 'twitter');
    $this->assertEqual($post->retweet_count_cache, 1, '1 new-style retweet from count cache');
    $this->assertEqual($post->retweet_count_api, 1, '1 new-style retweet count from API');
    $retweets = $dao->getRetweetsOfPost($seeded_post_id, 'twitter', true);
    $this->assertEqual(count($retweets), 0, '0 retweets loaded');

    $post = $dao->getPost($other_post_id, 'twitter');
    $other_post_retweets = $dao->getRetweetsOfPost($other_post_id, 'twitter', true);
    $this->assertEqual(count($other_post_retweets), 0, '0 retweets loaded');
    // No in_rt_of_user_id check here: the list above is expected to be empty.
}
/**
 * Crawl the active Twitter instances the logged-in owner has access to, stalest first.
 *
 * Instances without stored OAuth tokens are crawled with the placeholder 'NOAUTH' credentials
 * and skip the steps restricted to the authenticated user. The fetch steps only run when the
 * API accessor reports available calls after init(). The logger is closed once all instances
 * have been processed.
 *
 * @return void
 */
public function crawl() {
    $config = Config::getInstance();
    $logger = Logger::getInstance();
    $instance_dao = DAOFactory::getDAO('InstanceDAO');
    $owner_instance_dao = DAOFactory::getDAO('OwnerInstanceDAO');
    $owner_dao = DAOFactory::getDAO('OwnerDAO');

    // get oauth values
    $plugin_option_dao = DAOFactory::getDAO('PluginOptionDAO');
    $options = $plugin_option_dao->getOptionsHash('twitter', true);

    $current_owner = $owner_dao->getByEmail(Session::getLoggedInUser());
    $instances = $instance_dao->getAllActiveInstancesStalestFirstByNetwork('twitter');

    foreach ($instances as $instance) {
        // Owner doesn't have access to this instance; let's not crawl it.
        if (!$owner_instance_dao->doesOwnerHaveAccess($current_owner, $instance)) {
            continue;
        }

        $logger->setUsername($instance->network_username);
        $tokens = $owner_instance_dao->getOAuthTokens($instance->id);

        $num_twitter_errors = null;
        if (isset($options['num_twitter_errors'])) {
            $num_twitter_errors = $options['num_twitter_errors']->option_value;
        }

        // Authenticated only when both the token and its secret are present and non-empty.
        $has_tokens = isset($tokens['oauth_access_token']) && $tokens['oauth_access_token'] != ''
            && isset($tokens['oauth_access_token_secret']) && $tokens['oauth_access_token_secret'] != '';
        $noauth = !$has_tokens;

        $access_token = $noauth ? 'NOAUTH' : $tokens['oauth_access_token'];
        $access_token_secret = $noauth ? 'NOAUTH' : $tokens['oauth_access_token_secret'];

        $api = new CrawlerTwitterAPIAccessorOAuth($access_token, $access_token_secret,
            $options['oauth_consumer_key']->option_value, $options['oauth_consumer_secret']->option_value,
            $instance, $options['archive_limit']->option_value, $num_twitter_errors);

        $crawler = new TwitterCrawler($instance, $api);

        $api->init();

        // Nothing more to do for this instance when no API calls are available.
        if (!($api->available_api_calls_for_crawler > 0)) {
            continue;
        }

        $instance_dao->updateLastRun($instance->id);

        // The fetchInstanceUserInfo() step is disabled in this revision.

        // No auth for public Twitter users
        $crawler->fetchInstanceUserTweets();

        if ($has_tokens) {
            // Auth req'd, for calling user only
            $crawler->fetchInstanceUserMentions();
            $crawler->fetchRetweetsOfInstanceUser();
            $crawler->fetchInstanceUserFriends();
            $crawler->fetchInstanceUserFollowers();
        }

        $crawler->fetchStrayRepliedToTweets();
        $crawler->fetchUnloadedFollowerDetails();
        $crawler->fetchFriendTweetsAndFriends();

        //@TODO Gather favorites data

        if ($noauth) {
            // No auth req'd
            $crawler->fetchSearchResults($instance->network_username);
        }

        $crawler->cleanUpFollows();

        // Save instance
        if (isset($crawler->owner_object)) {
            $instance_dao->save($instance, $crawler->owner_object->post_count, $logger);
        }
    }

    // Close logging
    $logger->close();
}