/**
 * Import a line-delimited JSON file (one tweet object per line) into the
 * current capture bin. Tweets already present in $bin_name are skipped;
 * queued tweets are flushed to the database in batches of ~100 and once
 * more at the end.
 *
 * Fix: json_decode() yields null for a malformed line; the original passed
 * that null straight into Tweet::fromJSON(). Such lines are now skipped and
 * counted in $tweets_failed (imported but never incremented before).
 *
 * @param string $filepath path of the line-delimited JSON export
 * @param mixed  $dbh      database handle (unused here; TweetQueue performs the inserts)
 */
function process_json_file_timeline($filepath, $dbh) {
    global $tweets_processed, $tweets_failed, $tweets_success, $valid_timeline, $empty_timeline, $invalid_timeline, $populated_timeline, $total_timeline, $all_tweet_ids, $all_users, $bin_name;
    $tweetQueue = new TweetQueue();
    $total_timeline++;
    ini_set('auto_detect_line_endings', true);
    $handle = @fopen($filepath, "r");
    if ($handle) {
        // 40960 bytes is assumed to fit the longest single-line tweet export — TODO confirm
        while (($buffer = fgets($handle, 40960)) !== false) {
            $tweet = json_decode($buffer, true);
            $buffer = "";
            // Fix: skip (and count) lines that do not decode to an array
            if (!is_array($tweet)) {
                $tweets_failed++;
                continue;
            }
            $t = new Tweet();
            $t->fromJSON($tweet);
            if (!$t->isInBin($bin_name)) {
                $tweetQueue->push($t, $bin_name);
                if ($tweetQueue->length() > 100) {
                    $tweetQueue->insertDB();
                }
                $all_users[] = $t->from_user_id;
                $all_tweet_ids[] = $t->id;
                $tweets_processed++;
            }
            print ".";
        }
        if (!feof($handle)) {
            echo "Error: unexpected fgets() fail\n";
        }
        fclose($handle);
    }
    // flush whatever is still queued
    if ($tweetQueue->length() > 0) {
        $tweetQueue->insertDB();
    }
}
/**
 * Hydrate a list of tweet ids via the 1.1/statuses/lookup endpoint, in
 * batches of up to 100 ids per request, and queue every tweet not yet in
 * $bin_name. Rotates REST API keys via getRESTKey() when the current key
 * runs out of calls.
 *
 * Fix: on failure the original indexed the payload *string*
 * ($tmhOAuth->response['response']['code'] / ['info'] / ['error'] / ['errno']);
 * those diagnostics live at the top level of $tmhOAuth->response.
 *
 * @param array $idlist flat list of tweet ids
 */
function search($idlist) {
    global $twitter_keys, $current_key, $all_users, $all_tweet_ids, $bin_name, $dbh, $tweetQueue;
    $keyinfo = getRESTKey(0);
    $current_key = $keyinfo['key'];
    $ratefree = $keyinfo['remaining'];
    print "current key {$current_key} ratefree {$ratefree}\n";
    $tmhOAuth = new tmhOAuth(array('consumer_key' => $twitter_keys[$current_key]['twitter_consumer_key'], 'consumer_secret' => $twitter_keys[$current_key]['twitter_consumer_secret'], 'token' => $twitter_keys[$current_key]['twitter_user_token'], 'secret' => $twitter_keys[$current_key]['twitter_user_secret']));
    // by hundred
    for ($i = 0; $i < sizeof($idlist); $i += 100) {
        // rotate keys when exhausted; the % 10 check re-validates periodically
        if ($ratefree <= 0 || $ratefree % 10 == 0) {
            $keyinfo = getRESTKey($current_key);
            $current_key = $keyinfo['key'];
            $ratefree = $keyinfo['remaining'];
            $tmhOAuth = new tmhOAuth(array('consumer_key' => $twitter_keys[$current_key]['twitter_consumer_key'], 'consumer_secret' => $twitter_keys[$current_key]['twitter_consumer_secret'], 'token' => $twitter_keys[$current_key]['twitter_user_token'], 'secret' => $twitter_keys[$current_key]['twitter_user_secret']));
        }
        // build the comma-separated id parameter for this batch (up to 100 ids)
        $q = $idlist[$i];
        $n = $i + 1;
        while ($n < $i + 100) {
            if (!isset($idlist[$n])) {
                break;
            }
            $q .= "," . $idlist[$n];
            $n++;
        }
        $params = array('id' => $q);
        $code = $tmhOAuth->user_request(array('method' => 'GET', 'url' => $tmhOAuth->url('1.1/statuses/lookup'), 'params' => $params));
        $ratefree--;
        if ($tmhOAuth->response['code'] == 200) {
            $data = json_decode($tmhOAuth->response['response'], true);
            if (is_array($data) && empty($data)) {
                // all tweets in set are deleted
                continue;
            }
            $tweets = $data;
            $tweet_ids = array();
            foreach ($tweets as $tweet) {
                $t = new Tweet();
                $t->fromJSON($tweet);
                if (!$t->isInBin($bin_name)) {
                    $all_users[] = $t->from_user_id;
                    $all_tweet_ids[] = $t->id;
                    $tweet_ids[] = $t->id;
                    $tweetQueue->push($t, $bin_name);
                }
                print ".";
            }
            sleep(1);
        } else {
            // Fix: HTTP code and cURL diagnostics are top-level keys of the response array
            echo "Failure with code " . $tmhOAuth->response['code'] . "\n";
            var_dump($tmhOAuth->response['info']);
            var_dump($tmhOAuth->response['error']);
            var_dump($tmhOAuth->response['errno']);
            die;
        }
        $tweetQueue->insertDB();
    }
}
/**
 * Recursively page backwards through 1.1/search/tweets for $keywords,
 * queueing tweets not yet in $bin_name. Each page requests up to 100 tweets;
 * the next page uses the smallest id seen as max_id. Recursion stops when a
 * page yields zero or one tweet ids.
 *
 * Fix: the error branch did $tmhOAuth->response['response']['errors']['code'],
 * i.e. indexed the raw JSON payload string — that can never produce an error
 * code. The payload is now decoded and errors[0].code is read (Twitter's
 * error envelope is {"errors":[{"code":N,...}]}).
 *
 * @param string      $keywords search query
 * @param string|null $max_id   page backwards from this tweet id
 * @return bool|null false when no more tweets are found
 */
function search($keywords, $max_id = null) {
    global $twitter_keys, $current_key, $ratefree, $bin_name, $dbh, $tweetQueue;
    $ratefree--;
    // rotate to a fresh API key when the current one runs dry
    if ($ratefree < 1 || $ratefree % 10 == 0) {
        $keyinfo = getRESTKey($current_key, 'search', 'tweets');
        $current_key = $keyinfo['key'];
        $ratefree = $keyinfo['remaining'];
    }
    $tmhOAuth = new tmhOAuth(array('consumer_key' => $twitter_keys[$current_key]['twitter_consumer_key'], 'consumer_secret' => $twitter_keys[$current_key]['twitter_consumer_secret'], 'token' => $twitter_keys[$current_key]['twitter_user_token'], 'secret' => $twitter_keys[$current_key]['twitter_user_secret']));
    $params = array('q' => $keywords, 'count' => 100);
    if (isset($max_id)) {
        $params['max_id'] = $max_id;
    }
    $code = $tmhOAuth->user_request(array('method' => 'GET', 'url' => $tmhOAuth->url('1.1/search/tweets'), 'params' => $params));
    if ($tmhOAuth->response['code'] == 200) {
        $data = json_decode($tmhOAuth->response['response'], true);
        // Fix: guard against a malformed payload without a 'statuses' key
        $tweets = isset($data['statuses']) ? $data['statuses'] : array();
        $tweet_ids = array();
        foreach ($tweets as $tweet) {
            $t = new Tweet();
            $t->fromJSON($tweet);
            $tweet_ids[] = $t->id;
            if (!$t->isInBin($bin_name)) {
                $tweetQueue->push($t, $bin_name);
                if ($tweetQueue->length() > 100) {
                    $tweetQueue->insertDB();
                }
                print ".";
            }
        }
        if (!empty($tweet_ids)) {
            print "\n";
            if (count($tweet_ids) <= 1) {
                print "no more tweets found\n\n";
                return false;
            }
            // the oldest id seen becomes the upper bound of the next page
            $max_id = min($tweet_ids);
            print "max id: " . $max_id . "\n";
        } else {
            print "0 tweets found\n\n";
            return false;
        }
        sleep(1);
        search($keywords, $max_id);
    } else {
        echo $tmhOAuth->response['response'] . "\n";
        // Fix: decode the JSON payload before inspecting the error code
        $errors = json_decode($tmhOAuth->response['response'], true);
        $error_code = isset($errors['errors'][0]['code']) ? $errors['errors'][0]['code'] : null;
        if ($error_code == 130) {
            // over capacity
            sleep(1);
            search($keywords, $max_id);
        }
    }
}
/**
 * Import a "sylvester" timeline export into the current capture bin.
 * Such files concatenate multiple JSON array exports back to back
 * ("[{...}][{...}]"), so the content is split on the "}][{" boundary and
 * each fragment is re-bracketed and decoded on its own. Per-fragment
 * validity counters ($valid_timeline etc.) are maintained via globals.
 *
 * Fixes:
 *  - file_get_contents() returning false (unreadable file) is now handled
 *    instead of being exploded as if it were a string
 *  - fragments that fail to decode are skipped; the original still ran
 *    foreach over the non-array result (a PHP warning, no tweets)
 *
 * @param string $filepath path of the concatenated JSON export
 * @param mixed  $dbh      database handle (unused here; TweetQueue performs the inserts)
 */
function process_json_file_timeline($filepath, $dbh) {
    global $tweets_processed, $tweets_failed, $tweets_success, $valid_timeline, $empty_timeline, $invalid_timeline, $populated_timeline, $total_timeline, $all_tweet_ids, $all_users, $bin_name;
    $tweetQueue = new TweetQueue();
    $total_timeline++;
    $filestr = file_get_contents($filepath);
    // Fix: bail out when the file cannot be read
    if ($filestr === false) {
        $invalid_timeline++;
        return;
    }
    // sylvester stores multiple json exports in the same file,
    // in order to decode it we will need to split it into its respective individual exports
    $jsons = explode("}][{", $filestr);
    print count($jsons) . " jsons found\n";
    foreach ($jsons as $json) {
        // restore the brackets consumed by the split delimiter
        if (substr($json, 0, 2) != "[{") {
            $json = "[{" . $json;
        }
        if (substr($json, -2) != "}]") {
            $json = $json . "}]";
        }
        $timeline = json_decode($json);
        if (is_array($timeline)) {
            $valid_timeline++;
            if (!empty($timeline)) {
                $populated_timeline++;
            } else {
                $empty_timeline++;
            }
        } else {
            $invalid_timeline++;
            // Fix: nothing to iterate for an invalid fragment
            continue;
        }
        foreach ($timeline as $tweet) {
            $t = new Tweet();
            $t->fromJSON($tweet);
            if (!$t->isInBin($bin_name)) {
                $tweetQueue->push($t, $bin_name);
                if ($tweetQueue->length() > 100) {
                    $tweetQueue->insertDB();
                }
                $all_users[] = $t->user->id;
                $all_tweet_ids[] = $t->id;
                $tweets_processed++;
            }
        }
    }
    // flush whatever is still queued
    if ($tweetQueue->length() > 0) {
        $tweetQueue->insertDB();
    }
}
/**
 * Match every tweet in $capturebucket against all active query bins and push
 * matching tweets onto the global $tweetQueue, which is flushed to the
 * database once at the end.
 *
 * Matching behaviour depends on the CAPTURE role constant:
 *   - "track":      per-query keyword matching, or geobox matching for bins
 *                   of type 'geotrack' (see the geolocation notes inline)
 *   - "follow":     the tweet's user id must appear in the bin's queries
 *   - "onepercent": every tweet matches every bin
 *
 * @param array $capturebucket list of decoded tweet arrays from the streaming API
 * @return bool TRUE always
 */
function processtweets($capturebucket) {
    global $tweetQueue;
    $querybins = getActiveBins();
    // cache bin types so getBinType() is called once per bin, not once per tweet
    $bintypes = array();
    foreach ($querybins as $binname => $queries) {
        $bintypes[$binname] = getBinType($binname);
    }
    // running through every single tweet
    foreach ($capturebucket as $data) {
        if (!array_key_exists('entities', $data)) {
            // unexpected/irregular tweet data
            if (array_key_exists('delete', $data)) {
                // a tweet has been deleted. @todo: process
                continue;
            }
            // this can get very verbose when repeated?
            logit(CAPTURE . ".error.log", "irregular tweet data received.");
            continue;
        }
        // we run through every bin to check whether the received tweets fit
        foreach ($querybins as $binname => $queries) {
            $geobin = isset($bintypes[$binname]) && $bintypes[$binname] == 'geotrack';
            if ($geobin && (!array_key_exists('geo_enabled', $data['user']) || $data['user']['geo_enabled'] !== true)) {
                // in geobins, process only geo tweets
                continue;
            }
            $found = false;
            if (CAPTURE == "track") {
                // we check for every query in the bin if they fit
                foreach ($queries as $query => $track) {
                    if ($geobin) {
                        $boxes = getGeoBoxes($track);
                        // look for geolocation matches
                        /*
                         * Some notes on geolocation tracking
                         *
                         * Geolocation tracking is done inside the capture role: track
                         * Geolocation query bins have a special type: geotrack
                         * Geolocation phrases have a specific format:
                         * = these phrases are a chain of geoboxes defined as 4 comma separated values (sw long, sw lat, ne long, ne lat)
                         * = multiple world areas can thus be defined per bin
                         *
                         * Fetching (from Twitter)
                         *
                         * 1) Twitter will give us all the tweets which have explicit GPS coordinates inside one of our queried areas.
                         * 2) Additionally Twitter gives us those tweets with a user 'place' definition. A place (i.e. Paris) is itself a (set of) gps polygons.
                         *    Twitter returns the tweets if one of these place polygons covers the same area as our geo boxes.
                         *
                         * And matching (by us)
                         *
                         * 1) These tweets will be put in the bin if the coordinate pair (longitude, latitude) fits in any one of the defined geoboxes in the bin.
                         * 2) These tweets will be put in the bin if the geobox is _not_ completely subsumed by the place (example: the place is France
                         *    and the geobox is Paris), but the geobox does overlap the place polygon or the geobox subsumes the place polygon.
                         */
                        if ($data["geo"] != null) {
                            // tweet carries explicit GPS coordinates: [lat, lng]
                            $tweet_lat = $data["geo"]["coordinates"][0];
                            $tweet_lng = $data["geo"]["coordinates"][1];
                            // does the tweet geo data fit in one of the boxes?
                            foreach ($boxes as $box) {
                                if (coordinatesInsideBoundingBox($tweet_lng, $tweet_lat, $box['sw_lng'], $box['sw_lat'], $box['ne_lng'], $box['ne_lat'])) {
                                    // logit(CAPTURE . ".error.log", "(debug) tweet with lng $tweet_lng and lat $tweet_lat versus (sw: " . $box['sw_lng'] . "," . $box['sw_lat'] . " ne: " . $box['ne_lng'] . "," . $box['ne_lat'] . ") matched to be inside the area");
                                    $found = true;
                                    break;
                                } else {
                                    // logit(CAPTURE . ".error.log", "(debug) tweet with lng $tweet_lng and lat $tweet_lat versus (sw: " . $box['sw_lng'] . "," . $box['sw_lat'] . " ne: " . $box['ne_lng'] . "," . $box['ne_lat'] . ") falls outside the area");
                                }
                            }
                        } else {
                            // this is a gps tracking query, but the tweet has no gps geo data
                            // Twitter may have matched this tweet based on the user-defined location data
                            if (array_key_exists('place', $data) && is_array($data['place']) && array_key_exists('bounding_box', $data['place'])) {
                                // Make a geoPHP object of the polygon(s) defining the place, by using a WKT (well-known text) string
                                $wkt = 'POLYGON(';
                                $polfirst = true;
                                foreach ($data['place']['bounding_box']['coordinates'] as $p => $pol) {
                                    if ($polfirst) {
                                        $polfirst = false;
                                    } else {
                                        $wkt .= ', ';
                                    }
                                    $wkt .= '(';
                                    $first = true;
                                    $first_lng = 0;
                                    $first_lat = 0;
                                    foreach ($data['place']['bounding_box']['coordinates'][$p] as $i => $coords) {
                                        // each coordinate pair is [lng, lat]
                                        $point_lng = $coords[0];
                                        $point_lat = $coords[1];
                                        if ($first) {
                                            // remember the first vertex so the ring can be closed below
                                            $first = false;
                                            $first_lng = $point_lng;
                                            $first_lat = $point_lat;
                                        } else {
                                            $wkt .= ', ';
                                        }
                                        $wkt .= $point_lng . ' ' . $point_lat;
                                    }
                                    // end where we started
                                    $wkt .= ', ' . $first_lng . ' ' . $first_lat;
                                    $wkt .= ')';
                                }
                                $wkt .= ')';
                                $place = geoPHP::load($wkt, 'wkt');
                                // iterate over geoboxes in our track
                                // place should not spatially contain our box, but it should overlap with it
                                foreach ($boxes as $box) {
                                    // 'POLYGON((x1 y1, x1 y2, x2 y2, x2 y1, x1 y1))'
                                    $boxwkt = 'POLYGON((' . $box['sw_lng'] . ' ' . $box['sw_lat'] . ', ' . $box['sw_lng'] . ' ' . $box['ne_lat'] . ', ' . $box['ne_lng'] . ' ' . $box['ne_lat'] . ', ' . $box['ne_lng'] . ' ' . $box['sw_lat'] . ', ' . $box['sw_lng'] . ' ' . $box['sw_lat'] . '))';
                                    $versus = geoPHP::load($boxwkt, 'wkt');
                                    $contains = $place->contains($versus);
                                    $boxcontains = $versus->contains($place);
                                    $overlaps = $place->overlaps($versus);
                                    if (!$contains && ($boxcontains || $overlaps)) {
                                        // logit(CAPTURE . ".error.log", "place polygon $wkt allies with geobox $boxwkt");
                                        $found = true;
                                        break;
                                    }
                                }
                            }
                        }
                        if ($found) {
                            break;
                        }
                    } else {
                        // look for keyword matches
                        $pass = false;
                        // check for queries with more than one word, but go around quoted queries
                        // NOTE(review): $query is interpolated into the regex without preg_quote();
                        // metacharacters in a track phrase would alter matching — confirm intended.
                        if (preg_match("/ /", $query) && !preg_match("/'/", $query)) {
                            // multi-word query: every word must appear somewhere in the text
                            $tmplist = explode(" ", $query);
                            $all = true;
                            foreach ($tmplist as $tmp) {
                                if (!preg_match("/" . $tmp . "/i", $data["text"])) {
                                    $all = false;
                                    break;
                                }
                            }
                            // only if all words are found
                            if ($all == true) {
                                $pass = true;
                            }
                        } else {
                            // treat quoted queries as single words
                            $query = preg_replace("/'/", "", $query);
                            if (preg_match("/" . $query . "/i", $data["text"])) {
                                $pass = true;
                            }
                        }
                        // at the first fitting query, we break
                        if ($pass == true) {
                            $found = true;
                            break;
                        }
                    }
                }
            } elseif (CAPTURE == "follow") {
                // we check for every query in the bin if they fit
                $found = in_array($data["user"]["id"], $queries) ? TRUE : FALSE;
            } elseif (CAPTURE == "onepercent") {
                // always match in onepercent
                $found = true;
            }
            // if the tweet does not fit in the current bin, go to the next tweet
            if ($found == false) {
                continue;
            }
            $tweet = new Tweet();
            $tweet->fromJSON($data);
            $tweetQueue->push($tweet, $binname);
        }
    }
    $tweetQueue->insertDB();
    return TRUE;
}
/**
 * Recursively fetch a user's timeline via 1.1/statuses/user_timeline, paging
 * backwards with max_id until a page yields zero or one tweet ids. Tweets
 * not yet in $bin_name are queued and flushed in batches of ~100.
 *
 * Fixes:
 *  - the error branch dereferenced json_decode(...)->errors[0]->code with no
 *    guard; a non-JSON payload or missing errors list now falls through to
 *    the generic "API error" message instead of erroring
 *  - a malformed 200 payload (json_decode() -> null) is treated as an empty
 *    page instead of being handed to foreach
 *
 * @param mixed       $user_id user id or screen name, interpreted per $type
 * @param string      $type    "user_id" to query by id; anything else queries by screen_name
 * @param string|null $max_id  page backwards from this tweet id
 * @return bool|null false when no more tweets are found
 */
function get_timeline($user_id, $type, $max_id = null) {
    print "doing {$user_id}\n";
    global $twitter_keys, $current_key, $ratefree, $looped, $bin_name, $dbh, $tweetQueue;
    $ratefree--;
    // rotate to a fresh API key when the current one runs dry
    if ($ratefree < 1 || $ratefree % 10 == 0) {
        $keyinfo = getRESTKey($current_key, 'statuses', 'user_timeline');
        $current_key = $keyinfo['key'];
        $ratefree = $keyinfo['remaining'];
    }
    $tmhOAuth = new tmhOAuth(array('consumer_key' => $twitter_keys[$current_key]['twitter_consumer_key'], 'consumer_secret' => $twitter_keys[$current_key]['twitter_consumer_secret'], 'token' => $twitter_keys[$current_key]['twitter_user_token'], 'secret' => $twitter_keys[$current_key]['twitter_user_secret']));
    $params = array('count' => 200, 'trim_user' => false, 'exclude_replies' => false, 'contributor_details' => true, 'include_rts' => 1);
    if ($type == "user_id") {
        $params['user_id'] = $user_id;
    } else {
        $params['screen_name'] = $user_id;
    }
    if (isset($max_id)) {
        $params['max_id'] = $max_id;
    }
    $tmhOAuth->user_request(array('method' => 'GET', 'url' => $tmhOAuth->url('1.1/statuses/user_timeline'), 'params' => $params));
    if ($tmhOAuth->response['code'] == 200) {
        $tweets = json_decode($tmhOAuth->response['response'], true);
        // Fix: a malformed 200 payload decodes to null; treat it as an empty page
        if (!is_array($tweets)) {
            $tweets = array();
        }
        // store in db
        $tweet_ids = array();
        foreach ($tweets as $tweet) {
            $t = new Tweet();
            $t->fromJSON($tweet);
            $tweet_ids[] = $t->id;
            if (!$t->isInBin($bin_name)) {
                $tweetQueue->push($t, $bin_name);
                print ".";
                if ($tweetQueue->length() > 100) {
                    $tweetQueue->insertDB();
                }
            }
        }
        if (!empty($tweet_ids)) {
            print "\n";
            if (count($tweet_ids) <= 1) {
                print "no more tweets found\n\n";
                return false;
            }
            // the oldest id seen becomes the upper bound of the next page
            $max_id = min($tweet_ids);
            print "max id: " . $max_id . "\n";
        } else {
            print "0 tweets found\n\n";
            return false;
        }
        sleep(1);
        get_timeline($user_id, $type, $max_id);
    } else {
        // Fix: guard the error-code lookup; the payload may not decode or carry an errors list
        $decoded = json_decode($tmhOAuth->response['response']);
        $error_code = isset($decoded->errors[0]->code) ? $decoded->errors[0]->code : null;
        if ($error_code == 130) {
            print "Twitter is over capacity, sleeping 5 seconds before retry\n";
            sleep(5);
            get_timeline($user_id, $type, $max_id);
        } elseif ($error_code == 88) {
            print "API key rate limit exceeded, sleeping 60 seconds before retry\n";
            sleep(60);
            get_timeline($user_id, $type, $max_id);
        } else {
            echo "\nAPI error: " . $tmhOAuth->response['response'] . "\n";
        }
    }
}
/**
 * Hydrate a list of tweet ids via 1.1/statuses/lookup in batches of up to
 * 100 ids, retrying transient failures (503 or other non-200 responses) up
 * to 4 times per batch with a fresh connection before giving up.
 *
 * Fixes:
 *  - $retries is now initialised before the loop; it was read ($retries < 4)
 *    and reset on success without ever being assigned first
 *  - the retry "rewind" was $i--, but the loop steps by 100, so a retry
 *    actually resumed at $i+99 (misaligned batch); $i -= 100 re-runs the
 *    same batch after the post-iteration increment
 *  - failure diagnostics are read from the top level of $tmhOAuth->response
 *    (code/info/error/errno), not from the payload string
 *
 * @param array $idlist flat list of tweet ids
 */
function search($idlist) {
    global $twitter_keys, $current_key, $all_users, $all_tweet_ids, $bin_name, $dbh, $tweetQueue;
    $keyinfo = getRESTKey(0);
    $current_key = $keyinfo['key'];
    $ratefree = $keyinfo['remaining'];
    print "\ncurrent key {$current_key} ratefree {$ratefree}\n";
    $tmhOAuth = new tmhOAuth(array('consumer_key' => $twitter_keys[$current_key]['twitter_consumer_key'], 'consumer_secret' => $twitter_keys[$current_key]['twitter_consumer_secret'], 'token' => $twitter_keys[$current_key]['twitter_user_token'], 'secret' => $twitter_keys[$current_key]['twitter_user_secret']));
    // Fix: initialise the per-batch retry counter (was undefined on first failure)
    $retries = 0;
    // by hundred
    for ($i = 0; $i < sizeof($idlist); $i += 100) {
        // rotate keys when exhausted; the % 10 check re-validates periodically
        if ($ratefree <= 0 || $ratefree % 10 == 0) {
            print "\n";
            $keyinfo = getRESTKey($current_key);
            $current_key = $keyinfo['key'];
            $ratefree = $keyinfo['remaining'];
            $tmhOAuth = new tmhOAuth(array('consumer_key' => $twitter_keys[$current_key]['twitter_consumer_key'], 'consumer_secret' => $twitter_keys[$current_key]['twitter_consumer_secret'], 'token' => $twitter_keys[$current_key]['twitter_user_token'], 'secret' => $twitter_keys[$current_key]['twitter_user_secret']));
        }
        // build the comma-separated id parameter for this batch (up to 100 ids)
        $q = $idlist[$i];
        $n = $i + 1;
        while ($n < $i + 100) {
            if (!isset($idlist[$n])) {
                break;
            }
            $q .= "," . $idlist[$n];
            $n++;
        }
        $params = array('id' => $q);
        $code = $tmhOAuth->user_request(array('method' => 'GET', 'url' => $tmhOAuth->url('1.1/statuses/lookup'), 'params' => $params));
        $ratefree--;
        $reset_connection = false;
        if ($tmhOAuth->response['code'] == 200) {
            $data = json_decode($tmhOAuth->response['response'], true);
            if (is_array($data) && empty($data)) {
                // all tweets in set are deleted
                continue;
            }
            $tweets = $data;
            $tweet_ids = array();
            foreach ($tweets as $tweet) {
                $t = new Tweet();
                $t->fromJSON($tweet);
                if (!$t->isInBin($bin_name)) {
                    $all_users[] = $t->from_user_id;
                    $all_tweet_ids[] = $t->id;
                    $tweet_ids[] = $t->id;
                    $tweetQueue->push($t, $bin_name);
                }
                print ".";
            }
            sleep(1);
            $retries = 0; // reset retry counter on success
        } else {
            if ($retries < 4 && $tmhOAuth->response['code'] == 503) {
                /* this indicates problems on the Twitter side, such as overcapacity.
                   we slow down and retry the connection */
                print "!";
                sleep(7);
                $i -= 100; // rewind: re-run this batch after the loop's += 100
                $retries++;
                $reset_connection = true;
            } else {
                if ($retries < 4) {
                    print "\n";
                    // Fix: diagnostics are top-level keys of the response array
                    print "Failure with code " . $tmhOAuth->response['code'] . "\n";
                    var_dump($tmhOAuth->response['info']);
                    var_dump($tmhOAuth->response['error']);
                    var_dump($tmhOAuth->response['errno']);
                    print "The above error may not be permanent. We will sleep and retry the request.\n";
                    sleep(7);
                    $i -= 100; // rewind: re-run this batch after the loop's += 100
                    $retries++;
                    $reset_connection = true;
                } else {
                    print "\n";
                    print "Permanent error when querying the Twitter API. Please investigate the error output. Now stopping.\n";
                    exit(1);
                }
            }
        }
        if ($reset_connection) {
            // rebuild the OAuth client before retrying the failed batch
            $tmhOAuth = new tmhOAuth(array('consumer_key' => $twitter_keys[$current_key]['twitter_consumer_key'], 'consumer_secret' => $twitter_keys[$current_key]['twitter_consumer_secret'], 'token' => $twitter_keys[$current_key]['twitter_user_token'], 'secret' => $twitter_keys[$current_key]['twitter_user_secret']));
            $reset_connection = false;
        } else {
            $tweetQueue->insertDB();
        }
    }
}