예제 #1
0
/**
 * Imports a line-delimited JSON file (one tweet object per line) into the current bin.
 *
 * Every tweet that is not yet in the bin named by the global $bin_name is pushed onto a
 * local TweetQueue. The queue is written to the database whenever it holds more than
 * 100 tweets, and once more after the last line. As a side effect the globals
 * $total_timeline, $tweets_processed, $tweets_failed, $all_users and $all_tweet_ids
 * are updated.
 *
 * @param string $filepath path of the file to import
 * @param mixed  $dbh      database handle; kept for interface compatibility, not used in this function
 * @return void
 */
function process_json_file_timeline($filepath, $dbh)
{
    global $tweets_processed, $tweets_failed, $tweets_success, $valid_timeline, $empty_timeline, $invalid_timeline, $populated_timeline, $total_timeline, $all_tweet_ids, $all_users, $bin_name;
    $tweetQueue = new TweetQueue();
    $total_timeline++;
    ini_set('auto_detect_line_endings', true);
    // warnings are suppressed here because a failure is reported explicitly below
    $handle = @fopen($filepath, "r");
    if (!$handle) {
        echo "Error: could not open {$filepath}\n";
    } else {
        // No length argument: fgets() then reads up to the end of the line. A fixed cap
        // would cut long tweets into fragments that cannot be decoded.
        while (($buffer = fgets($handle)) !== false) {
            $tweet = json_decode($buffer, true);
            $is_blank = trim($buffer) === '';
            // release the raw line early, the decoded array is all we need
            $buffer = "";
            if (!is_array($tweet)) {
                // blank or malformed line: there is nothing to build a Tweet from
                if (!$is_blank) {
                    // NOTE(review): assumes $tweets_failed is the counter for rejected
                    // input lines - confirm against the calling script
                    $tweets_failed++;
                }
                print ".";
                continue;
            }
            $t = new Tweet();
            $t->fromJSON($tweet);
            if (!$t->isInBin($bin_name)) {
                $tweetQueue->push($t, $bin_name);
                if ($tweetQueue->length() > 100) {
                    $tweetQueue->insertDB();
                }
                $all_users[] = $t->from_user_id;
                $all_tweet_ids[] = $t->id;
                $tweets_processed++;
            }
            // progress indicator, one dot per line read
            print ".";
        }
        if (!feof($handle)) {
            echo "Error: unexpected fgets() fail\n";
        }
        fclose($handle);
    }
    // flush whatever is left in the queue
    if ($tweetQueue->length() > 0) {
        $tweetQueue->insertDB();
    }
}
예제 #2
0
/**
 * Looks up tweets by id through the statuses/lookup endpoint, 100 ids per request,
 * and stores the ones that are not yet in the bin named by the global $bin_name.
 *
 * New tweets are pushed onto the global $tweetQueue, which is written to the database
 * after every batch; their ids and author ids are appended to the globals
 * $all_tweet_ids and $all_users. A fresh REST key is requested via getRESTKey()
 * whenever the remaining-call counter reaches zero or a multiple of ten.
 * Any HTTP status other than 200 dumps the transport details and terminates the script.
 *
 * @param array $idlist sequentially indexed list of tweet ids
 * @return void
 */
function search($idlist)
{
    global $twitter_keys, $current_key, $all_users, $all_tweet_ids, $bin_name, $dbh, $tweetQueue;
    $keyinfo = getRESTKey(0);
    $current_key = $keyinfo['key'];
    $ratefree = $keyinfo['remaining'];
    print "current key {$current_key} ratefree {$ratefree}\n";
    $tmhOAuth = new tmhOAuth(array(
        'consumer_key' => $twitter_keys[$current_key]['twitter_consumer_key'],
        'consumer_secret' => $twitter_keys[$current_key]['twitter_consumer_secret'],
        'token' => $twitter_keys[$current_key]['twitter_user_token'],
        'secret' => $twitter_keys[$current_key]['twitter_user_secret'],
    ));
    $total = count($idlist);
    // by hundred
    for ($i = 0; $i < $total; $i += 100) {
        if ($ratefree <= 0 || $ratefree % 10 == 0) {
            // re-check which key to use and rebuild the client with its credentials
            $keyinfo = getRESTKey($current_key);
            $current_key = $keyinfo['key'];
            $ratefree = $keyinfo['remaining'];
            $tmhOAuth = new tmhOAuth(array(
                'consumer_key' => $twitter_keys[$current_key]['twitter_consumer_key'],
                'consumer_secret' => $twitter_keys[$current_key]['twitter_consumer_secret'],
                'token' => $twitter_keys[$current_key]['twitter_user_token'],
                'secret' => $twitter_keys[$current_key]['twitter_user_secret'],
            ));
        }
        // comma separated list of (at most) the next hundred ids
        $q = implode(",", array_slice($idlist, $i, 100));
        $params = array('id' => $q);
        $tmhOAuth->user_request(array('method' => 'GET', 'url' => $tmhOAuth->url('1.1/statuses/lookup'), 'params' => $params));
        $ratefree--;
        if ($tmhOAuth->response['code'] == 200) {
            $data = json_decode($tmhOAuth->response['response'], true);
            if (!is_array($data) || empty($data)) {
                // an empty list means all tweets in this set are deleted; a body that
                // does not decode to an array carries nothing to store either
                continue;
            }
            foreach ($data as $tweet) {
                $t = new Tweet();
                $t->fromJSON($tweet);
                if (!$t->isInBin($bin_name)) {
                    $all_users[] = $t->from_user_id;
                    $all_tweet_ids[] = $t->id;
                    $tweetQueue->push($t, $bin_name);
                }
                print ".";
            }
            sleep(1);
        } else {
            // 'code', 'info', 'error' and 'errno' are siblings of the 'response' body
            // string in the tmhOAuth response array, not members of it
            echo "Failure with code " . $tmhOAuth->response['code'] . "\n";
            var_dump($tmhOAuth->response['info']);
            var_dump($tmhOAuth->response['error']);
            var_dump($tmhOAuth->response['errno']);
            die;
        }
        $tweetQueue->insertDB();
    }
}
예제 #3
0
/**
 * Pages backwards through the search/tweets endpoint for a query and queues every
 * result that is not yet in the bin named by the global $bin_name.
 *
 * Each call fetches one page of up to 100 results and then calls itself with the
 * lowest tweet id seen as the new max_id. Paging stops when a page holds no results,
 * or just a single one (max_id is inclusive, so that one is the tweet already seen).
 * The global $tweetQueue is written to the database whenever it holds more than 100
 * tweets; it is not flushed at the end of this function. The globals $ratefree and
 * $current_key are updated while rotating REST keys.
 *
 * @param string   $keywords search query
 * @param mixed    $max_id   only fetch tweets with an id up to this value; null for the newest
 * @return false|null false when this call found no further tweets, null otherwise
 */
function search($keywords, $max_id = null)
{
    global $twitter_keys, $current_key, $ratefree, $bin_name, $dbh, $tweetQueue;
    $ratefree--;
    if ($ratefree < 1 || $ratefree % 10 == 0) {
        $keyinfo = getRESTKey($current_key, 'search', 'tweets');
        $current_key = $keyinfo['key'];
        $ratefree = $keyinfo['remaining'];
    }
    $tmhOAuth = new tmhOAuth(array(
        'consumer_key' => $twitter_keys[$current_key]['twitter_consumer_key'],
        'consumer_secret' => $twitter_keys[$current_key]['twitter_consumer_secret'],
        'token' => $twitter_keys[$current_key]['twitter_user_token'],
        'secret' => $twitter_keys[$current_key]['twitter_user_secret'],
    ));
    $params = array('q' => $keywords, 'count' => 100);
    if (isset($max_id)) {
        $params['max_id'] = $max_id;
    }
    $tmhOAuth->user_request(array('method' => 'GET', 'url' => $tmhOAuth->url('1.1/search/tweets'), 'params' => $params));
    if ($tmhOAuth->response['code'] == 200) {
        $data = json_decode($tmhOAuth->response['response'], true);
        // a body without a 'statuses' list is treated like an empty page
        $tweets = (is_array($data) && isset($data['statuses']) && is_array($data['statuses'])) ? $data['statuses'] : array();
        $tweet_ids = array();
        foreach ($tweets as $tweet) {
            $t = new Tweet();
            $t->fromJSON($tweet);
            $tweet_ids[] = $t->id;
            if (!$t->isInBin($bin_name)) {
                $tweetQueue->push($t, $bin_name);
                if ($tweetQueue->length() > 100) {
                    $tweetQueue->insertDB();
                }
                print ".";
            }
        }
        if (empty($tweet_ids)) {
            print "0 tweets found\n\n";
            return false;
        }
        print "\n";
        if (count($tweet_ids) <= 1) {
            print "no more tweets found\n\n";
            return false;
        }
        // continue from the oldest tweet of this page
        $max_id = min($tweet_ids);
        print "max id: " . $max_id . "\n";
        sleep(1);
        search($keywords, $max_id);
    } else {
        echo $tmhOAuth->response['response'] . "\n";
        // The body is a JSON string; it has to be decoded before the error code can be
        // read (indexing the raw string never yields the code).
        $error = json_decode($tmhOAuth->response['response'], true);
        $error_code = isset($error['errors'][0]['code']) ? $error['errors'][0]['code'] : null;
        if ($error_code == 130) {
            // over capacity
            sleep(1);
            search($keywords, $max_id);
        }
    }
}
예제 #4
0
/**
 * Imports a file that holds one or more JSON timeline exports written back to back
 * ("[{...}][{...}]") into the current bin.
 *
 * The file is split on the "}][{" boundary between exports, each piece is decoded
 * separately, and every tweet that is not yet in the bin named by the global
 * $bin_name is pushed onto a local TweetQueue (written to the database whenever it
 * holds more than 100 tweets, and once more at the end). The globals $total_timeline,
 * $valid_timeline, $populated_timeline, $empty_timeline, $invalid_timeline,
 * $tweets_processed, $all_users and $all_tweet_ids are updated as a side effect.
 *
 * @param string $filepath path of the file to import
 * @param mixed  $dbh      database handle; kept for interface compatibility, not used in this function
 * @return void
 */
function process_json_file_timeline($filepath, $dbh)
{
    global $tweets_processed, $tweets_failed, $tweets_success, $valid_timeline, $empty_timeline, $invalid_timeline, $populated_timeline, $total_timeline, $all_tweet_ids, $all_users, $bin_name;
    $tweetQueue = new TweetQueue();
    $total_timeline++;
    $filestr = file_get_contents($filepath);
    if ($filestr === false || trim($filestr) === '') {
        // Without this guard an unreadable or empty file would be padded to "[{}]"
        // below and imported as one bogus, empty tweet.
        echo "Error: could not read {$filepath} or the file is empty\n";
        return;
    }
    // sylvester stores multiple json exports in the same file,
    // in order to decode it we will need to split it into its respective individual exports
    $jsons = explode("}][{", $filestr);
    $last = count($jsons) - 1;
    print count($jsons) . " jsons found\n";
    foreach ($jsons as $index => $json) {
        // explode() removed the "}][{" boundary: give back the opening "[{" to every
        // piece but the first and the closing "}]" to every piece but the last.
        // Restoring by position (instead of sniffing the piece's first/last two bytes)
        // keeps a lone "[]" export, trailing whitespace, and tweets that happen to end
        // in "}]" intact.
        if ($index > 0) {
            $json = "[{" . $json;
        }
        if ($index < $last) {
            $json = $json . "}]";
        }
        // NOTE(review): decoded to objects (no assoc flag), and $t->user->id is read
        // below - confirm Tweet::fromJSON() accepts objects
        $timeline = json_decode($json);
        if (!is_array($timeline)) {
            // undecodable export: count it and move on, there is nothing to iterate
            $invalid_timeline++;
            continue;
        }
        $valid_timeline++;
        if (!empty($timeline)) {
            $populated_timeline++;
        } else {
            $empty_timeline++;
        }
        foreach ($timeline as $tweet) {
            $t = new Tweet();
            $t->fromJSON($tweet);
            if (!$t->isInBin($bin_name)) {
                $tweetQueue->push($t, $bin_name);
                if ($tweetQueue->length() > 100) {
                    $tweetQueue->insertDB();
                }
                $all_users[] = $t->user->id;
                $all_tweet_ids[] = $t->id;
                $tweets_processed++;
            }
        }
    }
    // flush whatever is left in the queue
    if ($tweetQueue->length() > 0) {
        $tweetQueue->insertDB();
    }
}
예제 #5
0
/**
 * Matches every captured tweet against all active query bins and queues the matches.
 *
 * How a tweet is matched depends on the CAPTURE constant:
 *  - "track":      keyword phrases, or geo boxes for bins of type 'geotrack'
 *  - "follow":     the tweet's user id must be listed in the bin's queries
 *  - "onepercent": every tweet matches every bin
 * A tweet is pushed onto the global $tweetQueue once for each bin it matches; the queue
 * is written to the database after the whole bucket has been processed.
 *
 * @param array $capturebucket decoded tweets (associative arrays) to process
 * @return bool always true
 */
function processtweets($capturebucket)
{
    global $tweetQueue;
    $querybins = getActiveBins();
    // cache bin types
    $bintypes = array();
    foreach ($querybins as $binname => $queries) {
        $bintypes[$binname] = getBinType($binname);
    }
    // running through every single tweet
    foreach ($capturebucket as $data) {
        if (!array_key_exists('entities', $data)) {
            // unexpected/irregular tweet data
            if (array_key_exists('delete', $data)) {
                // a tweet has been deleted. @todo: process
                continue;
            }
            // this can get very verbose when repeated?
            logit(CAPTURE . ".error.log", "irregular tweet data received.");
            continue;
        }
        // we run through every bin to check whether the received tweets fit
        foreach ($querybins as $binname => $queries) {
            $geobin = isset($bintypes[$binname]) && $bintypes[$binname] == 'geotrack';
            // isset() also covers tweets that carry no 'user' element at all
            if ($geobin && (!isset($data['user']['geo_enabled']) || $data['user']['geo_enabled'] !== true)) {
                // in geobins, process only geo tweets
                continue;
            }
            $found = false;
            if (CAPTURE == "track") {
                // we check for every query in the bin if they fit
                foreach ($queries as $query => $track) {
                    if ($geobin) {
                        $boxes = getGeoBoxes($track);
                        // look for geolocation matches
                        /*
                         * Some notes on geolocation tracking
                         *
                         * Geolocation tracking is done inside the capture role: track
                         * Geolocation query bins have a special type: geotrack
                         * Geolocation phrases have a specific format:
                         *             = these phrases are a chain of geoboxes defined as 4 comma separated values (sw long, sw lat, ne long, ne lat)
                         *             = multiple world areas can thus be defined per bin
                         *
                         * Fetching (from Twitter)
                         *
                         * 1) Twitter will give us all the tweets which have explicit GPS coordinates inside one of our queried areas.
                         * 2) Additionally Twitter gives us those tweets with a user 'place' definition. A place (i.e. Paris) is itself a (set of) gps polygons
                         *    Twitter returns the tweets if one of these place polygons covers the same area as our geo boxes.
                         *
                         * And matching (by us)
                         *
                         * 1) These tweets will be put in the bin if the coordinate pair (longitude, latitude) fits in any one of the defined geoboxes in the bin.
                         * 2) These tweets will be put in the bin if the geobox is _not_ completely subsumed by the place (example: the place is France and the geobox is Paris), but the geobox does overlap the place polygon or the geobox subsumes the place polygon.
                         *
                         */
                        if (isset($data["geo"]) && $data["geo"] != null) {
                            // the 'geo' element lists latitude first, longitude second
                            $tweet_lat = $data["geo"]["coordinates"][0];
                            $tweet_lng = $data["geo"]["coordinates"][1];
                            // does the tweet geo data fit in one of the boxes?
                            foreach ($boxes as $box) {
                                if (coordinatesInsideBoundingBox($tweet_lng, $tweet_lat, $box['sw_lng'], $box['sw_lat'], $box['ne_lng'], $box['ne_lat'])) {
                                    $found = true;
                                    break;
                                }
                            }
                        } else {
                            // this is a gps tracking query, but the tweet has no gps geo data
                            // Twitter may have matched this tweet based on the user-defined location data
                            if (array_key_exists('place', $data) && is_array($data['place']) && array_key_exists('bounding_box', $data['place'])) {
                                // Make a geoPHP object of the polygon(s) defining the place, by using a WKT (well-known text) string
                                $wkt = 'POLYGON(';
                                $polfirst = true;
                                foreach ($data['place']['bounding_box']['coordinates'] as $p => $pol) {
                                    if ($polfirst) {
                                        $polfirst = false;
                                    } else {
                                        $wkt .= ', ';
                                    }
                                    $wkt .= '(';
                                    $first = true;
                                    $first_lng = 0;
                                    $first_lat = 0;
                                    foreach ($data['place']['bounding_box']['coordinates'][$p] as $i => $coords) {
                                        $point_lng = $coords[0];
                                        $point_lat = $coords[1];
                                        if ($first) {
                                            $first = false;
                                            $first_lng = $point_lng;
                                            $first_lat = $point_lat;
                                        } else {
                                            $wkt .= ', ';
                                        }
                                        $wkt .= $point_lng . ' ' . $point_lat;
                                    }
                                    // end where we started
                                    $wkt .= ', ' . $first_lng . ' ' . $first_lat;
                                    $wkt .= ')';
                                }
                                $wkt .= ')';
                                $place = geoPHP::load($wkt, 'wkt');
                                // iterate over geoboxes in our track
                                // place should not spatially contain our box, but it should overlap with it
                                foreach ($boxes as $box) {
                                    // 'POLYGON((x1 y1, x1 y2, x2 y2, x2 y1, x1 y1))'
                                    $boxwkt = 'POLYGON((' . $box['sw_lng'] . ' ' . $box['sw_lat'] . ', ' . $box['sw_lng'] . ' ' . $box['ne_lat'] . ', ' . $box['ne_lng'] . ' ' . $box['ne_lat'] . ', ' . $box['ne_lng'] . ' ' . $box['sw_lat'] . ', ' . $box['sw_lng'] . ' ' . $box['sw_lat'] . '))';
                                    $versus = geoPHP::load($boxwkt, 'wkt');
                                    $contains = $place->contains($versus);
                                    $boxcontains = $versus->contains($place);
                                    $overlaps = $place->overlaps($versus);
                                    if (!$contains && ($boxcontains || $overlaps)) {
                                        $found = true;
                                        break;
                                    }
                                }
                            }
                        }
                        if ($found) {
                            break;
                        }
                    } else {
                        // look for keyword matches
                        $pass = false;
                        // check for queries with more than one word, but go around quoted queries
                        if (preg_match("/ /", $query) && !preg_match("/'/", $query)) {
                            $tmplist = explode(" ", $query);
                            $all = true;
                            foreach ($tmplist as $tmp) {
                                // preg_quote(): the words are literal text, not patterns. Unescaped,
                                // a word containing e.g. "/", "(" or "+" either fails to compile
                                // (and never matches) or matches something else entirely.
                                if (!preg_match("/" . preg_quote($tmp, "/") . "/i", $data["text"])) {
                                    $all = false;
                                    break;
                                }
                            }
                            // only if all words are found
                            if ($all == true) {
                                $pass = true;
                            }
                        } else {
                            // treat quoted queries as single words
                            $query = preg_replace("/'/", "", $query);
                            if (preg_match("/" . preg_quote($query, "/") . "/i", $data["text"])) {
                                $pass = true;
                            }
                        }
                        // at the first fitting query, we break
                        if ($pass == true) {
                            $found = true;
                            break;
                        }
                    }
                }
            } elseif (CAPTURE == "follow") {
                // we check for every query in the bin if they fit
                // NOTE(review): loose comparison kept on purpose - the stored ids may be
                // strings while the tweet carries an integer; confirm before making it strict
                $found = in_array($data["user"]["id"], $queries) ? TRUE : FALSE;
            } elseif (CAPTURE == "onepercent") {
                // always match in onepercent
                $found = true;
            }
            // if the tweet does not fit in the current bin, go to the next bin
            if ($found == false) {
                continue;
            }
            $tweet = new Tweet();
            $tweet->fromJSON($data);
            $tweetQueue->push($tweet, $binname);
        }
    }
    $tweetQueue->insertDB();
    return TRUE;
}
예제 #6
0
/**
 * Pages backwards through a user's timeline (statuses/user_timeline) and queues every
 * tweet that is not yet in the bin named by the global $bin_name.
 *
 * Each call fetches one page of up to 200 tweets and then calls itself with the lowest
 * tweet id seen as the new max_id. Paging stops when a page holds no tweets, or just a
 * single one (max_id is inclusive, so that one is the tweet already seen). The global
 * $tweetQueue is written to the database whenever it holds more than 100 tweets; it is
 * not flushed at the end of this function. On Twitter error 130 (over capacity) and 88
 * (rate limit) the request is repeated after a pause; any other error is printed and
 * the function gives up on this user.
 *
 * @param mixed  $user_id user id, or screen name when $type is not "user_id"
 * @param string $type    "user_id" to query by id, anything else to query by screen name
 * @param mixed  $max_id  only fetch tweets with an id up to this value; null for the newest
 * @return false|null false when this call found no further tweets, null otherwise
 */
function get_timeline($user_id, $type, $max_id = null)
{
    print "doing {$user_id}\n";
    global $twitter_keys, $current_key, $ratefree, $looped, $bin_name, $dbh, $tweetQueue;
    $ratefree--;
    if ($ratefree < 1 || $ratefree % 10 == 0) {
        $keyinfo = getRESTKey($current_key, 'statuses', 'user_timeline');
        $current_key = $keyinfo['key'];
        $ratefree = $keyinfo['remaining'];
    }
    $tmhOAuth = new tmhOAuth(array(
        'consumer_key' => $twitter_keys[$current_key]['twitter_consumer_key'],
        'consumer_secret' => $twitter_keys[$current_key]['twitter_consumer_secret'],
        'token' => $twitter_keys[$current_key]['twitter_user_token'],
        'secret' => $twitter_keys[$current_key]['twitter_user_secret'],
    ));
    $params = array('count' => 200, 'trim_user' => false, 'exclude_replies' => false, 'contributor_details' => true, 'include_rts' => 1);
    if ($type == "user_id") {
        $params['user_id'] = $user_id;
    } else {
        $params['screen_name'] = $user_id;
    }
    if (isset($max_id)) {
        $params['max_id'] = $max_id;
    }
    $tmhOAuth->user_request(array('method' => 'GET', 'url' => $tmhOAuth->url('1.1/statuses/user_timeline'), 'params' => $params));
    if ($tmhOAuth->response['code'] == 200) {
        $tweets = json_decode($tmhOAuth->response['response'], true);
        if (!is_array($tweets)) {
            // a body that does not decode to a list is treated like an empty page
            $tweets = array();
        }
        // store in db
        $tweet_ids = array();
        foreach ($tweets as $tweet) {
            $t = new Tweet();
            $t->fromJSON($tweet);
            $tweet_ids[] = $t->id;
            if (!$t->isInBin($bin_name)) {
                $tweetQueue->push($t, $bin_name);
                print ".";
                if ($tweetQueue->length() > 100) {
                    $tweetQueue->insertDB();
                }
            }
        }
        if (empty($tweet_ids)) {
            print "0 tweets found\n\n";
            return false;
        }
        print "\n";
        if (count($tweet_ids) <= 1) {
            print "no more tweets found\n\n";
            return false;
        }
        // continue from the oldest tweet of this page
        $max_id = min($tweet_ids);
        print "max id: " . $max_id . "\n";
        sleep(1);
        get_timeline($user_id, $type, $max_id);
    } else {
        // Not every failure carries an 'errors' list (the body may even not be JSON),
        // so the code is extracted defensively instead of dereferencing blindly.
        $error = json_decode($tmhOAuth->response['response'], true);
        $error_code = isset($error['errors'][0]['code']) ? $error['errors'][0]['code'] : null;
        if ($error_code == 130) {
            print "Twitter is over capacity, sleeping 5 seconds before retry\n";
            sleep(5);
            get_timeline($user_id, $type, $max_id);
        } elseif ($error_code == 88) {
            print "API key rate limit exceeded, sleeping 60 seconds before retry\n";
            sleep(60);
            get_timeline($user_id, $type, $max_id);
        } else {
            echo "\nAPI error: " . $tmhOAuth->response['response'] . "\n";
        }
    }
}
예제 #7
0
/**
 * Looks up tweets by id through the statuses/lookup endpoint, 100 ids per request,
 * and stores the ones that are not yet in the bin named by the global $bin_name.
 *
 * New tweets are pushed onto the global $tweetQueue, which is written to the database
 * after every successful batch; their ids and author ids are appended to the globals
 * $all_tweet_ids and $all_users. A fresh REST key is requested via getRESTKey()
 * whenever the remaining-call counter reaches zero or a multiple of ten.
 *
 * A failed request is repeated for the same batch, with a new connection, after a
 * pause of 7 seconds. After four consecutive failed attempts the script exits with
 * status 1.
 *
 * @param array $idlist sequentially indexed list of tweet ids
 * @return void
 */
function search($idlist)
{
    global $twitter_keys, $current_key, $all_users, $all_tweet_ids, $bin_name, $dbh, $tweetQueue;
    $keyinfo = getRESTKey(0);
    $current_key = $keyinfo['key'];
    $ratefree = $keyinfo['remaining'];
    print "\ncurrent key {$current_key} ratefree {$ratefree}\n";
    $oauth_config = array(
        'consumer_key' => $twitter_keys[$current_key]['twitter_consumer_key'],
        'consumer_secret' => $twitter_keys[$current_key]['twitter_consumer_secret'],
        'token' => $twitter_keys[$current_key]['twitter_user_token'],
        'secret' => $twitter_keys[$current_key]['twitter_user_secret'],
    );
    $tmhOAuth = new tmhOAuth($oauth_config);
    $total = count($idlist);
    // consecutive failed attempts for the current batch
    $retries = 0;
    // by hundred
    for ($i = 0; $i < $total; $i += 100) {
        if ($ratefree <= 0 || $ratefree % 10 == 0) {
            print "\n";
            // re-check which key to use and rebuild the client with its credentials
            $keyinfo = getRESTKey($current_key);
            $current_key = $keyinfo['key'];
            $ratefree = $keyinfo['remaining'];
            $oauth_config = array(
                'consumer_key' => $twitter_keys[$current_key]['twitter_consumer_key'],
                'consumer_secret' => $twitter_keys[$current_key]['twitter_consumer_secret'],
                'token' => $twitter_keys[$current_key]['twitter_user_token'],
                'secret' => $twitter_keys[$current_key]['twitter_user_secret'],
            );
            $tmhOAuth = new tmhOAuth($oauth_config);
        }
        // comma separated list of (at most) the next hundred ids
        $q = implode(",", array_slice($idlist, $i, 100));
        $params = array('id' => $q);
        $tmhOAuth->user_request(array('method' => 'GET', 'url' => $tmhOAuth->url('1.1/statuses/lookup'), 'params' => $params));
        $ratefree--;
        if ($tmhOAuth->response['code'] == 200) {
            // reset retry counter on success
            $retries = 0;
            $data = json_decode($tmhOAuth->response['response'], true);
            if (!is_array($data) || empty($data)) {
                // an empty list means all tweets in this set are deleted; a body that
                // does not decode to an array carries nothing to store either
                continue;
            }
            foreach ($data as $tweet) {
                $t = new Tweet();
                $t->fromJSON($tweet);
                if (!$t->isInBin($bin_name)) {
                    $all_users[] = $t->from_user_id;
                    $all_tweet_ids[] = $t->id;
                    $tweetQueue->push($t, $bin_name);
                }
                print ".";
            }
            sleep(1);
            $tweetQueue->insertDB();
            continue;
        }
        if ($retries >= 4) {
            print "\n";
            print "Permanent error when querying the Twitter API. Please investigate the error output. Now stopping.\n";
            exit(1);
        }
        if ($tmhOAuth->response['code'] == 503) {
            /* this indicates problems on the Twitter side, such as overcapacity. we slow down and retry the connection */
            print "!";
        } else {
            print "\n";
            // 'code', 'info', 'error' and 'errno' are siblings of the 'response' body
            // string in the tmhOAuth response array, not members of it
            print "Failure with code " . $tmhOAuth->response['code'] . "\n";
            var_dump($tmhOAuth->response['info']);
            var_dump($tmhOAuth->response['error']);
            var_dump($tmhOAuth->response['errno']);
            print "The above error may not be permanent. We will sleep and retry the request.\n";
        }
        sleep(7);
        $retries++;
        // Rewind by one full batch: the loop header adds 100 again, so the next
        // iteration repeats exactly the ids that just failed. (Stepping back by one
        // would resume 99 ids further on and shift every following batch.)
        $i -= 100;
        // start over with a fresh connection, same key
        $tmhOAuth = new tmhOAuth($oauth_config);
    }
}