示例#1
0
/**
 * Sort a batch of captured tweets into the active query bins and queue them.
 *
 * Every tweet in the bucket is checked against every active bin, according to
 * the capture role held in the CAPTURE constant:
 *   - "track":      keyword bins match on the tweet text; bins of type
 *                   'geotrack' match on the tweet coordinates or place polygon
 *   - "follow":     a bin matches when the author's user id is among its queries
 *   - "onepercent": every tweet matches every bin
 * Each match is pushed onto the global $tweetQueue; after the whole bucket has
 * been handled, $tweetQueue->insertDB() is called once.
 *
 * @param array $capturebucket list of decoded tweets (associative arrays)
 * @return bool always TRUE
 */
function processtweets($capturebucket)
{
    global $tweetQueue;
    $querybins = getActiveBins();
    // cache bin types
    $bintypes = array();
    foreach ($querybins as $binname => $queries) {
        $bintypes[$binname] = getBinType($binname);
    }
    // running through every single tweet
    foreach ($capturebucket as $data) {
        if (!array_key_exists('entities', $data)) {
            // unexpected/irregular tweet data
            if (array_key_exists('delete', $data)) {
                // a tweet has been deleted. @todo: process
                continue;
            }
            // this can get very verbose when repeated?
            logit(CAPTURE . ".error.log", "irregular tweet data received.");
            continue;
        }
        // Property of the tweet, not of the bin, so resolve it once per tweet.
        // isset() also covers payloads without a 'user' element.
        $geo_enabled = isset($data['user']['geo_enabled']) && $data['user']['geo_enabled'] === true;
        // we run through every bin to check whether the received tweet fits
        foreach ($querybins as $binname => $queries) {
            $geobin = isset($bintypes[$binname]) && $bintypes[$binname] == 'geotrack';
            if ($geobin && !$geo_enabled) {
                // in geobins, process only geo tweets
                continue;
            }
            $found = false;
            if (CAPTURE == "track") {
                // we check for every query in the bin if they fit
                foreach ($queries as $query => $track) {
                    if ($geobin) {
                        $boxes = getGeoBoxes($track);
                        // look for geolocation matches
                        /*
                         * Some notes on geolocation tracking
                         *
                         * Geolocation tracking is done inside the capture role: track
                         * Geolocation query bins have a special type: geotrack
                         * Geolocation phrases have a specific format:
                         *             = these phrases are a chain of geoboxes defined as 4 comma separated values (sw long, sw lat, ne long, ne lat)
                         *             = multiple world areas can thus be defined per bin
                         *
                         * Fetching (from Twitter)
                         *
                         * 1) Twitter will give us all the tweets which have explicit GPS coordinates inside one of our queried areas.
                         * 2) Additionally Twitter gives us those tweets with a user 'place' definition. A place (i.e. Paris) is itself a (set of) gps polygons.
                         *    Twitter returns the tweets if one of these place polygons covers the same area as our geo boxes.
                         *
                         * And matching (by us)
                         *
                         * 1) These tweets will be put in the bin if the coordinate pair (longitude, latitude) fits in any one of the defined geoboxes in the bin.
                         * 2) These tweets will be put in the bin if the geobox is _not_ completely subsumed by the place (example: the place is France and the geobox is Paris), but the geobox does overlap the place polygon or the geobox subsumes the place polygon.
                         *
                         */
                        if (isset($data["geo"]) && $data["geo"] != null) {
                            // the 'geo' element lists latitude first, longitude second
                            $tweet_lat = $data["geo"]["coordinates"][0];
                            $tweet_lng = $data["geo"]["coordinates"][1];
                            // does the tweet geo data fit in one of the boxes?
                            foreach ($boxes as $box) {
                                if (coordinatesInsideBoundingBox($tweet_lng, $tweet_lat, $box['sw_lng'], $box['sw_lat'], $box['ne_lng'], $box['ne_lat'])) {
                                    $found = true;
                                    break;
                                }
                            }
                        } elseif (isset($data['place']['bounding_box']['coordinates']) && is_array($data['place']['bounding_box']['coordinates'])) {
                            // this is a gps tracking query, but the tweet has no gps geo data
                            // Twitter may have matched this tweet based on the user-defined location data
                            // Make a geoPHP object of the polygon(s) defining the place, by using a WKT (well-known text) string
                            $wkt = 'POLYGON(';
                            $polfirst = true;
                            foreach ($data['place']['bounding_box']['coordinates'] as $pol) {
                                if ($polfirst) {
                                    $polfirst = false;
                                } else {
                                    $wkt .= ', ';
                                }
                                $wkt .= '(';
                                $first = true;
                                $first_lng = 0;
                                $first_lat = 0;
                                foreach ($pol as $coords) {
                                    $point_lng = $coords[0];
                                    $point_lat = $coords[1];
                                    if ($first) {
                                        $first = false;
                                        $first_lng = $point_lng;
                                        $first_lat = $point_lat;
                                    } else {
                                        $wkt .= ', ';
                                    }
                                    $wkt .= $point_lng . ' ' . $point_lat;
                                }
                                // end where we started
                                $wkt .= ', ' . $first_lng . ' ' . $first_lat;
                                $wkt .= ')';
                            }
                            $wkt .= ')';
                            $place = geoPHP::load($wkt, 'wkt');
                            // iterate over geoboxes in our track
                            // place should not spatially contain our box, but it should overlap with it
                            foreach ($boxes as $box) {
                                // 'POLYGON((x1 y1, x1 y2, x2 y2, x2 y1, x1 y1))'
                                $boxwkt = 'POLYGON((' . $box['sw_lng'] . ' ' . $box['sw_lat'] . ', ' . $box['sw_lng'] . ' ' . $box['ne_lat'] . ', ' . $box['ne_lng'] . ' ' . $box['ne_lat'] . ', ' . $box['ne_lng'] . ' ' . $box['sw_lat'] . ', ' . $box['sw_lng'] . ' ' . $box['sw_lat'] . '))';
                                $versus = geoPHP::load($boxwkt, 'wkt');
                                $contains = $place->contains($versus);
                                $boxcontains = $versus->contains($place);
                                $overlaps = $place->overlaps($versus);
                                if (!$contains && ($boxcontains || $overlaps)) {
                                    $found = true;
                                    break;
                                }
                            }
                        }
                        if ($found) {
                            break;
                        }
                    } else {
                        // look for keyword matches
                        // Keywords are literal text, so they are escaped with preg_quote()
                        // before being embedded in a pattern; otherwise characters such as
                        // '/', '(', '+' or '?' break the pattern or change what it matches.
                        $pass = false;
                        // check for queries with more than one word, but go around quoted queries
                        if (preg_match("/ /", $query) && !preg_match("/'/", $query)) {
                            // only if all words are found
                            $pass = true;
                            foreach (explode(" ", $query) as $word) {
                                if (!preg_match("/" . preg_quote($word, "/") . "/i", $data["text"])) {
                                    $pass = false;
                                    break;
                                }
                            }
                        } else {
                            // treat quoted queries as single words
                            $phrase = preg_replace("/'/", "", $query);
                            if (preg_match("/" . preg_quote($phrase, "/") . "/i", $data["text"])) {
                                $pass = true;
                            }
                        }
                        // at the first fitting query, we break
                        if ($pass) {
                            $found = true;
                            break;
                        }
                    }
                }
            } elseif (CAPTURE == "follow") {
                // the bin fits when the author is one of the followed users
                // (loose comparison on purpose: ids may be stored as strings)
                $found = in_array($data["user"]["id"], $queries);
            } elseif (CAPTURE == "onepercent") {
                // always match in onepercent
                $found = true;
            }
            // if the tweet does not fit in the current bin, go to the next bin
            if ($found == false) {
                continue;
            }
            $tweet = new Tweet();
            $tweet->fromJSON($data);
            $tweetQueue->push($tweet, $binname);
        }
    }
    $tweetQueue->insertDB();
    return TRUE;
}
示例#2
0
// Scan the export file for the query bins it contains. A bin is recognised
// either by the header of its `<bin>_tweets` table dump or by its INSERT
// statement into tcat_query_bins.
// Compare against false explicitly: a bare truthiness test would stop reading
// at a line that evaluates to false (e.g. a final "0" without a newline).
while (($line = fgets($fh)) !== false) {
    if (preg_match("/^-- Table structure for table `(.*)_tweets`/", $line, $matches)) {
        $queryBins[] = $matches[1];
    }
    if (preg_match("/^INSERT INTO tcat_query_bins \\( querybin, `type`, active, access \\) values \\( '(.*?)',/", $line, $matches)) {
        $queryBins[] = $matches[1];
    }
}
fclose($fh);
// The same bin can be found through both patterns; keep each name once.
$queryBins = array_unique($queryBins);
if (count($queryBins) === 0) {
    die("I did not recognize '{$file}' as a TCAT export.\n");
}
// Refuse to import when any of the bins is already known: getBinType()
// returning false is taken to mean the bin does not exist yet.
$binsExist = false;
foreach ($queryBins as $bin) {
    if (getBinType($bin) === false) {
        print "Query bin: {$bin}\n";
    } else {
        print "Query bin already exists: {$bin}\n";
        $binsExist = true;
    }
}
if ($binsExist) {
    print "Error: query bin(s) already exist. Will not overwrite.\n";
    die("You may want to rename the existing query bin through the TCAT administration panel.\n");
}
print "Now importing...\n";
/* Convince system commands to use UTF-8 encoding */
setlocale(LC_ALL, 'en_US.UTF-8');
putenv('LC_ALL=en_US.UTF-8');
putenv('LANG=en_US.UTF-8');
示例#3
0
// The export needs the gzip binary; stop early when it cannot be located.
$bin_gzip = get_executable("gzip");
if ($bin_gzip === null) {
    die("The gzip binary appears to be missing. Please lookup this utility in your software repository.\n");
}
/* Convince system commands to use UTF-8 encoding */
setlocale(LC_ALL, 'en_US.UTF-8');
putenv('LC_ALL=en_US.UTF-8');
putenv('LANG=en_US.UTF-8');
putenv('LANGUAGE=en_US.UTF-8');
/* Handing the password over through the environment avoids having to put it on the command-line */
putenv('MYSQL_PWD=' . $dbpass);
// First argument: the name of the query bin to export.
// Test $argv itself before reading it, so a missing argument produces the
// usage message instead of an undefined-offset warning.
if (!isset($argv[1])) {
    die("Please specify a bin name.\n");
}
$bin = $argv[1];
$bintype = getBinType($bin);
if ($bintype === false) {
    die("The query bin '{$bin}' could not be found!\n");
}
// Second argument: what to export. A missing argument is treated like an
// unrecognized one.
switch (isset($argv[2]) ? $argv[2] : null) {
    case "structure":
        $export = 'queries';
        break;
    case "all":
        $export = 'all';
        break;
    default:
        die("Unrecognized export option.\n");
}
// NOTE(review): escapeshellcmd() escapes shell metacharacters but does not quote
// the value; confirm that is sufficient for how $binforfile is used further on.
$binforfile = escapeshellcmd($bin);
示例#4
0
/**
 * Narrow a capture gap down to the longest silence found in the stored tweets.
 *
 * The created_at values between $start and $end (both exclusive) of every bin
 * whose type equals $type ('geotrack' bins count as 'track') are collected in a
 * temporary table. Consecutive timestamps are then compared, and the longest
 * interval that is at least the idle time (IDLETIME_FOLLOW for 'follow',
 * IDLETIME otherwise; both default to 600 seconds when undefined) becomes the
 * shrunken gap. When no such interval exists, $start and $end are returned
 * unchanged.
 *
 * @param string $type  capture role to consider
 * @param string $start lower bound of the gap, as a datetime string
 * @param string $end   upper bound of the gap, as a datetime string
 * @return array|null array('shrunk_start' => ..., 'shrunk_end' => ...), or null
 *                    when the longest interval is exactly one second
 */
function reduce_gap_size($type, $start, $end)
{
    global $all_bins;
    $dbh = pdo_connect();
    $shrunk_start = $start;
    $shrunk_end = $end;
    // The primary key together with "insert ignore" below de-duplicates timestamps across bins.
    $rec = $dbh->prepare("create temporary table gap_searcher ( measurement datetime primary key )");
    $rec->execute();
    foreach ($all_bins as $bin) {
        // Filter to only consider bins with the tracking role under consideration
        $bintype = getBinType($bin, $dbh);
        if ($bintype == 'geotrack') {
            $bintype = 'track';
        }
        if ($bintype != $type) {
            continue;
        }
        // See https://github.com/digitalmethodsinitiative/dmi-tcat/issues/197 for the problems with created_at and timezones.
        // We compare created_at with the dates we have in the gap table, which is the date specified by config.php
        // The table name cannot be a bound parameter; the date bounds can.
        $sql = "insert ignore into gap_searcher select created_at from {$bin}" . "_tweets
                       where created_at > ? and created_at < ?";
        $rec = $dbh->prepare($sql);
        $rec->execute(array($start, $end));
    }
    $rec = $dbh->prepare("select measurement from gap_searcher order by measurement asc");
    $rec->execute();
    // TIMESTAMPDIFF returns whole seconds directly. The previous TIMEDIFF result
    // was parsed as HH:MM:SS with a two-digit hour, which silently discarded
    // every gap of 100 hours or more, and TIMEDIFF itself is capped at 838:59:59.
    $gap_query = $dbh->prepare("select timestampdiff(SECOND, ?, ?) as gap_size");
    $date_previous = null;
    $biggest_gap = -1;
    $biggest_gap_start = $biggest_gap_end = null;
    $threshold = null;
    while ($row = $rec->fetch(PDO::FETCH_ASSOC)) {
        $date = $row['measurement'];
        if (is_null($date_previous)) {
            $date_previous = $date;
            continue;
        }
        $gap_query->execute(array($date_previous, $date));
        $row2 = $gap_query->fetch(PDO::FETCH_ASSOC);
        $gap_query->closeCursor();
        if ($row2 !== false && isset($row2['gap_size'])) {
            $gap_in_seconds = intval($row2['gap_size']);
            if ($threshold === null) {
                // Resolved on first use, so the fallback constants are only
                // defined once a comparison actually takes place.
                if (!defined('IDLETIME')) {
                    define('IDLETIME', 600);
                }
                if (!defined('IDLETIME_FOLLOW')) {
                    define('IDLETIME_FOLLOW', IDLETIME);
                }
                $threshold = ($type == 'follow') ? IDLETIME_FOLLOW : IDLETIME;
            }
            // As per controller behaviour, an interval shorter than the idle time is not a gap.
            if ($gap_in_seconds >= $threshold && $gap_in_seconds > $biggest_gap) {
                $biggest_gap = $gap_in_seconds;
                $biggest_gap_start = $date_previous;
                $biggest_gap_end = $date;
            }
        }
        // Always advance, also after a short interval; otherwise successive short
        // intervals add up and get reported as one gap that contains tweets.
        $date_previous = $date;
    }
    if ($biggest_gap !== -1) {
        $shrunk_start = $biggest_gap_start;
        $shrunk_end = $biggest_gap_end;
    }
    // Clean up before any return, so the temporary table never outlives the call.
    $rec = $dbh->prepare("drop table gap_searcher");
    $rec->execute();
    $dbh = null;
    // NOTE(review): this compares against 1, not against the -1 sentinel; it can
    // only trigger when the idle time is one second or less. Confirm the intent.
    if ($biggest_gap == 1) {
        // This is a situation where there doesn't appear to be a real data gap
        return null;
    }
    return array('shrunk_start' => $shrunk_start, 'shrunk_end' => $shrunk_end);
}