/**
 * Sort a bucket of captured tweets into the active query bins and flush them to the database.
 *
 * Every tweet is tested against every active bin. How a bin matches depends on the
 * capture role (the CAPTURE constant):
 *  - "track":      geotrack bins match on GPS coordinates or on the tweet's place polygon,
 *                  all other bins match on keywords (case-insensitive, literal text).
 *  - "follow":     the bin matches when the tweeting user's id is listed in the bin.
 *  - "onepercent": every bin matches.
 * A matching tweet is pushed onto the global $tweetQueue for that bin; the queue is
 * inserted into the database once the whole bucket has been processed.
 *
 * Keyword queries are matched literally: regex metacharacters inside a query (e.g. "c++",
 * "a/b", "(test)") are escaped, so they can neither break the pattern nor act as wildcards.
 *
 * @param array $capturebucket list of decoded tweets (each an associative array)
 * @return bool always TRUE
 */
function processtweets($capturebucket) {

    global $tweetQueue;

    $querybins = getActiveBins();

    // cache bin types
    $bintypes = array();
    foreach ($querybins as $binname => $queries) {
        $bintypes[$binname] = getBinType($binname);
    }

    // running through every single tweet
    foreach ($capturebucket as $data) {

        if (!is_array($data) || !array_key_exists('entities', $data)) {

            // unexpected/irregular tweet data
            if (is_array($data) && array_key_exists('delete', $data)) {
                // a tweet has been deleted. @todo: process
                continue;
            }

            // this can get very verbose when repeated?
            logit(CAPTURE . ".error.log", "irregular tweet data received.");
            continue;
        }

        // a missing text is treated as an empty text, so keyword queries simply do not match
        $text = isset($data["text"]) ? $data["text"] : '';

        // we run through every bin to check whether the received tweets fit
        foreach ($querybins as $binname => $queries) {

            $geobin = isset($bintypes[$binname]) && $bintypes[$binname] == 'geotrack';

            if ($geobin && (!isset($data['user']['geo_enabled']) || $data['user']['geo_enabled'] !== true)) {
                // in geobins, process only geo tweets
                continue;
            }

            $found = false;

            if (CAPTURE == "track") {

                // we check for every query in the bin if they fit
                foreach ($queries as $query => $track) {

                    if ($geobin) {

                        $boxes = getGeoBoxes($track);

                        // look for geolocation matches

                        /*
                         * Some notes on geolocation tracking
                         *
                         * Geolocation tracking is done inside the capture role: track
                         * Geolocation query bins have a special type: geotrack
                         * Geolocation phrases have a specific format:
                         *             = these phrases are a chain of geoboxes defined as 4 comma separated values (sw long, sw lat, ne long, ne lat)
                         *             = multiple world areas can thus be defined per bin
                         *
                         * Fetching (from Twitter)
                         *
                         * 1) Twitter will give us all the tweets which have explicit GPS coordinates inside one of our queried areas.
                         * 2) Additionally Twitter gives us those tweets with a user 'place' definition. A place (i.e. Paris) is itself a (set of) gps polygons.
                         *    Twitter returns the tweets if one of these place polygons covers the same area as our geo boxes.
                         *
                         * And matching (by us)
                         *
                         * 1) These tweets will be put in the bin if the coordinate pair (longitude, latitude) fits in any one of the defined geoboxes in the bin.
                         * 2) These tweets will be put in the bin if the geobox is _not_ completely subsumed by the place (example: the place is France and the geobox is Paris),
                         *    but the geobox does overlap the place polygon or the geobox subsumes the place polygon.
                         */

                        if (isset($data["geo"]) && $data["geo"] != null) {

                            // Twitter's "geo" field lists latitude first, longitude second
                            $tweet_lat = $data["geo"]["coordinates"][0];
                            $tweet_lng = $data["geo"]["coordinates"][1];

                            // does the tweet geo data fit in one of the boxes?
                            foreach ($boxes as $box) {
                                if (coordinatesInsideBoundingBox($tweet_lng, $tweet_lat, $box['sw_lng'], $box['sw_lat'], $box['ne_lng'], $box['ne_lat'])) {
                                    $found = true;
                                    break;
                                }
                            }
                        } elseif (isset($data['place']) && is_array($data['place'])
                                && isset($data['place']['bounding_box']['coordinates'])
                                && is_array($data['place']['bounding_box']['coordinates'])) {

                            // this is a gps tracking query, but the tweet has no gps geo data
                            // Twitter may have matched this tweet based on the user-defined location data

                            // Make a geoPHP object of the polygon(s) defining the place, by using a WKT (well-known text) string
                            $rings = array();
                            foreach ($data['place']['bounding_box']['coordinates'] as $pol) {
                                $points = array();
                                $first_lng = 0;
                                $first_lat = 0;
                                foreach ($pol as $coords) {
                                    $point_lng = $coords[0];
                                    $point_lat = $coords[1];
                                    if (count($points) == 0) {
                                        $first_lng = $point_lng;
                                        $first_lat = $point_lat;
                                    }
                                    $points[] = $point_lng . ' ' . $point_lat;
                                }
                                // a WKT ring has to end where it started
                                $points[] = $first_lng . ' ' . $first_lat;
                                $rings[] = '(' . implode(', ', $points) . ')';
                            }
                            $wkt = 'POLYGON(' . implode(', ', $rings) . ')';
                            $place = geoPHP::load($wkt, 'wkt');

                            // iterate over geoboxes in our track
                            // place should not spatially contain our box, but it should overlap with it
                            foreach ($boxes as $box) {
                                // 'POLYGON((x1 y1, x1 y2, x2 y2, x2 y1, x1 y1))'
                                $boxwkt = 'POLYGON((' . $box['sw_lng'] . ' ' . $box['sw_lat'] . ', '
                                        . $box['sw_lng'] . ' ' . $box['ne_lat'] . ', '
                                        . $box['ne_lng'] . ' ' . $box['ne_lat'] . ', '
                                        . $box['ne_lng'] . ' ' . $box['sw_lat'] . ', '
                                        . $box['sw_lng'] . ' ' . $box['sw_lat'] . '))';
                                $versus = geoPHP::load($boxwkt, 'wkt');
                                $contains = $place->contains($versus);
                                $boxcontains = $versus->contains($place);
                                $overlaps = $place->overlaps($versus);
                                if (!$contains && ($boxcontains || $overlaps)) {
                                    $found = true;
                                    break;
                                }
                            }
                        }

                        if ($found) {
                            break;
                        }
                    } else {

                        // look for keyword matches

                        // array keys that look numeric arrive as integers; normalise to a string
                        $query = (string) $query;

                        // check for queries with more than one word, but go around quoted queries
                        if (strpos($query, ' ') !== false && strpos($query, "'") === false) {
                            // only pass if all words are found
                            $pass = true;
                            foreach (explode(" ", $query) as $word) {
                                // preg_quote: the keyword is literal text, not a regular expression
                                if (!preg_match("/" . preg_quote($word, "/") . "/i", $text)) {
                                    $pass = false;
                                    break;
                                }
                            }
                        } else {
                            // treat quoted queries as single words
                            $phrase = str_replace("'", "", $query);
                            $pass = preg_match("/" . preg_quote($phrase, "/") . "/i", $text) === 1;
                        }

                        // at the first fitting query, we break
                        if ($pass) {
                            $found = true;
                            break;
                        }
                    }
                }
            } elseif (CAPTURE == "follow") {

                // the bin fits if it lists the id of the tweeting user
                // (loose comparison on purpose: ids may be stored as strings or integers)
                $found = isset($data["user"]["id"]) && in_array($data["user"]["id"], $queries);
            } elseif (CAPTURE == "onepercent") {

                // always match in onepercent
                $found = true;
            }

            // if the tweet does not fit in the current bin, go to the next bin
            if (!$found) {
                continue;
            }

            $tweet = new Tweet();
            $tweet->fromJSON($data);
            $tweetQueue->push($tweet, $binname);
        }
    }

    $tweetQueue->insertDB();
    return TRUE;
}
// Scan the export file line by line and collect the names of all query bins it mentions.
// fgets() is compared against false explicitly: a bare truthiness test would stop the scan
// early on a line that evaluates to false (e.g. a final line "0" without a newline).
while (($line = fgets($fh)) !== false) {
    // table definitions of the form `<bin>_tweets`
    if (preg_match("/^-- Table structure for table `(.*)_tweets`/", $line, $matches)) {
        array_push($queryBins, $matches[1]);
    }
    // bin registrations in tcat_query_bins
    if (preg_match("/^INSERT INTO tcat_query_bins \\( querybin, `type`, active, access \\) values \\( '(.*?)',/", $line, $matches)) {
        array_push($queryBins, $matches[1]);
    }
}
fclose($fh);

$queryBins = array_unique($queryBins);

if (count($queryBins) == 0) {
    die("I did not recognize '{$file}' as a TCAT export.\n");
}

// Refuse to import when any of the bins is already present (getBinType() returns false for unknown bins).
$binsExist = false;
foreach ($queryBins as $bin) {
    if (getBinType($bin) === false) {
        print "Query bin: {$bin}\n";
    } else {
        print "Query bin already exists: {$bin}\n";
        $binsExist = true;
    }
}
if ($binsExist) {
    print "Error: query bin(s) already exist. Will not overwrite.\n";
    die("You may want to rename the existing query bin through the TCAT administration panel.\n");
}

print "Now importing...\n";

/* Convince system commands to use UTF-8 encoding */

setlocale(LC_ALL, 'en_US.UTF-8');
putenv('LC_ALL=en_US.UTF-8');
putenv('LANG=en_US.UTF-8');
// The export relies on the gzip binary; bail out early when it cannot be located.
$bin_gzip = get_executable("gzip");
if ($bin_gzip === null) {
    die("The gzip binary appears to be missing. Please lookup this utility in your software repository.\n");
}

/* Convince system commands to use UTF-8 encoding */

setlocale(LC_ALL, 'en_US.UTF-8');
putenv('LC_ALL=en_US.UTF-8');
putenv('LANG=en_US.UTF-8');
putenv('LANGUAGE=en_US.UTF-8');
putenv('MYSQL_PWD=' . $dbpass);     /* this avoids having to put the password on the command-line */

// First argument: name of the query bin.
// Test for presence before reading, so a missing argument does not raise an undefined-index warning.
if (!isset($argv[1])) {
    die("Please specify a bin name.\n");
}
$bin = $argv[1];

$bintype = getBinType($bin);
if ($bintype === false) {
    die("The query bin '{$bin}' could not be found!\n");
}

// Second argument: what to export. A missing argument is handled like an unrecognized one.
$exportOption = isset($argv[2]) ? $argv[2] : null;
switch ($exportOption) {
    case "structure":
        $export = 'queries';
        break;
    case "all":
        $export = 'all';
        break;
    default:
        die("Unrecognized export option.\n");
}

$binforfile = escapeshellcmd($bin);
/**
 * Narrow a recorded capture gap down to the largest stretch without tweets inside it.
 *
 * All tweets created strictly between $start and $end, in every bin whose role equals
 * $type (geotrack bins count as 'track'), are collected into a temporary table. The
 * intervals between consecutive tweet timestamps are then measured; intervals shorter
 * than the idle threshold (IDLETIME_FOLLOW for role 'follow', IDLETIME otherwise, both
 * defaulting to 600 seconds when undefined) are not regarded as gaps. The largest
 * remaining interval replaces the original gap boundaries.
 *
 * Intervals are computed in PHP from the datetime strings, read as plain wall-clock
 * values (no timezone or DST shifts), which mirrors how MySQL subtracts DATETIME values
 * but has no upper limit on the interval length.
 *
 * @param string $type  capture role under consideration (e.g. 'track', 'follow')
 * @param string $start start of the recorded gap (datetime string as used in SQL)
 * @param string $end   end of the recorded gap (datetime string as used in SQL)
 * @return array|null   array('shrunk_start' => ..., 'shrunk_end' => ...); the original
 *                      boundaries are returned when no interval reaches the threshold.
 *                      null when the largest interval is exactly one second.
 */
function reduce_gap_size($type, $start, $end) {
    global $all_bins;

    $dbh = pdo_connect();

    $shrunk_start = $start;
    $shrunk_end = $end;

    $rec = $dbh->prepare("create temporary table gap_searcher ( measurement datetime primary key )");
    $rec->execute();

    foreach ($all_bins as $bin) {

        // Filter to only consider bins with the tracking role under consideration
        $bintype = getBinType($bin, $dbh);
        if ($bintype == 'geotrack') {
            $bintype = 'track';
        }
        if ($bintype != $type) {
            continue;
        }

        // Background on created_at and timezones: https://github.com/digitalmethodsinitiative/dmi-tcat/issues/197
        // The table name cannot be a bound parameter, so it is quoted as an identifier;
        // the gap boundaries are bound instead of being interpolated into the statement.
        $table = '`' . str_replace('`', '``', $bin . '_tweets') . '`';
        $sql = "insert ignore into gap_searcher select created_at from {$table}
                         where created_at > ? and created_at < ?";
        $rec = $dbh->prepare($sql);
        $rec->execute(array($start, $end));
    }

    if (!defined('IDLETIME')) {
        define('IDLETIME', 600);
    }
    if (!defined('IDLETIME_FOLLOW')) {
        define('IDLETIME_FOLLOW', IDLETIME);
    }
    // As per controller behaviour, anything shorter than the idle time is not a gap.
    $threshold = ($type == 'follow') ? IDLETIME_FOLLOW : IDLETIME;

    $rec = $dbh->prepare("select measurement from gap_searcher order by measurement asc");
    $rec->execute();

    $utc = new DateTimeZone('UTC');
    $date_previous = null;
    $stamp_previous = null;
    $biggest_gap = -1;
    $biggest_gap_start = $biggest_gap_end = null;

    while ($row = $rec->fetch(PDO::FETCH_ASSOC)) {
        $date = $row['measurement'];

        // A fixed zone keeps the subtraction free of DST jumps; unparsable values are skipped.
        $parsed = DateTime::createFromFormat('Y-m-d H:i:s', $date, $utc);
        $stamp = ($parsed === false) ? null : $parsed->getTimestamp();

        if (!is_null($date_previous) && !is_null($stamp) && !is_null($stamp_previous)) {
            $gap_in_seconds = $stamp - $stamp_previous;
            if ($gap_in_seconds >= $threshold && $gap_in_seconds > $biggest_gap) {
                $biggest_gap = $gap_in_seconds;
                $biggest_gap_start = $date_previous;
                $biggest_gap_end = $date;
            }
        }

        // Always advance, also after a sub-threshold interval: otherwise consecutive small
        // intervals would pile up against a stale reference and be reported as one gap.
        $date_previous = $date;
        $stamp_previous = $stamp;
    }

    // Clean up before any return, so the temporary table never outlives this call.
    $rec = $dbh->prepare("drop table gap_searcher");
    $rec->execute();
    $dbh = null;

    if ($biggest_gap == 1) {
        // This is a situation where there doesn't appear to be a real data gap
        // NOTE(review): only reachable when the idle threshold is 1 second or less - confirm the intended condition.
        return null;
    }

    if ($biggest_gap !== -1) {
        $shrunk_start = $biggest_gap_start;
        $shrunk_end = $biggest_gap_end;
    }

    return array('shrunk_start' => $shrunk_start, 'shrunk_end' => $shrunk_end);
}