/**
 * Normalize a raw column value into a sankey node name.
 *
 * Type-specific prefixes/suffixes are added so that values coming from
 * different column types cannot collide on the same node name (D3 sankey
 * uses node names as ids), then markup-like fragments are stripped,
 * whitespace is collapsed, and the result is slash-escaped.
 *
 * @param string $value raw value from the result row
 * @param string $type  column type (e.g. "hashtag", "from_user_utcoffset")
 * @return string cleaned node name ("empty" if nothing is left)
 */
function _getFlowCleanValue($value, $type) {
    if ($type == "from_user_utcoffset") {
        $value .= "s";
    }
    if ($type == "hashtag") {
        $value = "#" . $value;
    }
    $value = preg_replace("/<.+>/U", "", $value);     // strip <...> markup fragments
    $value = preg_replace("/[ \\s\t]+/", " ", $value); // collapse runs of whitespace
    $value = trim($value);
    if ($value == "") {
        $value = "empty";
    }
    return addslashes($value);
}

/**
 * Query one "flow" (a pair of adjacent sankey columns) for the current
 * subset and merge it into the global $network structure.
 *
 * @param string $col1_type     tweet field, "hashtag" or "domain" for the left column
 * @param int    $col1_cutoff   keep only the top N left-column values (0 = keep all)
 * @param string $col2_type     tweet field, "hashtag" or "domain" for the right column
 * @param int    $col2_cutoff   keep only the top N right-column values (0 = keep all)
 * @param string $discard_other "on" to drop rows outside the toplists instead of
 *                              fusing them into an "other <type>" node
 * @param int    $runcounter    0 for the first query; on later runs the left column
 *                              is restricted to the previous run's right-column toplist
 *                              so three-column sankeys line up (via global $oldtoplists)
 * @return void  results accumulate in the globals $network and $oldtoplists
 */
function getFlow($col1_type, $col1_cutoff, $col2_type, $col2_cutoff, $discard_other, $runcounter) {
    global $esc, $network, $oldtoplists, $collation;

    // Default query: both columns are plain fields on the tweets table.
    $sql = "SELECT LOWER(" . $col1_type . ") AS col1, LOWER(t." . $col2_type . ") AS col2 FROM ";
    $sql .= $esc['mysql']['dataset'] . "_tweets t ";
    $sql .= sqlSubset();
    // Hashtags and domains live in side tables and need an extra join;
    // each case rebuilds $sql from scratch.
    if ($col1_type == "hashtag") {
        $sql = "SELECT LOWER(h.text COLLATE {$collation}) AS col1, LOWER(t." . $col2_type . ") AS col2 FROM ";
        $sql .= $esc['mysql']['dataset'] . "_tweets t, " . $esc['mysql']['dataset'] . "_hashtags h ";
        $where = "t.id = h.tweet_id AND ";
        $sql .= sqlSubset($where);
    }
    if ($col2_type == "hashtag") {
        $sql = "SELECT LOWER(t." . $col1_type . ") AS col1,LOWER(h.text COLLATE {$collation}) AS col2 FROM ";
        $sql .= $esc['mysql']['dataset'] . "_tweets t, " . $esc['mysql']['dataset'] . "_hashtags h ";
        $where = "t.id = h.tweet_id AND ";
        $sql .= sqlSubset($where);
    }
    if ($col1_type == "domain") {
        $sql = "SELECT LOWER(u.domain) AS col1, LOWER(t." . $col2_type . ") AS col2 FROM ";
        $sql .= $esc['mysql']['dataset'] . "_tweets t, " . $esc['mysql']['dataset'] . "_urls u ";
        $where = "t.id = u.tweet_id AND ";
        $sql .= sqlSubset($where);
    }
    if ($col2_type == "domain") {
        $sql = "SELECT LOWER(t." . $col1_type . ") AS col1,LOWER(u.domain) AS col2 FROM ";
        $sql .= $esc['mysql']['dataset'] . "_tweets t, " . $esc['mysql']['dataset'] . "_urls u ";
        $where = "t.id = u.tweet_id AND ";
        $sql .= sqlSubset($where);
    }
    // voting for keeping the SQL output, nice way to trace results for expert users
    echo "sql query " . ($runcounter + 1) . ": " . $sql . "<br />";
    $sqlresults = mysql_query($sql);

    // run through the data once to create item counts for cutting and fusing
    $data = array();
    $toplists = array();
    $toplists["col1"] = array();
    $toplists["col2"] = array();
    while ($res = mysql_fetch_assoc($sqlresults)) {
        $col1 = _getFlowCleanValue($res["col1"], $col1_type);
        $res["col1"] = $col1;
        $col2 = _getFlowCleanValue($res["col2"], $col2_type);
        $res["col2"] = $col2;
        // counting results
        if (!isset($toplists["col1"][$col1])) {
            $toplists["col1"][$col1] = 0;
        }
        $toplists["col1"][$col1]++;
        if (!isset($toplists["col2"][$col2])) {
            $toplists["col2"][$col2] = 0;
        }
        $toplists["col2"][$col2]++;
        $data[] = $res;
    }

    // cut off elements for the sankey column; preserve_keys=true keeps
    // purely-numeric node names from being renumbered to 0..n by array_slice
    foreach ($toplists as $key => $list) {
        arsort($toplists[$key]);
    }
    if ($col2_cutoff != 0) {
        $toplists["col2"] = array_slice($toplists["col2"], 0, $col2_cutoff, true);
    }
    // if there are three columns, it's necessary to have the same toplist for
    // the middle column because the two queries may have different results;
    // the first query specifies the toplist for the second column
    if ($runcounter == 0) {
        if ($col1_cutoff != 0) {
            $toplists["col1"] = array_slice($toplists["col1"], 0, $col1_cutoff, true);
        }
        $oldtoplists = $toplists;
    } else {
        // keep only col1 entries that appeared in the previous run's col2 toplist
        // (direct isset() lookup replaces the former O(n^2) in_array() over array_keys())
        $tmplist = array();
        foreach ($toplists["col1"] as $topkey => $topvalue) {
            if (isset($oldtoplists["col2"][$topkey])) {
                $tmplist[$topkey] = $topvalue;
            }
        }
        $toplists["col1"] = $tmplist;
    }

    // create the sankey network (shared across successive getFlow() calls)
    if (!isset($network)) {
        $network = array();
        $network["nodes"] = array();
        $network["links"] = array();
    }
    // name -> position index over the existing node list, so membership tests
    // are O(1) and compare exact strings; the former in_array() without strict
    // treated distinct numeric-string names ("123" vs "0123") as equal and
    // silently dropped nodes that links still referenced
    $nodeindex = array_flip($network["nodes"]);
    foreach ($data as $res) {
        $col1 = $res['col1'];
        $col2 = $res['col2'];
        if (!isset($toplists["col1"][$col1])) {
            if ($discard_other == "on") {
                continue;
            }
            $col1 = "other " . $col1_type;
        }
        if (!isset($toplists["col2"][$col2])) {
            if ($discard_other == "on") {
                continue;
            }
            $col2 = "other " . $col2_type;
        }
        if (!isset($nodeindex[$col1])) {
            $nodeindex[$col1] = count($network["nodes"]);
            $network["nodes"][] = $col1;
        }
        if (!isset($nodeindex[$col2])) {
            $nodeindex[$col2] = count($network["nodes"]);
            $network["nodes"][] = $col2;
        }
        $edge = $col1 . "_XXX_" . $col2;
        if (!isset($network["links"][$edge])) {
            $network["links"][$edge] = 0;
        }
        $network["links"][$edge]++;
    }
}
</script> </head> <body> <h1>TCAT :: Export URLs</h1> <?php validate_all_variables(); $filename = get_filename_for_export('urlsExport'); $csv = new CSV($filename, $outputformat); $csv->writeheader(array('tweet_id', 'url', 'url_expanded', 'url_followed')); $sql = "SELECT t.id as id, u.url as url, u.url_expanded as url_expanded, u.url_followed as url_followed FROM " . $esc['mysql']['dataset'] . "_tweets t, " . $esc['mysql']['dataset'] . "_urls u "; $sql .= sqlSubset(); $sql .= " AND u.tweet_id = t.id ORDER BY id"; $sqlresults = mysql_unbuffered_query($sql); $out = ""; if ($sqlresults) { while ($data = mysql_fetch_assoc($sqlresults)) { $csv->newrow(); $csv->addfield($data['id'], 'integer'); $csv->addfield($data['url'], 'string'); if (isset($data['url_followed']) && strlen($data['url_followed']) > 1) { $csv->addfield($data['url'], 'string'); } else { $csv->addfield('', 'string'); } if (isset($data['url_expanded']) && strlen($data['url_expanded']) > 1) { $csv->addfield($data['url_expanded'], 'string');
<h1>TCAT :: Mention graph</h1> <?php validate_all_variables(); $collation = current_collation(); $users = array(); $usersinv = array(); $edges = array(); $cur = 0; $numresults = 500000; //print_r($esc); exit; while ($numresults == 500000) { $sql = "SELECT m.from_user_name COLLATE {$collation} as from_user_name, m.to_user COLLATE {$collation} as to_user FROM " . $esc['mysql']['dataset'] . "_mentions m, " . $esc['mysql']['dataset'] . "_tweets t "; $where = "m.tweet_id = t.id AND "; $sql .= sqlSubset($where); $sql .= " LIMIT " . $cur . "," . $numresults; //print $sql."<br>"; $sqlresults = mysql_query($sql); while ($data = mysql_fetch_assoc($sqlresults)) { $data["from_user_name"] = strtolower($data["from_user_name"]); $data["to_user"] = strtolower($data["to_user"]); if (!isset($users[$data["from_user_name"]])) { $users[$data["from_user_name"]] = $arrayName = array('id' => count($usersinv), 'notweets' => 1, 'nomentions' => 0); $usersinv[] = $data["from_user_name"]; } else { $users[$data["from_user_name"]]["notweets"]++; } if (!isset($users[$data["to_user"]])) { $users[$data["to_user"]] = $arrayName = array('id' => count($usersinv), 'notweets' => 0, 'nomentions' => 1); $usersinv[] = $data["to_user"];
//print $sql . "<bR>"; $sqlresults = mysql_unbuffered_query($sql); while ($res = mysql_fetch_assoc($sqlresults)) { $word = $res['h1']; $coword->distinctUsersForWord[$word] = $res['d']; $coword->userDiversity[$word] = round($res['d'] / $res['c'] * 100, 2); $coword->wordFrequency[$word] = $res['c']; $coword->wordFrequencyDividedByUniqueUsers[$word] = round($res['c'] / $res['d'], 2); $coword->wordFrequencyMultipliedByUniqueUsers[$word] = $res['c'] * $res['d']; } mysql_free_result($sqlresults); // do the actual job // get cowords $sql = "SELECT LOWER(A.text COLLATE {$collation}) AS h1, LOWER(B.text COLLATE {$collation}) AS h2 "; $sql .= "FROM " . $esc['mysql']['dataset'] . "_hashtags A, " . $esc['mysql']['dataset'] . "_hashtags B, " . $esc['mysql']['dataset'] . "_tweets t "; $sql .= sqlSubset() . " AND "; $sql .= "LENGTH(A.text)>1 AND LENGTH(B.text)>1 AND "; $sql .= "LOWER(A.text COLLATE {$collation}) < LOWER(B.text COLLATE {$collation}) AND A.tweet_id = t.id AND A.tweet_id = B.tweet_id "; $sql .= "ORDER BY h1,h2"; //print $sql."<br>"; $sqlresults = mysql_query($sql); while ($res = mysql_fetch_assoc($sqlresults)) { $coword->addWord($res['h1']); $coword->addWord($res['h2']); $coword->addCoword($res['h1'], $res['h2'], 1); } unset($coword->words); // as we are adding words manually the frequency would be messed up if ($esc['shell']['minf'] > 1 && !($esc['shell']['topu'] > 0)) { $coword->applyMinFreq($esc['shell']['minf']); //$coword->applyMinDegree($esc['shell']['minf']); // Berno: method no longer in use, remains unharmed
/**
 * Compute per-datepart average sentiment scores for the current subset.
 *
 * Uses the globals $esc (dataset + date range) and $period ("day" groups per
 * day, anything else per hour). Dateparts inside the requested range that
 * have no tweets are added with an empty array so charts get a continuous axis.
 *
 * @return array keyed by datepart string:
 *               [0] avg positive, [1] |avg negative| over all scored tweets;
 *               [2]/[3] the same restricted to "subjective" tweets
 *               (rows where positive != 1 AND negative != 1).
 */
function sentiment_avgs() {
    global $esc, $period;
    $avgs = array();

    // SELECT fragment producing the "datepart" grouping column.
    // NOTE(review): the "day" pattern '%Y.%d.%m' puts the day before the
    // month (kept from the original @todo); left untouched because the
    // gap-filling loop below must generate identical keys.
    if ($period == "day") {
        $datepart = "DATE_FORMAT(t.created_at,'%Y.%d.%m') datepart ";
    } else {
        $datepart = "DATE_FORMAT(t.created_at,'%d. %H:00h') datepart ";
    }

    // all sentiments
    $sql = "SELECT avg(s.positive) as pos, avg(s.negative) as neg, ";
    $sql .= $datepart;
    $sql .= "FROM " . $esc['mysql']['dataset'] . "_tweets t, ";
    $sql .= $esc['mysql']['dataset'] . "_sentiment s ";
    $sql .= sqlSubset("t.id = s.tweet_id AND ");
    $sql .= "GROUP BY datepart ORDER BY t.created_at";
    $rec = mysql_unbuffered_query($sql);
    while ($res = mysql_fetch_assoc($rec)) {
        // negative averages are stored as magnitudes
        $avgs[$res['datepart']][0] = (double) $res['pos'];
        $avgs[$res['datepart']][1] = (double) abs($res['neg']);
    }
    mysql_free_result($rec);

    // only subjective
    $sql = "SELECT avg(s.positive) as pos, avg(s.negative) as neg, ";
    $sql .= $datepart;
    $sql .= "FROM " . $esc['mysql']['dataset'] . "_tweets t, ";
    $sql .= $esc['mysql']['dataset'] . "_sentiment s ";
    $sql .= sqlSubset("t.id = s.tweet_id AND (s.positive != 1 AND s.negative != 1) AND ");
    $sql .= "GROUP BY datepart ORDER BY t.created_at";
    $rec = mysql_unbuffered_query($sql);
    while ($res = mysql_fetch_assoc($rec)) {
        $avgs[$res['datepart']][2] = (double) $res['pos'];
        $avgs[$res['datepart']][3] = (double) abs($res['neg']);
    }
    mysql_free_result($rec);

    // (the original built a third, "dateparts only" SELECT here but never
    // executed it -- removed as dead code)

    // initialize with empty dates so the range is continuous
    $curdate = strtotime($esc['datetime']['startdate']);
    $enddate = strtotime($esc['datetime']['enddate']); // hoisted out of the loop
    while ($curdate < $enddate) {
        // Build the key exactly like the SQL DATE_FORMAT above. For the hourly
        // case a literal ":00" is used (the original used %M, which only
        // matched the query keys when the start datetime was on a full hour).
        $tmp = $period == "day" ? strftime("%Y.%d.%m", $curdate) : strftime("%d. %H", $curdate) . ":00h";
        if (!isset($avgs[$tmp])) {
            $avgs[$tmp] = array();
        }
        $curdate = $period == "day" ? $curdate + 86400 : $curdate + 3600;
    }
    return $avgs;
}
/**
 * Print the ten most frequent hashtags in the current subset as a single
 * comma-separated line: "tag1 (count1), tag2 (count2), ...".
 *
 * Reads the dataset name from the global $esc and restricts the query via
 * sqlSubset(); hashtags are lowercased and grouped case-insensitively.
 *
 * @return void output goes straight to the response via print
 */
function printTopHashtags() {
    global $esc;
    $collation = current_collation();
    $dataset = $esc['mysql']['dataset'];

    $sql = "SELECT COUNT(hashtags.text COLLATE {$collation}) AS count, LOWER(hashtags.text COLLATE {$collation}) AS toget ";
    $sql .= "FROM " . $dataset . "_hashtags hashtags, " . $dataset . "_tweets t ";
    $sql .= sqlSubset("t.id = hashtags.tweet_id AND ");
    $sql .= " GROUP BY toget ORDER BY count DESC limit 10";

    $rec = mysql_query($sql);
    $entries = array();
    while ($row = mysql_fetch_assoc($rec)) {
        $entries[] = $row['toget'] . " (" . $row['count'] . ")";
    }
    // join with ", " instead of trimming a trailing separator
    print implode(", ", $entries);
}
</script> </head> <body> <h1>TCAT :: Sentiment Cloud</h1> <?php validate_all_variables(); $filename = get_filename_for_export("sentiment_cloud"); $csv = new CSV($filename, $outputformat); $sql = "SELECT s.explanation FROM " . $esc['mysql']['dataset'] . "_tweets t, " . $esc['mysql']['dataset'] . "_sentiment s "; $sql .= sqlSubset("s.tweet_id = t.id AND "); //print $sql . "<br>";die; $rec = mysql_unbuffered_query($sql); $negativeSentiments = $positiveSentiments = $wordValues = array(); while ($res = mysql_fetch_assoc($rec)) { if (preg_match_all("/[\\s|\\B]([\\p{L}\\w\\d_]+)\\[(-?\\d)\\]/u", $res['explanation'], $matches)) { foreach ($matches[1] as $k => $word) { $word = strtolower(trim($word)); $sentimentValue = (int) $matches[2][$k]; if ($sentimentValue < 0) { if (array_key_exists($word, $negativeSentiments) === false) { $negativeSentiments[$word] = 0; } $negativeSentiments[$word]++; } else { if (array_key_exists($word, $positiveSentiments) === false) {
die; } // make filename and open file for write if ($bin_type == "geotrack") { $module = "rateLimitDataGeo"; } else { $module = "ratelimitData"; } $module .= "-" . $esc['date']['interval']; $filename = get_filename_for_export($module); $csv = new CSV($filename, $outputformat); // write header $header = "querybin,datetime,tweets ratelimited (estimate)"; $csv->writeheader(explode(',', $header)); $sqlInterval = sqlInterval(); $sqlSubset = sqlSubset(); $sqlGroup = " GROUP BY datepart ASC"; // Use native MySQL to create a temporary table with all dateparts. They should be identical to the dateparts we will use in the GROUP BY statement. // Prepare the string mysql needs in date_add() $mysqlNativeInterval = "day"; // default $interval = daily switch ($esc['date']['interval']) { case "hourly": $mysqlNativeInterval = "hour"; break; case "daily": $mysqlNativeInterval = "day"; break; case "weekly": $mysqlNativeInterval = "week"; break;
/**
 * Print the ten most frequent items in the current subset as a single
 * comma-separated line: "item1 (count1), item2 (count2), ...".
 *
 * Depending on the global $method the items are POS-tagged words (table
 * <dataset>_pos, case preserved) or hashtags (table <dataset>_hashtags,
 * lowercased). Both variants share the same query shape, so the SQL is
 * assembled from the per-method table/alias/expression pieces.
 *
 * NOTE(review): the SELECT also includes a datepart column while grouping
 * only by toget (as in the original) -- presumably relies on MySQL's
 * permissive GROUP BY; confirm before enabling ONLY_FULL_GROUP_BY.
 *
 * @return void output goes straight to the response via print
 */
function printTopHashtags() {
    global $esc, $method;
    $collation = current_collation();
    $sql_interval = "DATE_FORMAT(t.created_at,'%Y-%m-%d') datepart ";

    if ($method == "word") {
        $table = "_pos";
        $alias = "h";
    } else {
        $table = "_hashtags";
        $alias = "hashtags";
    }
    $textexpr = $alias . ".text COLLATE {$collation}";
    // hashtags are grouped case-insensitively; words keep their case
    $togetexpr = ($method == "word") ? $textexpr : "LOWER(" . $textexpr . ")";

    $sql = "SELECT COUNT(" . $textexpr . ") AS count, " . $togetexpr . " AS toget, ";
    $sql .= $sql_interval;
    $sql .= "FROM " . $esc['mysql']['dataset'] . $table . " " . $alias . ", " . $esc['mysql']['dataset'] . "_tweets t ";
    $sql .= sqlSubset("t.id = " . $alias . ".tweet_id AND ");
    $sql .= " GROUP BY toget ORDER BY count DESC limit 10";

    flush();
    $rec = mysql_query($sql);
    $entries = array();
    while ($row = mysql_fetch_assoc($rec)) {
        $entries[] = $row['toget'] . " (" . $row['count'] . ")";
    }
    // join with ", " instead of trimming a trailing separator
    print implode(", ", $entries);
}