Пример #1
0
function getFlow($col1_type, $col1_cutoff, $col2_type, $col2_cutoff, $discard_other, $runcounter)
{
    global $esc, $network, $oldtoplists, $collation;
    $sql = "SELECT LOWER(" . $col1_type . ") AS col1, LOWER(t." . $col2_type . ") AS col2 FROM ";
    $sql .= $esc['mysql']['dataset'] . "_tweets t ";
    $sql .= sqlSubset();
    if ($col1_type == "hashtag") {
        $sql = "SELECT LOWER(h.text COLLATE {$collation}) AS col1, LOWER(t." . $col2_type . ") AS col2 FROM ";
        $sql .= $esc['mysql']['dataset'] . "_tweets t, " . $esc['mysql']['dataset'] . "_hashtags h ";
        $where = "t.id = h.tweet_id AND ";
        $sql .= sqlSubset($where);
    }
    if ($col2_type == "hashtag") {
        $sql = "SELECT LOWER(t." . $col1_type . ") AS col1,LOWER(h.text COLLATE {$collation}) AS col2 FROM ";
        $sql .= $esc['mysql']['dataset'] . "_tweets t, " . $esc['mysql']['dataset'] . "_hashtags h ";
        $where = "t.id = h.tweet_id AND ";
        $sql .= sqlSubset($where);
    }
    if ($col1_type == "domain") {
        $sql = "SELECT LOWER(u.domain) AS col1, LOWER(t." . $col2_type . ") AS col2 FROM ";
        $sql .= $esc['mysql']['dataset'] . "_tweets t, " . $esc['mysql']['dataset'] . "_urls u ";
        $where = "t.id = u.tweet_id AND ";
        $sql .= sqlSubset($where);
    }
    if ($col2_type == "domain") {
        $sql = "SELECT LOWER(t." . $col1_type . ") AS col1,LOWER(u.domain) AS col2 FROM ";
        $sql .= $esc['mysql']['dataset'] . "_tweets t, " . $esc['mysql']['dataset'] . "_urls u ";
        $where = "t.id = u.tweet_id AND ";
        $sql .= sqlSubset($where);
    }
    // voting for keeping the SQL output, nice way to trace results for expert users
    echo "sql query " . ($runcounter + 1) . ": " . $sql . "<br />";
    $sqlresults = mysql_query($sql);
    // run through the data once to create item counts for cutting and fusing
    $data = array();
    $toplists = array();
    $toplists["col1"] = array();
    $toplists["col2"] = array();
    while ($res = mysql_fetch_assoc($sqlresults)) {
        // ---------------------
        // some cleaning
        $col1 = $res["col1"];
        $col2 = $res["col2"];
        if ($col1_type == "from_user_utcoffset") {
            $col1 .= "s";
        }
        if ($col2_type == "from_user_utcoffset") {
            $col2 .= "s";
        }
        if ($col1_type == "hashtag") {
            $col1 = "#" . $col1;
        }
        // also important to make sure that things don't have the same name (D3 sankey uses node names as ids)
        if ($col2_type == "hashtag") {
            $col2 = "#" . $col2;
        }
        $col1 = preg_replace("/<.+>/U", "", $col1);
        $col1 = preg_replace("/[ \\s\t]+/", " ", $col1);
        $col1 = trim($col1);
        if ($col1 == "") {
            $col1 = "empty";
        }
        $col1 = addslashes($col1);
        $res["col1"] = $col1;
        $col2 = preg_replace("/<.+>/U", "", $col2);
        $col2 = preg_replace("/[ \\s\t]+/", " ", $col2);
        $col2 = trim($col2);
        $col2 = addslashes($col2);
        if ($col2 == "") {
            $col2 = "empty";
        }
        $res["col2"] = $col2;
        // ---------------------
        // counting results
        if (!isset($toplists["col1"][$col1])) {
            $toplists["col1"][$col1] = 0;
        }
        $toplists["col1"][$col1]++;
        if (!isset($toplists["col2"][$col2])) {
            $toplists["col2"][$col2] = 0;
        }
        $toplists["col2"][$col2]++;
        $data[] = $res;
    }
    // ---------------------
    // cut off elements for the sankey column
    foreach ($toplists as $key => $list) {
        arsort($toplists[$key]);
    }
    if ($col2_cutoff != 0) {
        $toplists["col2"] = array_slice($toplists["col2"], 0, $col2_cutoff);
    }
    // if there are three columns, it's necessary to have same toplist for middle column because the two queries may have different results
    // first query specifies toplist for the second column
    if ($runcounter == 0) {
        if ($col1_cutoff != 0) {
            $toplists["col1"] = array_slice($toplists["col1"], 0, $col1_cutoff);
        }
        $oldtoplists = $toplists;
    } else {
        $oldtoplist = array_keys($oldtoplists["col2"]);
        $tmplist = array();
        foreach ($toplists["col1"] as $topkey => $topvalue) {
            if (in_array($topkey, $oldtoplist)) {
                $tmplist[$topkey] = $topvalue;
            }
        }
        $toplists["col1"] = $tmplist;
    }
    // ---------------------
    // create the sankey network
    if (!isset($network)) {
        $network = array();
        $network["nodes"] = array();
        $network["links"] = array();
        $translate = array();
    }
    foreach ($data as $res) {
        $col1 = $res['col1'];
        $col2 = $res['col2'];
        if (!isset($toplists["col1"][$col1])) {
            if ($discard_other == "on") {
                continue;
            }
            $col1 = "other " . $col1_type;
        }
        if (!isset($toplists["col2"][$col2])) {
            if ($discard_other == "on") {
                continue;
            }
            $col2 = "other " . $col2_type;
        }
        if (!in_array($col1, $network["nodes"])) {
            $network["nodes"][] = $col1;
        }
        if (!in_array($col2, $network["nodes"])) {
            $network["nodes"][] = $col2;
        }
        $edge = $col1 . "_XXX_" . $col2;
        if (!isset($network["links"][$edge])) {
            $network["links"][$edge] = 0;
        }
        $network["links"][$edge]++;
    }
}
Пример #2
0
	
        </script>

    </head>

    <body>

        <h1>TCAT :: Export URLs</h1>

        <?php 
validate_all_variables();
$filename = get_filename_for_export('urlsExport');
$csv = new CSV($filename, $outputformat);
$csv->writeheader(array('tweet_id', 'url', 'url_expanded', 'url_followed'));
$sql = "SELECT t.id as id, u.url as url, u.url_expanded as url_expanded, u.url_followed as url_followed FROM " . $esc['mysql']['dataset'] . "_tweets t, " . $esc['mysql']['dataset'] . "_urls u ";
$sql .= sqlSubset();
$sql .= " AND u.tweet_id = t.id ORDER BY id";
$sqlresults = mysql_unbuffered_query($sql);
$out = "";
if ($sqlresults) {
    while ($data = mysql_fetch_assoc($sqlresults)) {
        $csv->newrow();
        $csv->addfield($data['id'], 'integer');
        $csv->addfield($data['url'], 'string');
        if (isset($data['url_followed']) && strlen($data['url_followed']) > 1) {
            $csv->addfield($data['url'], 'string');
        } else {
            $csv->addfield('', 'string');
        }
        if (isset($data['url_expanded']) && strlen($data['url_expanded']) > 1) {
            $csv->addfield($data['url_expanded'], 'string');
Пример #3
0
        <h1>TCAT :: Mention graph</h1>

        <?php 
validate_all_variables();
$collation = current_collation();
$users = array();
$usersinv = array();
$edges = array();
$cur = 0;
$numresults = 500000;
//print_r($esc); exit;
while ($numresults == 500000) {
    $sql = "SELECT m.from_user_name COLLATE {$collation} as from_user_name, m.to_user COLLATE {$collation} as to_user FROM " . $esc['mysql']['dataset'] . "_mentions m, " . $esc['mysql']['dataset'] . "_tweets t ";
    $where = "m.tweet_id = t.id AND ";
    $sql .= sqlSubset($where);
    $sql .= " LIMIT " . $cur . "," . $numresults;
    //print $sql."<br>";
    $sqlresults = mysql_query($sql);
    while ($data = mysql_fetch_assoc($sqlresults)) {
        $data["from_user_name"] = strtolower($data["from_user_name"]);
        $data["to_user"] = strtolower($data["to_user"]);
        if (!isset($users[$data["from_user_name"]])) {
            $users[$data["from_user_name"]] = $arrayName = array('id' => count($usersinv), 'notweets' => 1, 'nomentions' => 0);
            $usersinv[] = $data["from_user_name"];
        } else {
            $users[$data["from_user_name"]]["notweets"]++;
        }
        if (!isset($users[$data["to_user"]])) {
            $users[$data["to_user"]] = $arrayName = array('id' => count($usersinv), 'notweets' => 0, 'nomentions' => 1);
            $usersinv[] = $data["to_user"];
Пример #4
0
//print $sql . "<bR>";
$sqlresults = mysql_unbuffered_query($sql);
while ($res = mysql_fetch_assoc($sqlresults)) {
    $word = $res['h1'];
    $coword->distinctUsersForWord[$word] = $res['d'];
    $coword->userDiversity[$word] = round($res['d'] / $res['c'] * 100, 2);
    $coword->wordFrequency[$word] = $res['c'];
    $coword->wordFrequencyDividedByUniqueUsers[$word] = round($res['c'] / $res['d'], 2);
    $coword->wordFrequencyMultipliedByUniqueUsers[$word] = $res['c'] * $res['d'];
}
mysql_free_result($sqlresults);
// do the actual job
// get cowords
$sql = "SELECT LOWER(A.text COLLATE {$collation}) AS h1, LOWER(B.text COLLATE {$collation}) AS h2 ";
$sql .= "FROM " . $esc['mysql']['dataset'] . "_hashtags A, " . $esc['mysql']['dataset'] . "_hashtags B, " . $esc['mysql']['dataset'] . "_tweets t ";
$sql .= sqlSubset() . " AND ";
$sql .= "LENGTH(A.text)>1 AND LENGTH(B.text)>1 AND ";
$sql .= "LOWER(A.text COLLATE {$collation}) < LOWER(B.text COLLATE {$collation}) AND A.tweet_id = t.id AND A.tweet_id = B.tweet_id ";
$sql .= "ORDER BY h1,h2";
//print $sql."<br>";
$sqlresults = mysql_query($sql);
while ($res = mysql_fetch_assoc($sqlresults)) {
    $coword->addWord($res['h1']);
    $coword->addWord($res['h2']);
    $coword->addCoword($res['h1'], $res['h2'], 1);
}
unset($coword->words);
// as we are adding words manually the frequency would be messed up
if ($esc['shell']['minf'] > 1 && !($esc['shell']['topu'] > 0)) {
    $coword->applyMinFreq($esc['shell']['minf']);
    //$coword->applyMinDegree($esc['shell']['minf']);	// Berno: method no longer in use, remains unharmed
Пример #5
0
function sentiment_avgs()
{
    global $esc, $period;
    $avgs = array();
    // all sentiments
    $sql = "SELECT avg(s.positive) as pos, avg(s.negative) as neg, ";
    if ($period == "day") {
        // @todo
        $sql .= "DATE_FORMAT(t.created_at,'%Y.%d.%m') datepart ";
    } else {
        $sql .= "DATE_FORMAT(t.created_at,'%d. %H:00h') datepart ";
    }
    $sql .= "FROM " . $esc['mysql']['dataset'] . "_tweets t, ";
    $sql .= $esc['mysql']['dataset'] . "_sentiment s ";
    $sql .= sqlSubset("t.id = s.tweet_id AND ");
    $sql .= "GROUP BY datepart ORDER BY t.created_at";
    $rec = mysql_unbuffered_query($sql);
    while ($res = mysql_fetch_assoc($rec)) {
        $neg = $res['neg'];
        $pos = $res['pos'];
        $avgs[$res['datepart']][0] = (double) $pos;
        $avgs[$res['datepart']][1] = (double) abs($neg);
    }
    mysql_free_result($rec);
    // only subjective
    $sql = "SELECT avg(s.positive) as pos, avg(s.negative) as neg, ";
    if ($period == "day") {
        // @todo
        $sql .= "DATE_FORMAT(t.created_at,'%Y.%d.%m') datepart ";
    } else {
        $sql .= "DATE_FORMAT(t.created_at,'%d. %H:00h') datepart ";
    }
    $sql .= "FROM " . $esc['mysql']['dataset'] . "_tweets t, ";
    $sql .= $esc['mysql']['dataset'] . "_sentiment s ";
    $sql .= sqlSubset("t.id = s.tweet_id AND (s.positive != 1 AND s.negative != 1) AND ");
    $sql .= "GROUP BY datepart ORDER BY t.created_at";
    $rec = mysql_unbuffered_query($sql);
    while ($res = mysql_fetch_assoc($rec)) {
        $neg = $res['neg'];
        $pos = $res['pos'];
        $avgs[$res['datepart']][2] = (double) $pos;
        $avgs[$res['datepart']][3] = (double) abs($neg);
    }
    mysql_free_result($rec);
    // only dateparts
    $sql = "SELECT ";
    if ($period == "day") {
        // @todo
        $sql .= "DATE_FORMAT(t.created_at,'%Y.%d.%m') datepart ";
    } else {
        $sql .= "DATE_FORMAT(t.created_at,'%d. %H:00h') datepart ";
    }
    $sql .= "FROM " . $esc['mysql']['dataset'] . "_tweets t ";
    $sql .= sqlSubset();
    $sql .= "GROUP BY datepart";
    // initialize with empty dates
    $curdate = strtotime($esc['datetime']['startdate']);
    while ($curdate < strtotime($esc['datetime']['enddate'])) {
        $thendate = $period == "day" ? $curdate + 86400 : $curdate + 3600;
        $tmp = $period == "day" ? strftime("%Y.%d.%m", $curdate) : strftime("%d. %H:%M", $curdate) . "h";
        if (!isset($avgs[$tmp])) {
            $avgs[$tmp] = array();
        }
        $curdate = $thendate;
    }
    return $avgs;
}
function printTopHashtags()
{
    global $esc;
    $collation = current_collation();
    $results = array();
    $sql = "SELECT COUNT(hashtags.text COLLATE {$collation}) AS count, LOWER(hashtags.text COLLATE {$collation}) AS toget ";
    $sql .= "FROM " . $esc['mysql']['dataset'] . "_hashtags hashtags, " . $esc['mysql']['dataset'] . "_tweets t ";
    $sql .= sqlSubset("t.id = hashtags.tweet_id AND ");
    $sql .= " GROUP BY toget ORDER BY count DESC limit 10";
    //print $sql."<br>";
    $rec = mysql_query($sql);
    $out = "";
    while ($res = mysql_fetch_assoc($rec)) {
        $out .= $res['toget'] . " (" . $res['count'] . "), ";
    }
    print substr($out, 0, -2);
}
Пример #7
0
	
	
        </script>

    </head>

    <body>

        <h1>TCAT :: Sentiment Cloud</h1>

        <?php 
validate_all_variables();
$filename = get_filename_for_export("sentiment_cloud");
$csv = new CSV($filename, $outputformat);
$sql = "SELECT s.explanation FROM " . $esc['mysql']['dataset'] . "_tweets t, " . $esc['mysql']['dataset'] . "_sentiment s ";
$sql .= sqlSubset("s.tweet_id = t.id AND ");
//print $sql . "<br>";die;
$rec = mysql_unbuffered_query($sql);
$negativeSentiments = $positiveSentiments = $wordValues = array();
while ($res = mysql_fetch_assoc($rec)) {
    if (preg_match_all("/[\\s|\\B]([\\p{L}\\w\\d_]+)\\[(-?\\d)\\]/u", $res['explanation'], $matches)) {
        foreach ($matches[1] as $k => $word) {
            $word = strtolower(trim($word));
            $sentimentValue = (int) $matches[2][$k];
            if ($sentimentValue < 0) {
                if (array_key_exists($word, $negativeSentiments) === false) {
                    $negativeSentiments[$word] = 0;
                }
                $negativeSentiments[$word]++;
            } else {
                if (array_key_exists($word, $positiveSentiments) === false) {
Пример #8
0
    die;
}
// make filename and open file for write
if ($bin_type == "geotrack") {
    $module = "rateLimitDataGeo";
} else {
    $module = "ratelimitData";
}
$module .= "-" . $esc['date']['interval'];
$filename = get_filename_for_export($module);
$csv = new CSV($filename, $outputformat);
// write header
$header = "querybin,datetime,tweets ratelimited (estimate)";
$csv->writeheader(explode(',', $header));
$sqlInterval = sqlInterval();
$sqlSubset = sqlSubset();
$sqlGroup = " GROUP BY datepart ASC";
// Use native MySQL to create a temporary table with all dateparts. They should be identical to the dateparts we will use in the GROUP BY statement.
// Prepare the string mysql needs in date_add()
$mysqlNativeInterval = "day";
// default $interval = daily
switch ($esc['date']['interval']) {
    case "hourly":
        $mysqlNativeInterval = "hour";
        break;
    case "daily":
        $mysqlNativeInterval = "day";
        break;
    case "weekly":
        $mysqlNativeInterval = "week";
        break;
function printTopHashtags()
{
    global $esc, $method;
    $collation = current_collation();
    $sql_interval = "DATE_FORMAT(t.created_at,'%Y-%m-%d') datepart ";
    if ($method == "word") {
        $sql = "SELECT COUNT(h.text COLLATE {$collation}) AS count, h.text COLLATE {$collation} AS toget, ";
        $sql .= $sql_interval;
        $sql .= "FROM " . $esc['mysql']['dataset'] . "_pos h, " . $esc['mysql']['dataset'] . "_tweets t ";
        $where = "t.id = h.tweet_id AND ";
        $sql .= sqlSubset($where);
        $sql .= " GROUP BY toget ORDER BY count DESC limit 10";
    } else {
        $sql = "SELECT COUNT(hashtags.text COLLATE {$collation}) AS count, LOWER(hashtags.text COLLATE {$collation}) AS toget, ";
        $sql .= $sql_interval;
        $sql .= "FROM " . $esc['mysql']['dataset'] . "_hashtags hashtags, " . $esc['mysql']['dataset'] . "_tweets t ";
        $where = "t.id = hashtags.tweet_id AND ";
        $sql .= sqlSubset($where);
        $sql .= " GROUP BY toget ORDER BY count DESC limit 10";
    }
    //print $sql . "<br>"; flush();
    $rec = mysql_query($sql);
    $out = "";
    while ($res = mysql_fetch_assoc($rec)) {
        //if ($res['count'] > $esc['shell']['minf'])
        $out .= $res['toget'] . " (" . $res['count'] . "), ";
    }
    print substr($out, 0, -2);
}