echo 'checked="checked"';
}
?>
 /> discard "other" from diagram
            </div>

            <div class="form_row">
                <input type="submit" />
            </div>
        </form>


        <?php 
// Request-handling part of the page: checks the submitted column selection,
// counts the tweets in the current selection and computes the flows between
// adjacent columns. validate_all_variables(), current_collation(), sqlSubset()
// and getFlow() are defined elsewhere.
validate_all_variables();
// NOTE(review): `global` only has an effect when this file is included from
// inside a function; at file scope it is a no-op — confirm how this page is loaded.
global $collation;
$collation = current_collation();
// make sure that all columns are different
if ($_GET["col1_type"] == $_GET["col2_type"] || $_GET["col2_type"] == $_GET["col3_type"] || $_GET["col1_type"] == $_GET["col3_type"]) {
    echo "all columns must be different";
    exit;
}
// get the full tweet count (distinct tweet ids, restricted by whatever sqlSubset() appends)
$sql = "SELECT count(distinct(t.id)) as count FROM " . $esc['mysql']['dataset'] . "_tweets t ";
$sql .= sqlSubset();
// NOTE(review): the query result is not checked before fetching; if the query
// fails, $fulltweetcount ends up null.
$sqlresults = mysql_query($sql);
$data = mysql_fetch_assoc($sqlresults);
$fulltweetcount = $data["count"];
// process columns: first the col1 -> col2 pair (last argument 0) ...
getFlow($_GET["col1_type"], $_GET["col1_cutoff"], $_GET["col2_type"], $_GET["col2_cutoff"], $_GET["discard_other"], 0);
// ... then the col2 -> col3 pair (last argument 1), only when a third column was selected
if ($_GET["col3_type"] != "none") {
    getFlow($_GET["col2_type"], $_GET["col2_cutoff"], $_GET["col3_type"], $_GET["col3_cutoff"], $_GET["discard_other"], 1);
/**
 * Print the ten most frequent hashtags of the current selection.
 *
 * Output format is "tag (count), tag (count), ..." without a trailing
 * separator; nothing is printed when the query returns no rows. Hashtags are
 * lower-cased in SQL before grouping, so counting is case-insensitive.
 *
 * Reads the dataset name from the global $esc; the WHERE clause comes from
 * sqlSubset() and the collation from current_collation() (both defined elsewhere).
 *
 * NOTE(review): the hashtag text is printed as-is; if this output lands in an
 * HTML page, consider escaping it at the call site.
 */
function printTopHashtags()
{
    global $esc;
    $collation = current_collation();
    $sql = "SELECT COUNT(hashtags.text COLLATE {$collation}) AS count, LOWER(hashtags.text COLLATE {$collation}) AS toget ";
    $sql .= "FROM " . $esc['mysql']['dataset'] . "_hashtags hashtags, " . $esc['mysql']['dataset'] . "_tweets t ";
    $sql .= sqlSubset("t.id = hashtags.tweet_id AND ");
    $sql .= " GROUP BY toget ORDER BY count DESC limit 10";
    $rec = mysql_query($sql);
    if (!$rec) {
        // mysql_query() returns false on failure; report it instead of
        // feeding false into mysql_fetch_assoc().
        trigger_error("printTopHashtags: query failed: " . mysql_error(), E_USER_WARNING);
        return;
    }
    $parts = array();
    while ($res = mysql_fetch_assoc($rec)) {
        $parts[] = $res['toget'] . " (" . $res['count'] . ")";
    }
    // implode() places the separator only between entries, so no trailing
    // ", " has to be cut off afterwards.
    print implode(", ", $parts);
}
Esempio n. 3
0
/**
 * Build a per-interval frequency table for $what and write it to $filename
 * through the CSV helper class.
 *
 * For "hashtag", "urls", "hosts" and "mention" the table comes from
 * frequencyTable() (defined elsewhere). For every other value the tweets of
 * the current selection are fetched, assigned to a group according to the
 * global $interval, and counted in PHP:
 *   - "user":         tweets per (lower-cased) from_user_name
 *   - "user-mention": per user, how often the user was mentioned and how often
 *                     the user tweeted (written as separate columns)
 *   - "retweet":      occurrences of identical tweet texts
 *
 * Group keys per $interval: hourly "Y-m-d HHh", weekly "Y WW" (week of the
 * year, Monday as first day), monthly "Y-m", yearly "Y", overall "overall",
 * custom whatever groupByInterval() returns, anything else daily "Y-m-d".
 *
 * Globals read: $tsv (list of $what values that use the generic
 * date/frequency/value layout), $esc (escaped request parameters), $titles,
 * $interval, $outputformat.
 *
 * Terminates the script via die() when $what is neither "user-mention" nor
 * listed in $tsv.
 *
 * @param string $what     kind of thing to count
 * @param string $filename passed to the CSV constructor
 */
function generate($what, $filename)
{
    global $tsv, $network, $esc, $titles, $database, $interval, $outputformat;
    require_once __DIR__ . '/CSV.class.php';
    // initialize variables
    $tweets = $times = $from_user_names = $results = array();
    $csv = new CSV($filename, $outputformat);
    $collation = current_collation();
    // get frequencies
    if ($what == "hashtag") {
        $results = frequencyTable("hashtags", "text");
    } elseif ($what == "urls") {
        $results = frequencyTable("urls", "url_followed");
    } elseif ($what == "hosts") {
        $results = frequencyTable("urls", "domain");
    } elseif ($what == "mention") {
        $results = frequencyTable("mentions", "to_user");
    } else {
        // get other things
        // @todo, this could also use database grouping
        $sql = "SELECT id,text COLLATE {$collation} as text,created_at,from_user_name COLLATE {$collation} as from_user_name FROM " . $esc['mysql']['dataset'] . "_tweets t ";
        $sql .= sqlSubset();
        $rec = mysql_query($sql);
        if ($rec && mysql_num_rows($rec) > 0) {
            while ($res = mysql_fetch_assoc($rec)) {
                $tweets[] = $res['text'];
                $times[] = $res['created_at'];
                $from_user_names[] = strtolower($res['from_user_name']);
            }
        }
        // extract desired things ($what) and group per interval
        foreach ($tweets as $key => $tweet) {
            $timestamp = strtotime($times[$key]);
            switch ($interval) {
                case "hourly":
                    $group = strftime("%Y-%m-%d %Hh", $timestamp);
                    break;
                case "weekly":
                    // %W is the week of the year (Monday first). The previous
                    // %u is the ISO weekday (1-7) in strftime(), which merged
                    // e.g. all Mondays of a year into one group.
                    $group = strftime("%Y %W", $timestamp);
                    break;
                case "monthly":
                    $group = strftime("%Y-%m", $timestamp);
                    break;
                case "yearly":
                    // one group per year (was "%Y-%m", i.e. identical to monthly)
                    $group = strftime("%Y", $timestamp);
                    break;
                case "overall":
                    $group = "overall";
                    break;
                case "custom":
                    $group = groupByInterval(strftime("%Y-%m-%d", $timestamp));
                    break;
                default:
                    // default daily
                    $group = strftime("%Y-%m-%d", $timestamp);
                    break;
            }
            // hashtag / mention / urls / hosts never reach this point; they are
            // served by frequencyTable() above.
            switch ($what) {
                case "user":
                    $results[$group][] = $from_user_names[$key];
                    break;
                case "user-mention":
                    foreach (get_replies($tweet) as $thing) {
                        $results[$group]['mentions'][] = $thing;
                    }
                    $results[$group]['users'][] = $from_user_names[$key];
                    break;
                case "retweet":
                    // TODO, write stemming function
                    $results[$group][] = $tweet;
                    break;
                default:
                    break;
            }
        }
        // count frequency of occurence of thing, per interval
        // (user-mention keeps its nested mentions/users lists; it is packed below)
        if ($what != "user-mention") {
            foreach ($results as $group => $things) {
                $counted_things = array_count_values($things);
                arsort($counted_things);
                $results[$group] = $counted_things;
            }
        }
    }
    // optional " with search ..." part shared by both title lines
    if (!empty($esc['shell']['query'])) {
        $q = " with search " . $esc['shell']['query'];
    } else {
        $q = "";
    }
    // network output for users
    if ($what == "user-mention") {
        foreach ($results as $group => $things) {
            // A group only has a 'mentions' list when at least one of its
            // tweets mentioned somebody; fall back to an empty list otherwise.
            $tmp_mentions = array_count_values(isset($things['mentions']) ? $things['mentions'] : array());
            $tmp_users = array_count_values(isset($things['users']) ? $things['users'] : array());
            $counted_things = array();
            // add all from_user_names, packed as "<times mentioned>,<tweets sent>"
            foreach ($tmp_users as $user => $count) {
                if (isset($tmp_mentions["@" . $user])) {
                    $counted_things[$user] = $tmp_mentions["@" . $user] . "," . $count;
                } else {
                    $counted_things[$user] = "0," . $count;
                }
            }
            // add all users which were replied but not in the set
            foreach ($tmp_mentions as $user => $count) {
                $user = str_replace("@", "", $user);
                if (!isset($counted_things[$user])) {
                    $counted_things[$user] = $count . ",0";
                }
            }
            ksort($counted_things);
            $results[$group] = $counted_things;
        }
        if (isset($titles[$what])) {
            $csv->writeheader(array($titles[$what] . $q . " from " . $esc['date']["startdate"] . " to " . $esc['date']["enddate"]));
        }
        $csv->writeheader(array("date", "user", "mentions", "tweets"));
        foreach ($results as $group => $things) {
            foreach ($things as $thing => $count) {
                // unpack what we packed
                $exp = explode(",", $count);
                $csv->newrow();
                $csv->addfield($group);
                $csv->addfield($thing);
                $csv->addfield($exp[0]);
                $csv->addfield($exp[1]);
                $csv->writerow();
            }
        }
        // write tsv output
    } elseif (in_array($what, $tsv)) {
        ksort($results);
        // construct file
        if (isset($titles[$what])) {
            $csv->writeheader(array($titles[$what] . " for " . $esc['shell']['datasetname'] . $q . " from " . $esc['date']["startdate"] . " to " . $esc['date']["enddate"]));
        }
        if ($what == "urls") {
            $csv->writeheader(array("date", "frequency", "tweetedurl"));
        } elseif ($what == "hosts") {
            $csv->writeheader(array("date", "frequency", "domain", "name"));
        } else {
            $csv->writeheader(array("date", "frequency", $what));
        }
        foreach ($results as $group => $things) {
            arsort($things);
            foreach ($things as $thing => $count) {
                // skip empty values and everything below the minimum frequency
                if (empty($thing)) {
                    continue;
                }
                if ($count < $esc['shell']['minf']) {
                    continue;
                }
                $csv->newrow();
                $csv->addfield($group);
                $csv->addfield($count);
                $csv->addfield($thing);
                $csv->writerow();
            }
        }
    } else {
        die('no valid output format found');
    }
    $csv->close();
}
/**
 * Print the ten most frequent items of the current selection as
 * "item (count), item (count), ..." without a trailing separator.
 *
 * When the global $method is "word", items are read from the dataset's
 * "_pos" table and grouped as stored; otherwise they are read from the
 * "_hashtags" table and lower-cased before grouping.
 *
 * Reads the dataset name from the global $esc; the WHERE clause comes from
 * sqlSubset() and the collation from current_collation() (both defined elsewhere).
 *
 * The query no longer selects a per-day "datepart" column: it was never read,
 * and as a non-aggregated column outside GROUP BY it makes the statement
 * invalid when MySQL runs with ONLY_FULL_GROUP_BY.
 */
function printTopHashtags()
{
    global $esc, $method;
    $collation = current_collation();
    // Both variants share one query shape; only the source table, its alias
    // and the grouping expression differ.
    if ($method == "word") {
        $alias = "h";
        $table = $esc['mysql']['dataset'] . "_pos";
        $toget = "h.text COLLATE {$collation}";
    } else {
        $alias = "hashtags";
        $table = $esc['mysql']['dataset'] . "_hashtags";
        $toget = "LOWER(hashtags.text COLLATE {$collation})";
    }
    $sql = "SELECT COUNT({$alias}.text COLLATE {$collation}) AS count, {$toget} AS toget ";
    $sql .= "FROM " . $table . " " . $alias . ", " . $esc['mysql']['dataset'] . "_tweets t ";
    $sql .= sqlSubset("t.id = {$alias}.tweet_id AND ");
    $sql .= " GROUP BY toget ORDER BY count DESC limit 10";
    $rec = mysql_query($sql);
    if (!$rec) {
        // mysql_query() returns false on failure; report it instead of
        // feeding false into mysql_fetch_assoc().
        trigger_error("printTopHashtags: query failed: " . mysql_error(), E_USER_WARNING);
        return;
    }
    $parts = array();
    while ($res = mysql_fetch_assoc($rec)) {
        $parts[] = $res['toget'] . " (" . $res['count'] . ")";
    }
    // implode() places the separator only between entries, so no trailing
    // ", " has to be cut off afterwards.
    print implode(", ", $parts);
}