/**
 * Runs the dataset preparation sequence.
 *
 * Calls loadFileNames(), loadTweetIDs() and loadKeywords() in that order,
 * then passes every entry of the global $FileNames to collectTweetsFrom(),
 * and finally calls storeTweetIDs().
 *
 * NOTE(review): presumably the load* helpers populate shared globals
 * (including $FileNames) and storeTweetIDs() persists what was collected;
 * confirm against their definitions.
 */
function prepareDataset()
{
    global $FileNames;

    // Helpers are invoked in the original order; they may depend on each other.
    loadFileNames();
    loadTweetIDs();
    loadKeywords();

    foreach ($FileNames as $source_file) {
        collectTweetsFrom($source_file);
    }

    storeTweetIDs();
}
/**
 * Builds a random test set by sampling rows from a dataset CSV file.
 *
 * Reads "<dataset_dir><dataset_name>.csv" and appends accepted rows to
 * TESTSET_LOCATION . "test.<dataset_name>.<mdHi>.csv" until $testset_size
 * rows have been written. For each candidate, a random number of rows is
 * skipped (wrapping to the start of the file at EOF) and the next row read.
 *
 * A row is accepted when its column 2 value has not been picked before and
 * the remaining quota in $KeyWords_left for its column 0 value is positive.
 * On acceptance the quota is decremented, column 0 is replaced by the
 * matching $KeyWords entry, column 2 is dropped, and the row is written.
 * Progress ("<count>, ") is printed after every accepted row.
 *
 * NOTE(review): column 0 looks like a keyword index and column 2 like a
 * tweet ID; confirm against the code that writes the dataset.
 * NOTE(review): the loop ends only once $testset_size rows were accepted;
 * if the dataset cannot supply that many eligible rows it never terminates
 * (unchanged from the previous implementation).
 *
 * @param string $dataset_dir  Directory prefix of the dataset file.
 * @param string $dataset_name Dataset file name without the ".csv" suffix.
 *
 * @throws \RuntimeException When the dataset or test set file cannot be opened.
 */
function prepareTestset($dataset_dir, $dataset_name)
{
    global $KeyWords, $KeyWords_left, $testset_size;

    loadKeywords();

    $dataset_path = $dataset_dir . $dataset_name . ".csv";
    $dataset_size = countDataset($dataset_dir . $dataset_name);

    // mt_rand() expects integers; the division yields a float, so truncate
    // explicitly (and never go below the lower bound of 0).
    $max_random = max(0, (int) ($dataset_size / constant("NUMBER_OF_SLICE")));

    $testset_name = "test." . $dataset_name . "." . date("mdHi");
    $testset_path = constant("TESTSET_LOCATION") . $testset_name . ".csv";

    $fp_dataset = fopen($dataset_path, "r");
    if ($fp_dataset === false) {
        throw new \RuntimeException("Cannot open dataset file: " . $dataset_path);
    }

    $fp_testset = fopen($testset_path, "a");
    if ($fp_testset === false) {
        fclose($fp_dataset);
        throw new \RuntimeException("Cannot open testset file: " . $testset_path);
    }

    $no_in_ts = 0;
    // Set of already selected column-2 values, stored as keys for O(1) lookup.
    $selected = array();

    while ($no_in_ts < $testset_size) {
        $jump = mt_rand(0, $max_random);

        // Skip $jump + 1 rows and keep the one after them: $jump + 2 reads in
        // total. At EOF the same handle is rewound instead of opening a new
        // one, so no file handle is leaked on wrap-around.
        $new_tweet = false;
        for ($reads = $jump + 2; $reads > 0; $reads--) {
            if (feof($fp_dataset)) {
                rewind($fp_dataset);
            }
            $new_tweet = fgetcsv($fp_dataset, 256);
        }

        // fgetcsv() yields false at EOF and array(null) for a blank line;
        // such rows (and rows lacking the needed columns) cannot be selected.
        if (!is_array($new_tweet) || !isset($new_tweet[0], $new_tweet[2])) {
            continue;
        }

        $keyword_index = $new_tweet[0];
        $tweet_id = $new_tweet[2];

        if (isset($selected[$tweet_id])) {
            continue;
        }
        if (!isset($KeyWords_left[$keyword_index]) || !($KeyWords_left[$keyword_index] > 0)) {
            continue;
        }

        $selected[$tweet_id] = true;
        $KeyWords_left[$keyword_index]--;

        $new_tweet[0] = $KeyWords[$keyword_index];
        unset($new_tweet[2]);
        fputcsv($fp_testset, $new_tweet);

        $no_in_ts++;
        print $no_in_ts . ", ";
    }

    fclose($fp_dataset);
    fclose($fp_testset);
}