/**
 * Merges several already-sorted log files into one sorted output file.
 *
 * One current row per input file is kept in a MyHeap built from the two
 * comparator indexes. The smallest row is repeatedly extracted and written
 * out, then replaced by the next row of the file it came from.
 *
 * @param string  $filename     name of the file that will be generated
 * @param array   $filesToMerge names of the files to be merged (missing files are skipped)
 * @param int     $nbLogParam   number of '|'-separated log parameters per line
 * @param int     $comparator1  first comparator index (0: song_id, 1: usr_id, 2: country_code)
 * @param int     $comparator2  second comparator index (0: song_id, 1: usr_id, 2: country_code)
 * @param boolean $deleteFiles  whether the input files are deleted once merged
 * @return int number of input files that were found and opened; when fewer than
 *             two are found nothing is merged and no output file is created
 */
function mergeFiles($filename, $filesToMerge, $nbLogParam, $comparator1, $comparator2, $deleteFiles = true)
{
    global $red, $green, $blue, $noColor, $OK;

    echo "Now merging files... ";

    // Open every input that exists on disk; report and skip the others.
    $handles = array();
    foreach ($filesToMerge as $path) {
        if (!file_exists($path)) {
            echo $path . " not found, it will not be merged...\n";
            continue;
        }
        $handles[] = fopen($path, 'r') or die($red . "Oops, couldn't open " . $path . "!" . $noColor . "\n\n");
    }
    $nbFiles = count($handles);

    // With zero or one input there is nothing to merge.
    if ($nbFiles < 2) {
        echo "(No need to merge, operation canceled)\n";
        foreach ($handles as $handle) {
            fclose($handle);
        }
        return $nbFiles;
    }

    // Create the output file.
    $fp = fopen($filename, 'w') or die($red . "Oops, couldn't create a new file!" . $noColor . "\n\n");

    // MyHeap is defined elsewhere; it is expected to keep rows ordered by the two comparator indexes.
    $heap = new MyHeap($comparator1, $comparator2);

    // Pops the smallest row, writes it to the output file, and returns the
    // index of the input file that row came from (stored as its last element).
    $emitSmallest = function () use ($heap, $fp) {
        $entry = $heap->extract();
        $sourceIndex = array_pop($entry);
        fwrite($fp, implode('|', $entry) . "\n");
        return $sourceIndex;
    };

    // Prime the heap with the first row of every file, tagged with its file index.
    foreach ($handles as $index => $handle) {
        if (feof($handle)) {
            // File shouldn't be empty at this point.
            exit($red . "Unexpected EOF reached... Script aborted!" . $noColor . "\n");
        }
        $fields = explode('|', trim(fgets($handle)));
        $fields[] = $index;
        if (count($fields) != $nbLogParam + 1) {
            // Corruption is supposed to have been ruled out before this step.
            exit($red . "Unexpected data corruption occurred... Script aborted!" . $noColor . "\n");
        }
        $heap->insert($fields);
    }

    // Write the very first (smallest) row.
    if ($heap->isEmpty()) {
        exit($red . "Heap is unexpectedly empty... Script aborted!" . $noColor . "\n");
    }
    $indexOfLastFileRead = $emitSmallest();

    // Refill from the file that supplied the last written row, then write the
    // next smallest row, until every file has been exhausted.
    $nbFilesEnded = 0;
    while ($nbFilesEnded < $nbFiles) {
        $handle = $handles[$indexOfLastFileRead];
        $gotRow = false;

        if (!feof($handle)) {
            $fields = explode('|', trim(fgets($handle)));
            $fields[] = $indexOfLastFileRead;
            // A row without the expected number of fields marks the end of
            // that file (e.g. an empty last line).
            $gotRow = (count($fields) == $nbLogParam + 1);
        }

        if ($gotRow) {
            $heap->insert($fields);
        } else {
            $nbFilesEnded++;
        }

        if (!$heap->isEmpty()) {
            $indexOfLastFileRead = $emitSmallest();
        }
    }

    // Close the inputs, optionally delete them, then close the output.
    echo $OK . "Cleaning up... ";
    foreach ($handles as $handle) {
        fclose($handle);
    }
    if ($deleteFiles) {
        foreach ($filesToMerge as $path) {
            if (file_exists($path)) {
                unlink($path);
            }
        }
    }
    fclose($fp);
    unset($emitSmallest);
    unset($handles);
    unset($heap);

    echo $OK . $green . "The " . $nbFiles . " files have been merged successfully!" . $noColor . "\n";
    return $nbFiles;
}
/**
 * Generates a 'Top' file from a sorted log file.
 *
 * The input is read line by line. Consecutive lines sharing the same criterion
 * (field $criterionIndex) form one group; inside a group, consecutive lines
 * sharing the same value (field $valueIndex) are counted. For every group one
 * line is written, with entries listed from highest to lowest heap rank:
 *
 *     criterion|value1:n1,value2:n2,...
 *
 * At most $topNumber entries are kept per group. The last line written has no
 * trailing newline (unchanged from the previous behaviour).
 *
 * The script is aborted if the file is empty or if its first line does not
 * contain exactly $nbLogParam fields. A later malformed line ends the reading.
 *
 * @param string $filename       name of the sorted log file
 * @param string $topFilename    name of the top file that will be generated
 * @param int    $topNumber      maximum number of entries per criterion ('Top $topNumber')
 * @param int    $nbLogParam     number of '|'-separated log parameters per line
 * @param int    $criterionIndex index of the criterion (0: song_id, 1: usr_id, 2: country_code)
 * @param int    $valueIndex     index of the counted value (0: song_id, 1: usr_id, 2: country_code)
 */
function generateTopFile($filename, $topFilename, $topNumber, $nbLogParam, $criterionIndex, $valueIndex)
{
    global $red, $green, $blue, $noColor, $OK;

    echo $blue . "*** Generating " . $topFilename . " ***" . $noColor . "\n";

    // The sorted log file is opened
    $fh = fopen($filename, 'r') or die($red . "Oops, couldn't open " . $filename . "!" . $noColor . "\n\n");

    // The top file is created
    $fp = fopen($topFilename, 'w') or die($red . "Oops, couldn't create a new file!" . $noColor . "\n\n");

    // Heap entries are array($valueID, $counter).
    // MyHeap is defined elsewhere; it is expected to behave as a min-heap ordered
    // by counter, then by value ID, so extract() removes the weakest entry first.
    $VALUE_INDEX = 0;
    $COUNTER_INDEX = 1;
    $heap = new MyHeap($COUNTER_INDEX, $VALUE_INDEX);

    // Stores a finished (value, count) pair and evicts one entry when the heap
    // grows beyond $topNumber.
    $recordCount = function ($valueID, $count) use ($heap, $topNumber) {
        $heap->insert(array($valueID, $count));
        if ($heap->count() > $topNumber) {
            $heap->extract();
        }
    };

    // Empties the heap and returns the formatted Top line (without newline) for
    // the given criterion. Entries are listed in reverse extraction order.
    $drainTop = function ($criterionID) use ($heap, $VALUE_INDEX, $COUNTER_INDEX) {
        $entries = array();
        while (!$heap->isEmpty()) {
            $data = $heap->extract();
            $entries[] = $data[$VALUE_INDEX] . ":" . $data[$COUNTER_INDEX];
        }
        return $criterionID . "|" . implode(",", array_reverse($entries));
    };

    echo "Reading and counting data... ";

    // The first line is read. feof() only turns true after a read has hit the
    // end of the file, so the result of fgets() is what reveals an empty file.
    $line = fgets($fh);
    if ($line === false) {
        exit($red . "Unexpected EOF reached... Script aborted!" . $noColor . "\n");
    }
    $row = explode('|', trim($line));
    if (count($row) !== (int) $nbLogParam) {
        exit($red . "Unexpected data corruption occurred... Script aborted!" . $noColor . "\n");
    }
    $currentCriterionID = $row[$criterionIndex]; // criterion currently analyzed
    $currentValueID = $row[$valueIndex];         // value currently counted
    $counter = 1;                                // occurrences of $currentValueID so far

    while (!feof($fh)) {
        $line = fgets($fh);
        if ($line === false) {
            break;
        }
        $row = explode('|', trim($line));
        if (count($row) !== (int) $nbLogParam) {
            // A line without the expected number of fields means the end of
            // the usable data has been reached
            break;
        }

        // IDs are compared strictly: they are strings, and a loose comparison
        // would treat numeric-looking IDs such as "007" and "7" as equal.
        if ($currentCriterionID !== $row[$criterionIndex]) {
            // Next criterion detected: finish the current group and write its Top
            $recordCount($currentValueID, $counter);
            fwrite($fp, $drainTop($currentCriterionID) . "\n");

            $currentCriterionID = $row[$criterionIndex];
            $currentValueID = $row[$valueIndex];
            $counter = 1;
        } elseif ($currentValueID !== $row[$valueIndex]) {
            // Next value detected inside the same criterion
            $recordCount($currentValueID, $counter);

            $currentValueID = $row[$valueIndex];
            $counter = 1;
        } else {
            // Same criterion and same value: keep counting
            $counter++;
        }
    }

    // Repeat the process for the last Top (written without a trailing newline)
    $recordCount($currentValueID, $counter);
    fwrite($fp, $drainTop($currentCriterionID));

    echo $OK . $blue . "*** DONE! ***" . $noColor . "\n";

    unset($recordCount);
    unset($drainTop);
    unset($heap);
    fclose($fh);
    fclose($fp);
}
<?php

/**
 * Heap that keeps the smallest value on top.
 */
class MyHeap extends SplHeap
{
    /**
     * Compares two values for heap ordering.
     *
     * SplHeap expects an integer: positive when $a must come out before $b,
     * negative when it must come out after, zero when they are equal. A plain
     * boolean is never negative, so a heap given one would not reorder itself.
     *
     * The attribute below is read as a comment by PHP versions older than 8 and
     * silences the missing-return-type deprecation on PHP 8.1+.
     *
     * @param mixed $a
     * @param mixed $b
     * @return int 1 if $a < $b, -1 if $a > $b, 0 otherwise
     */
    #[\ReturnTypeWillChange]
    public function compare($a, $b)
    {
        if ($a < $b) {
            return 1;
        }
        if ($a > $b) {
            return -1;
        }
        return 0;
    }
}

$heap = new MyHeap();
var_dump($heap->isEmpty());

$heap->insert(1);
var_dump($heap->isEmpty());

$heap->extract();
var_dump($heap->isEmpty());

// isEmpty() takes no argument. Older PHP versions only emit a warning for the
// surplus argument; PHP 8 throws ArgumentCountError, which is reported here
// instead of ending the script with an uncaught error.
try {
    $heap->isEmpty('var');
} catch (ArgumentCountError $e) {
    echo $e->getMessage(), "\n";
}