Ejemplo n.º 1
0
/**
 * Generates a 'Top' file from a sorted log file.
 *
 * The input is read line by line; each line is split on '|'. Lines are expected
 * to be grouped (sorted) by criterion and then by value, because only adjacent
 * lines are compared. For every criterion, the occurrences of each value are
 * counted and the ($topNumber) most frequent values are written as one line:
 *
 *     criterion|value1:n1,value2:n2,...,valueK:nK
 *
 * Lines are separated by "\n"; the last line is written without a trailing
 * newline (this matches the historical output format of this function).
 *
 * Reading stops at the first blank line or at the first line that does not
 * contain exactly ($nbLogParam) fields. If no valid first line can be read,
 * the script is aborted with an "Unexpected EOF" message.
 *
 * Relies on the global color/status strings $red, $blue, $noColor and $OK, and
 * on the MyHeap class, all of which are defined outside this function.
 *
 * @param string $filename       Name of the sorted log file
 * @param string $topFilename    Name of the top file that will be generated
 * @param int    $topNumber      Number of entries kept per criterion ('Top $topNumber')
 * @param int    $nbLogParam     Number of log parameters per line
 * @param int    $criterionIndex Index of the criterion for the top (0: song_id, 1: usr_id, 2: country_code)
 * @param int    $valueIndex     Index of the value that will be counted (0: song_id, 1: usr_id, 2: country_code)
 */
function generateTopFile($filename, $topFilename, $topNumber, $nbLogParam, $criterionIndex, $valueIndex)
{
    global $red, $blue, $noColor, $OK;

    echo $blue . "*** Generating " . $topFilename . " ***" . $noColor . "\n";

    // The sorted log file is opened
    $fh = fopen($filename, 'r');
    if ($fh === false) {
        die($red . "Oops, couldn't open " . $filename . "!" . $noColor . "\n\n");
    }

    // The top file is created
    $fp = fopen($topFilename, 'w');
    if ($fp === false) {
        // Do not leave the input handle dangling when the output cannot be created
        fclose($fh);
        die($red . "Oops, couldn't create a new file!" . $noColor . "\n\n");
    }

    // Heap entries are arrays of the form array($valueID, $counter).
    // MyHeap is defined elsewhere; per its usage here it is expected to behave as a
    // min-heap ordered by counter, then by value ID in case of equality, so that
    // extract() always removes the currently weakest entry.
    $VALUE_INDEX = 0;
    $COUNTER_INDEX = 1;
    $heap = new MyHeap($COUNTER_INDEX, $VALUE_INDEX);

    // Reads the next record. Returns the exploded row, or null when the end of the
    // data has been reached (real EOF, blank line, or unexpected number of fields).
    $readRow = function () use ($fh, $nbLogParam) {
        $line = fgets($fh);
        if ($line === false) {
            // fgets() returns false at EOF; feof() alone only becomes true after
            // a read has already hit the end, so it cannot be trusted beforehand
            return null;
        }
        $line = trim($line);
        if ($line === '') {
            return null;
        }
        $row = explode('|', $line);
        // Loose comparison on purpose: callers may pass $nbLogParam as a numeric string
        return count($row) == $nbLogParam ? $row : null;
    };

    // Inserts a (value, counter) pair and keeps the heap at most $topNumber long
    // by discarding the smallest entry when the limit is exceeded.
    $pushBounded = function ($valueID, $count) use ($heap, $topNumber) {
        $heap->insert(array($valueID, $count));
        if ($heap->count() > $topNumber) {
            $heap->extract();
        }
    };

    // Empties the heap and writes the Top of one criterion (without line terminator).
    // The heap yields entries in ascending order, so they are reversed to get the
    // Top in descending order.
    $writeTop = function ($criterionID) use ($heap, $fp, $VALUE_INDEX, $COUNTER_INDEX) {
        $entries = array();
        while (!$heap->isEmpty()) {
            $data = $heap->extract();
            $entries[] = $data[$VALUE_INDEX] . ":" . $data[$COUNTER_INDEX];
        }
        fwrite($fp, $criterionID . "|" . implode(",", array_reverse($entries)));
    };

    echo "Reading and counting data... ";

    // The first line is read
    $row = $readRow();
    if ($row === null) {
        // File shouldn't be empty at this point; if it is, something wrong must have happened meanwhile
        fclose($fh);
        fclose($fp);
        exit($red . "Unexpected EOF reached... Script aborted!" . $noColor . "\n");
    }

    $currentCriterionID = $row[$criterionIndex]; // ID of the criterion that is currently analyzed
    $currentValueID = $row[$valueIndex];         // ID of the value that is currently counted
    $counter = 1;                                // Occurrences of $currentValueID seen so far

    while (($row = $readRow()) !== null) {
        // Strict comparisons: both sides are strings coming from explode(), and a
        // loose comparison would treat numeric-looking IDs such as "007" and "7"
        // (or "1e3" and "1000") as the same ID.
        if ($currentCriterionID !== $row[$criterionIndex]) {
            // Next criterion detected: store the last value of the current criterion,
            // then write the Top of the current criterion to the file
            $pushBounded($currentValueID, $counter);
            $writeTop($currentCriterionID);
            fwrite($fp, "\n");

            $currentCriterionID = $row[$criterionIndex];
            $currentValueID = $row[$valueIndex];
            $counter = 1;
        } elseif ($currentValueID !== $row[$valueIndex]) {
            // Next value detected within the same criterion: store the finished count
            $pushBounded($currentValueID, $counter);

            $currentValueID = $row[$valueIndex];
            $counter = 1;
        } else {
            // Same criterion and same value: keep counting
            $counter++;
        }
    }

    // Repeat the process for the last Top (no trailing newline, see docblock)
    $pushBounded($currentValueID, $counter);
    $writeTop($currentCriterionID);

    echo $OK . $blue . "*** DONE! ***" . $noColor . "\n";

    unset($heap);
    fclose($fh);
    fclose($fp);
}