-
Notifications
You must be signed in to change notification settings - Fork 0
/
SortingScript.php
153 lines (129 loc) · 5.02 KB
/
SortingScript.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
<?php
include 'MergeFiles.php'; // Script to merge sorted files
/**
 * Sorts a pipe-delimited log file by processing it one chunk of lines at a time.
 *
 * Each line read is trimmed and split on '|'. A line is kept only when it has
 * exactly as many fields as there are entries in $logParamPatterns and every
 * field matches the pattern at the same index; any other line is silently dropped.
 * Every chunk is sorted with mergeSort(), written to "chunk_<n>.log" (a relative
 * path), and the list of chunk files is finally passed to mergeFiles() together
 * with $sortedFilename.
 *
 * In the original use case there are three log parameters per line:
 * 0: song_id
 * 1: usr_id
 * 2: country_code
 * but the number of parameters is taken from count($logParamPatterns), so more
 * than three are accepted.
 *
 * Progress is echoed using the global variables $red, $green, $blue, $noColor,
 * $OK and $INDEXES. The script stops with die() when the input file or a chunk
 * file cannot be opened.
 *
 * @param $filename, name of the log file that will be sorted
 * @param $sortedFilename, name of the sorted file, forwarded to mergeFiles()
 * @param $logParamPatterns, array of regex patterns, one per log parameter (used to detect data corruption)
 * @param $comparator1, the first comparator index (0: song_id, 1: usr_id, 2: country_code)
 * @param $comparator2, the second comparator index (0: song_id, 1: usr_id, 2: country_code)
 */
function sortLogFile($filename, $sortedFilename, $logParamPatterns, $comparator1, $comparator2) {
    global $red, $green, $blue, $noColor, $OK, $INDEXES;

    // Maximum number of input lines consumed per chunk
    // (ideally this would depend on the size of the file to sort)
    $CHUNK_LENGTH = 10000;

    echo $blue."*** Sorting ".$filename." according to ".$INDEXES[$comparator1]." ***".$noColor."\n";

    $input = fopen($filename, 'r');
    if (!$input) {
        die($red."Oops, couldn't open ".$filename."!".$noColor."\n\n");
    }

    $nbLogParam = count($logParamPatterns);
    $chunkNames = array(); // Names of the temporary chunk files, in creation order

    // One iteration per chunk file; $chunkIndex is also the number of chunk files created so far
    for ($chunkIndex = 0; !feof($input); $chunkIndex++) {
        echo "Creating and sorting chunk file n°".$chunkIndex."... ";

        // Collect the valid rows among the next $CHUNK_LENGTH lines
        $rows = array();
        for ($linesRead = 0; $linesRead < $CHUNK_LENGTH && !feof($input); $linesRead++) {
            $fields = explode('|', trim(fgets($input)));

            // Wrong number of fields: the line is considered corrupted
            if (count($fields) != $nbLogParam) {
                continue;
            }

            // Count how many leading fields match their pattern, stopping at the first mismatch
            $matched = 0;
            while ($matched < $nbLogParam
                    && preg_match($logParamPatterns[$matched], $fields[$matched]) == 1) {
                $matched++;
            }

            // Only rows whose fields all matched are kept
            if ($matched == $nbLogParam) {
                $rows[] = $fields;
            }
        }

        $rows = mergeSort($rows, $comparator1, $comparator2);

        // Persist the sorted chunk, one '|'-joined row per line
        $chunkPath = "chunk_".$chunkIndex.".log";
        $output = fopen($chunkPath, 'w');
        if (!$output) {
            die($red."Oops, couldn't create a new file!".$noColor."\n\n");
        }
        foreach ($rows as $sortedRow) {
            fwrite($output, implode('|', $sortedRow)."\n");
        }
        fclose($output);

        $chunkNames[] = $chunkPath;
        unset($rows); // Release the chunk before reading the next one
        echo $OK;
    }
    fclose($input);

    echo $green."Chunk files sorted!".$noColor."\n";

    mergeFiles($sortedFilename, $chunkNames, $nbLogParam, $comparator1, $comparator2);

    echo $blue."*** DONE! ***".$noColor."\n";
}
/**
 * Recursive merge sort over an array of log rows.
 *
 * $data is an array formatted as follows (with $i as an index):
 * list($song_id, $usr_id, $country_code) = $data[$i];
 *
 * The array is split in two halves, each half is sorted recursively, and the
 * two sorted halves are combined with merge(). Arrays of zero or one element
 * are returned unchanged.
 *
 * @param $data, a multidimensional array containing the log data
 * @param $comparator1, the first comparator index (0: song_id, 1: usr_id, 2: country_code)
 * @param $comparator2, the second comparator index (0: song_id, 1: usr_id, 2: country_code)
 * @return array the rows of $data in sorted order
 */
function mergeSort($data, $comparator1, $comparator2) {
    $count = count($data);
    if ($count <= 1) {
        return $data;
    }

    // Explicit integer midpoint: $count / 2 is a float for odd counts, and passing
    // a fractional float to array_slice() is deprecated since PHP 8.1.
    // Truncation yields the same split the implicit conversion produced.
    $mid = (int) ($count / 2);

    $left = mergeSort(array_slice($data, 0, $mid), $comparator1, $comparator2);
    $right = mergeSort(array_slice($data, $mid), $comparator1, $comparator2);

    return merge($left, $right, $comparator1, $comparator2);
}
/**
 * The merging function of the merge sort algorithm.
 *
 * Combines two already-sorted lists of rows into a single sorted list.
 * The sorting mechanism is done according to the first comparator index,
 * and in case of equality, the comparison is made with the second one.
 * When two rows are equal on both indexes, the row from $left is emitted
 * first.
 *
 * Fields are compared with PHP's loose `>` and `==` operators, so two numeric
 * strings are compared as numbers rather than character by character.
 *
 * @param $left, the left part of $data array (from mergeSort function), recursively in the merge sort algorithm process
 * @param $right, the right part of $data array (from mergeSort function), recursively in the merge sort algorithm process
 * @param $comparator1, the first comparator index (0: song_id, 1: usr_id, 2: country_code)
 * @param $comparator2, the second comparator index (0: song_id, 1: usr_id, 2: country_code)
 * @return array the rows of $left and $right merged in sorted order, re-indexed from 0
 */
function merge($left, $right, $comparator1, $comparator2) {
    // Re-index so the positional access below is valid even when a caller
    // passes arrays with non-sequential keys
    $left = array_values($left);
    $right = array_values($right);

    $leftCount = count($left);
    $rightCount = count($right);
    $l = 0; // Cursor in $left
    $r = 0; // Cursor in $right
    $result = array();

    // Advance cursors instead of slicing off the head of each array:
    // array_slice() copies the remainder every time, which made the merge quadratic
    while ($l < $leftCount && $r < $rightCount) {
        $a = $left[$l];
        $b = $right[$r];
        if ($a[$comparator1] > $b[$comparator1] ||
                ($a[$comparator1] == $b[$comparator1] &&
                $a[$comparator2] > $b[$comparator2])) {
            $result[] = $b;
            $r++;
        } else {
            // Ties go to the left side
            $result[] = $a;
            $l++;
        }
    }

    // At most one of the two loops below runs: append whatever is left over
    while ($l < $leftCount) {
        $result[] = $left[$l++];
    }
    while ($r < $rightCount) {
        $result[] = $right[$r++];
    }

    return $result;
}
?>