/**
 * Looks up $word in the index stored in $file and appends every index word
 * that starts with $word, together with its per-document statistics, to
 * $statsList.
 *
 * Each appended entry is an array with the keys:
 *   "word"  - the search term,
 *   "match" - the index word that matched,
 *   "index" - file offset at which the match's statistics are read,
 *   "full"  - true when the match has the same length as $word,
 *   "docs"  - list of arrays with "idx", "freq", "rank", "name" and "url".
 *
 * "rank" is the document's frequency divided by the total frequency of all
 * matches found by this call; frequencies of whole-word matches are counted
 * twice in that total.
 *
 * Relies on computeIndex(), readInt() and readString(), which are defined
 * outside this block.
 *
 * @param resource $file       Open, seekable handle (used with fseek()).
 * @param string   $word       Term to look up.
 * @param array    $statsList  Result list; new entries are appended in place.
 * @return array The updated $statsList.
 */
function search($file, $word, &$statsList)
{
    $index = computeIndex($word);
    if ($index == -1) {
        return $statsList;
    }

    // 4 bytes per entry, skip header
    fseek($file, $index * 4 + 4);
    $index = readInt($file);
    if (!$index) {
        return $statsList;
    }

    $start = count($statsList);
    $count = $start;
    $wordLen = strlen($word);

    // Collect every word in this bucket that has $word as a prefix.
    fseek($file, $index);
    $w = readString($file);
    while ($w) {
        $statIdx = readInt($file);
        // Byte-wise prefix test. A loose == between two strings compares them
        // as numbers when both look numeric (e.g. "0e1" == "0e2"), which
        // would report matches that are not real prefixes.
        if (strncmp($w, $word, $wordLen) === 0) {
            $statsList[$count++] = array(
                "word" => $word,
                "match" => $w,
                "index" => $statIdx,
                "full" => strlen($w) === $wordLen,
                "docs" => array(),
            );
        }
        $w = readString($file);
    }

    $end = count($statsList);

    // Accumulated over ALL matches of this call. It must not be reset per
    // match, otherwise only the last match's frequencies would end up in the
    // denominator used for every rank below.
    $totalFreq = 0;
    for ($s = $start; $s < $end; $s++) {
        // Frequencies of whole-word matches are added twice to the total.
        $weight = $statsList[$s]["full"] ? 2 : 1;

        fseek($file, $statsList[$s]["index"]);
        $numDocs = readInt($file);
        $docs = array();

        // read docs info + occurrence frequency of the word
        for ($i = 0; $i < $numDocs; $i++) {
            $idx = readInt($file);
            $freq = readInt($file);
            $docs[$i] = array("idx" => $idx, "freq" => $freq, "rank" => 0.0);
            $totalFreq += $freq * $weight;
        }

        // read name and url info for the doc
        for ($i = 0; $i < $numDocs; $i++) {
            fseek($file, $docs[$i]["idx"]);
            $docs[$i]["name"] = readString($file);
            $docs[$i]["url"] = readString($file);
        }

        $statsList[$s]["docs"] = $docs;
    }

    // compute frequency rank of the word in each doc
    for ($s = $start; $s < $end; $s++) {
        $numDocs = count($statsList[$s]["docs"]);
        for ($i = 0; $i < $numDocs; $i++) {
            $freq = $statsList[$s]["docs"][$i]["freq"];
            // A zero total would make the division fail; rank such docs 0.
            $statsList[$s]["docs"][$i]["rank"] =
                $totalFreq != 0 ? (float) $freq / $totalFreq : 0.0;
        }
    }

    return $statsList;
}
/**
 * Looks up $word in the index stored in $file and appends every index word
 * that starts with $word, together with its per-document statistics, to
 * $statsList.
 *
 * Each appended entry is an array with the keys:
 *   "word"  - the search term,
 *   "match" - the index word that matched,
 *   "index" - file offset at which the match's statistics are read,
 *   "full"  - true when the match has the same length as $word,
 *   "docs"  - list of arrays with "idx", "freq", "rank", "hi", "name", "url".
 *
 * The frequency value read from the file carries a flag in its lowest bit
 * ("hi"); the remaining bits, shifted down by one, are stored as "freq".
 * Whole-word matches are weighted twice. Documents with the flag set get the
 * accumulated weight of all unflagged documents added to their score, so
 * they rank above unflagged ones.
 *
 * Relies on computeIndex(), readInt() and readString(), which are defined
 * outside this block.
 *
 * @param resource $file       Open, seekable handle (used with fseek()).
 * @param string   $word       Term to look up.
 * @param array    $statsList  Result list; new entries are appended in place.
 * @return array The updated $statsList.
 */
function search($file, $word, &$statsList)
{
    $index = computeIndex($word);
    if ($index == -1) {
        return $statsList;
    }

    // 4 bytes per entry, skip header
    fseek($file, $index * 4 + 4);
    $index = readInt($file);
    if (!$index) {
        return $statsList;
    }

    $start = count($statsList);
    $count = $start;
    $wordLen = strlen($word);

    // Collect every word in this bucket that has $word as a prefix.
    fseek($file, $index);
    $w = readString($file);
    while ($w) {
        $statIdx = readInt($file);
        // Byte-wise prefix test. A loose == between two strings compares them
        // as numbers when both look numeric (e.g. "0e1" == "0e2"), which
        // would report matches that are not real prefixes.
        if (strncmp($w, $word, $wordLen) === 0) {
            $statsList[$count++] = array(
                "word" => $word,
                "match" => $w,
                "index" => $statIdx,
                "full" => strlen($w) === $wordLen,
                "docs" => array(),
            );
        }
        $w = readString($file);
    }

    $end = count($statsList);

    $totalHi = 0;
    $totalFreqHi = 0;
    $totalFreqLo = 0;
    for ($s = $start; $s < $end; $s++) {
        // whole word matches have a double weight
        $multiplier = $statsList[$s]["full"] ? 2 : 1;

        fseek($file, $statsList[$s]["index"]);
        $numDocs = readInt($file);
        $docs = array();

        // read docs info + occurrence frequency of the word
        for ($i = 0; $i < $numDocs; $i++) {
            $idx = readInt($file);
            $freq = readInt($file);
            $isHi = $freq & 1;
            $docs[$i] = array(
                "idx" => $idx,
                "freq" => $freq >> 1,
                "rank" => 0.0,
                "hi" => $isHi,
            );
            // NOTE(review): the totals use the raw value (flag bit included)
            // while the ranks below use the shifted "freq"; kept as is since
            // it only scales the ranks - confirm whether that is intended.
            if ($isHi) {
                $totalHi++;
                $totalFreqHi += $freq * $multiplier;
            } else {
                $totalFreqLo += $freq * $multiplier;
            }
        }

        // read name and url info for the doc
        for ($i = 0; $i < $numDocs; $i++) {
            fseek($file, $docs[$i]["idx"]);
            $docs[$i]["name"] = readString($file);
            $docs[$i]["url"] = readString($file);
        }

        $statsList[$s]["docs"] = $docs;
    }

    $totalFreq = ($totalHi + 1) * $totalFreqLo + $totalFreqHi;

    // compute frequency rank of the word in each doc
    for ($s = $start; $s < $end; $s++) {
        // whole word matches have a double weight
        $multiplier = $statsList[$s]["full"] ? 2 : 1;

        $numDocs = count($statsList[$s]["docs"]);
        for ($i = 0; $i < $numDocs; $i++) {
            $score = $statsList[$s]["docs"][$i]["freq"] * $multiplier;
            if ($statsList[$s]["docs"][$i]["hi"]) {
                // Flagged docs are lifted above all unflagged ones.
                $score += $totalFreqLo;
            }
            // A zero total would make the division fail; rank such docs 0.
            $statsList[$s]["docs"][$i]["rank"] =
                $totalFreq != 0 ? (float) $score / $totalFreq : 0.0;
        }
    }

    return $statsList;
}