Beispiel #1
0
/**
 * Looks up $word in an already opened search index file and appends one entry
 * per matching indexed word to $statsList.
 *
 * An indexed word matches when it starts with $word. Each appended entry is an
 * array with the keys "word" (the query), "match" (the indexed word), "index"
 * (file offset of its statistics), "full" (true when the match has the same
 * length as the query) and "docs" (list of arrays with "idx", "freq", "rank",
 * "name" and "url").
 *
 * The "rank" of a document is its frequency divided by the total frequency of
 * all entries added by this call; frequencies of whole-word matches are counted
 * twice in that total.
 *
 * @param resource $file      Open, seekable handle of the index file.
 * @param string   $word      Word (prefix) to search for.
 * @param array    $statsList Result list; new entries are added starting at
 *                            position sizeof($statsList).
 *
 * @return array Copy of $statsList after the new entries were added.
 */
function search($file, $word, &$statsList)
{
    $index = computeIndex($word);
    if ($index == -1) {
        return $statsList;
    }

    // 4 bytes per entry, skip header
    fseek($file, $index * 4 + 4);
    $index = readInt($file);
    if (!$index) {
        // no word list stored for this index slot
        return $statsList;
    }

    $start = sizeof($statsList);
    $count = $start;
    $wordLen = strlen($word);

    // Walk the word list: (string, statistics offset) pairs until an empty string.
    fseek($file, $index);
    $w = readString($file);
    // "0" is falsy in PHP but is a legitimate indexed word, so it must not end the list.
    while ($w || $w === "0") {
        $statIdx = readInt($file);
        // strncmp() compares bytes; a loose == would treat numeric strings such
        // as "1e1" and "010" as equal.
        if (strncmp($w, $word, $wordLen) === 0) {
            // found word that matches (as prefix)
            $statsList[$count++] = array(
                "word" => $word,
                "match" => $w,
                "index" => $statIdx,
                "full" => strlen($w) == $wordLen,
                "docs" => array(),
            );
        }
        $w = readString($file);
    }

    // The total is accumulated over ALL newly added entries so that the ranks
    // computed below share one common denominator.
    $totalFreq = 0;
    for ($count = $start; $count < sizeof($statsList); $count++) {
        fseek($file, $statsList[$count]["index"]);
        $numDocs = readInt($file);
        $isFull = $statsList[$count]["full"];
        $docInfo = array();
        // read docs info + occurrence frequency of the word
        for ($i = 0; $i < $numDocs; $i++) {
            $idx = readInt($file);
            $freq = readInt($file);
            $docInfo[$i] = array("idx" => $idx, "freq" => $freq, "rank" => 0.0);
            // whole word matches are counted twice
            $totalFreq += $isFull ? 2 * $freq : $freq;
        }
        // read name and url info for the doc
        for ($i = 0; $i < $numDocs; $i++) {
            fseek($file, $docInfo[$i]["idx"]);
            $docInfo[$i]["name"] = readString($file);
            $docInfo[$i]["url"] = readString($file);
        }
        $statsList[$count]["docs"] = $docInfo;
    }

    // compute frequency rank of the word in each doc
    for ($count = $start; $count < sizeof($statsList); $count++) {
        $numDocs = sizeof($statsList[$count]["docs"]);
        for ($i = 0; $i < $numDocs; $i++) {
            $freq = $statsList[$count]["docs"][$i]["freq"];
            // a zero total would raise a division by zero; rank such docs 0.0
            $statsList[$count]["docs"][$i]["rank"] = $totalFreq != 0
                ? (float) $freq / $totalFreq
                : 0.0;
        }
    }

    return $statsList;
}
Beispiel #2
0
/**
 * Looks up $word in an already opened search index file and appends one entry
 * per matching indexed word to $statsList.
 *
 * An indexed word matches when it starts with $word. Each appended entry is an
 * array with the keys "word" (the query), "match" (the indexed word), "index"
 * (file offset of its statistics), "full" (true when the match has the same
 * length as the query) and "docs" (list of arrays with "idx", "freq", "rank",
 * "hi", "name" and "url").
 *
 * The stored per-document value carries a flag in its lowest bit: the flag is
 * kept as "hi" and the remaining bits (value >> 1) as "freq". Whole-word
 * matches are weighted twice. Documents with the "hi" flag get the summed
 * weight of all non-"hi" documents added to their rank numerator, which places
 * them above every non-"hi" document.
 *
 * @param resource $file      Open, seekable handle of the index file.
 * @param string   $word      Word (prefix) to search for.
 * @param array    $statsList Result list; new entries are added starting at
 *                            position sizeof($statsList).
 *
 * @return array Copy of $statsList after the new entries were added.
 */
function search($file, $word, &$statsList)
{
    $index = computeIndex($word);
    if ($index == -1) {
        return $statsList;
    }

    // 4 bytes per entry, skip header
    fseek($file, $index * 4 + 4);
    $index = readInt($file);
    if (!$index) {
        // no word list stored for this index slot
        return $statsList;
    }

    $start = sizeof($statsList);
    $count = $start;
    $wordLen = strlen($word);

    // Walk the word list: (string, statistics offset) pairs until an empty string.
    fseek($file, $index);
    $w = readString($file);
    // "0" is falsy in PHP but is a legitimate indexed word, so it must not end the list.
    while ($w || $w === "0") {
        $statIdx = readInt($file);
        // strncmp() compares bytes; a loose == would treat numeric strings such
        // as "1e1" and "010" as equal.
        if (strncmp($w, $word, $wordLen) === 0) {
            // found word that matches (as prefix)
            $statsList[$count++] = array(
                "word" => $word,
                "match" => $w,
                "index" => $statIdx,
                "full" => strlen($w) == $wordLen,
                "docs" => array(),
            );
        }
        $w = readString($file);
    }

    $totalHi = 0;
    $totalFreqHi = 0;
    $totalFreqLo = 0;
    for ($count = $start; $count < sizeof($statsList); $count++) {
        // whole word matches have a double weight
        $multiplier = $statsList[$count]["full"] ? 2 : 1;
        fseek($file, $statsList[$count]["index"]);
        $numDocs = readInt($file);
        $docInfo = array();
        // read docs info + occurrence frequency of the word
        for ($i = 0; $i < $numDocs; $i++) {
            $idx = readInt($file);
            $freq = readInt($file);
            $isHi = $freq & 1;
            $docInfo[$i] = array("idx" => $idx, "freq" => $freq >> 1, "rank" => 0.0, "hi" => $isHi);
            // NOTE(review): the totals use the raw stored value (flag bit
            // included), unlike the "freq" field; kept unchanged on purpose.
            if ($isHi) {
                $totalHi++;
                $totalFreqHi += $freq * $multiplier;
            } else {
                $totalFreqLo += $freq * $multiplier;
            }
        }
        // read name and url info for the doc
        for ($i = 0; $i < $numDocs; $i++) {
            fseek($file, $docInfo[$i]["idx"]);
            $docInfo[$i]["name"] = readString($file);
            $docInfo[$i]["url"] = readString($file);
        }
        $statsList[$count]["docs"] = $docInfo;
    }

    // common denominator for every rank computed below
    $totalFreq = ($totalHi + 1) * $totalFreqLo + $totalFreqHi;

    // compute frequency rank of the word in each doc
    for ($count = $start; $count < sizeof($statsList); $count++) {
        // whole word matches have a double weight
        $multiplier = $statsList[$count]["full"] ? 2 : 1;
        $numDocs = sizeof($statsList[$count]["docs"]);
        for ($i = 0; $i < $numDocs; $i++) {
            $weight = $statsList[$count]["docs"][$i]["freq"] * $multiplier;
            if ($statsList[$count]["docs"][$i]["hi"]) {
                $weight += $totalFreqLo;
            }
            // a zero total would raise a division by zero; rank such docs 0.0
            $statsList[$count]["docs"][$i]["rank"] = $totalFreq != 0
                ? (float) $weight / $totalFreq
                : 0.0;
        }
    }

    return $statsList;
}