<?php

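// Rebuilds lexem frequencies. This is nowhere near perfect, but it's a decent first approximation.
// Gives each lexem a frequency between 0.00 and 1.00.
// Stop words get 1.00. They are selected through Lexem::get_all_by_stopWord(1); this comment
// originally said they are defined in stringUtil.php -- TODO confirm which one is current.
// Other lexems get frequencies distributed uniformly between 0.01 and 1.00 based on their percentile rankings in the full text index.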
require_once __DIR__ . '/../phplib/util.php';

// Long-running batch job: raise the execution time and memory limits.
ini_set('max_execution_time', '3600');
ini_set('memory_limit', '256M');
// Terminate the script as soon as an assert() fails.
assert_options(ASSERT_BAIL, 1);

log_scriptLog('Running rebuildLexemFrequencies.php.');

// Step 1: every lexem flagged as a stop word gets the maximum frequency.
log_scriptLog('Setting frequency to 1.00 for manual stop words');
$lexems = Lexem::get_all_by_stopWord(1);
foreach ($lexems as $l) {
    $l->frequency = 1.0;
    $l->save();
}

// Step 2: rank the lexems that occur in the full text index by their number
// of occurrences (ascending) and turn each rank into a percentile.
log_scriptLog("Scanning full text index");
$dbResult = db_execute("select lexemId from FullTextIndex group by lexemId order by count(*)");

// Collect the ids before touching the Lexem table. Counting the collected
// rows does not depend on rowCount(), which is not guaranteed to be reliable
// for SELECT statements, and it keeps the updates below from running while
// the result set is still being iterated.
$lexemIds = array();
foreach ($dbResult as $row) {
    $lexemIds[] = $row[0];
}
$numLexems = count($lexemIds);

$i = 0;
foreach ($lexemIds as $lexemId) {
    $lexem = Lexem::get_by_id($lexemId);
    if ($lexem) {
        // $i / $numLexems lies in [0, 1); adding 0.005 before rounding to two
        // decimals maps the lowest rank to 0.01 and never exceeds 1.00.
        $lexem->frequency = round($i / $numLexems + 0.005, 2);
        $lexem->save();
    } else {
        // The index references a lexem that could not be loaded; skip it
        // instead of aborting the whole run.
        log_scriptLog("Skipping lexemId {$lexemId}: listed in FullTextIndex but lexem not found");
    }
    // Skipped ids still advance the rank so the remaining percentiles are unaffected.
    $i++;
    if ($i % 10000 == 0) {
        log_scriptLog("{$i} of {$numLexems} labeled");
    }
}

log_scriptLog('rebuildLexemFrequencies.php completed successfully');