<?php

// Recomputes the `frequency` field of lexems in two passes: stop words first,
// then every lexem referenced by the full text index.
// This is nowhere near perfect, but it's a decent approximation for starters.
// Gives each lexem a frequency between 0.00 and 1.00
// Stop words defined in stringUtil.php get 1.00
// Other lexems get frequencies distributed uniformly between 0.01 and 1.00 based on their percentile rankings in the full text index.
require_once '../phplib/util.php';
// Raise the execution time and memory limits for the duration of this script.
ini_set('max_execution_time', '3600');
ini_set('memory_limit', '256M');
// Make a failed assertion terminate the script.
assert_options(ASSERT_BAIL, 1);
log_scriptLog('Running rebuildLexemFrequencies.php.');
log_scriptLog('Setting frequency to 1.00 for manual stop words');
// Pass 1: every lexem whose unaccented form equals a stop word gets the maximum frequency.
foreach (StringUtil::$STOPWORDS as $sw) {
    $lexems = Lexem::get_all_by_formNoAccent($sw);
    foreach ($lexems as $l) {
        $l->frequency = 1.0;
        $l->save();
    }
}
log_scriptLog("Scanning full text index");
// Pass 2: one row per lexemId, ordered by ascending number of FullTextIndex rows,
// so the loop counter $i below is the lexem's rank (least referenced first).
$dbResult = db_execute("select lexemId from FullTextIndex group by lexemId order by count(*)");
$numLexems = $dbResult->rowCount();
$i = 0;
// NOTE(review): this pass saves a frequency for every lexemId in FullTextIndex without
// checking for stop words; if stop-word lexems appear in the index, the 1.00 assigned
// above is overwritten here - confirm whether that is intended.
foreach ($dbResult as $row) {
    // NOTE(review): presumably every lexemId in the index still resolves to a lexem;
    // the result of get_by_id() is used without a check - confirm.
    $lexem = Lexem::get_by_id($row[0]);
    // Rank divided by the total gives the percentile; 0.005 is added before rounding to two decimals.
    $lexem->frequency = round($i / $numLexems + 0.005, 2);
    $lexem->save();
    $i++;
    // Log progress every 10000 lexems.
    if ($i % 10000 == 0) {
        log_scriptLog("{$i} of {$numLexems} labeled");
    }
Ejemplo n.º 2
0
 // Split $lname into parts on runs of hyphens, whitespace, commas, slashes and parentheses.
 $names = preg_split("/[-\\s,\\/()]+/", $lname);
 foreach ($names as $name) {
     // Skip empty parts.
     if ($name == '') {
         continue;
     }
     // Skip parts whose first character equals $excludeChar, when that variable is set.
     if (isset($excludeChar) && $name[0] == $excludeChar) {
         continue;
     }
     // Strip apostrophes and backslashes from the part.
     $name = str_replace("'", '', $name);
     $name = str_replace("\\", '', $name);
     if ($verbose) {
         echo "\t * Process part: '{$name}'\n";
     }
     // Look the part up by its form first, then fall back to the unaccented form.
     $lexems = Lexem::get_all_by_form($name);
     if (!count($lexems)) {
         $lexems = Lexem::get_all_by_formNoAccent($name);
     }
     if ($allowInflected) {
         // Last resort: find lexems that have an inflected form whose unaccented text equals the part.
         if (!count($lexems)) {
             // NOTE(review): the InflectedForm join compares l.id with i.lexemModelId and never
             // references the lm alias; 'lm.id = i.lexemModelId' may have been intended - confirm.
             $lexems = Model::factory('Lexem')->table_alias('l')->select('l.*')->join('LexemModel', 'l.id = lm.lexemId', 'lm')->join('InflectedForm', 'l.id = i.lexemModelId', 'i')->where('i.formNoAccent', $name)->find_many();
             if (count($lexems)) {
                 if ($verbose) {
                     echo "\t\tFound inflected form {$name} for lexem {$lexems[0]->id} ({$lexems[0]->form})\n";
                 }
             }
         }
     }
     // Procedure for reusing the lexem or regenerating it
     if (count($lexems)) {
         // Reuse existing lexem.
         $lexem = $lexems[0];
Ejemplo n.º 3
0
 /**
  * Deletes lexems derived from this one that carry no definitions of their own.
  *
  * For every inflected form of this lexem with the given inflection, each lexem
  * whose unaccented form matches is examined. It is deleted (and a flash
  * message is added) when it has an acceptable model and every definition
  * mapped to it is also mapped to this lexem.
  *
  * Arguments for participles: 'A', ($adjectiveModel).
  * Arguments for long infinitives: 'F', ('107', '113').
  *
  * @param mixed  $inflId       Inflection id used to select this lexem's inflected forms.
  * @param string $modelType    Model type a candidate's lexem model must have (type 'T' is always accepted).
  * @param array  $modelNumbers Model numbers accepted together with $modelType.
  */
 private function _deleteDependentModels($inflId, $modelType, $modelNumbers)
 {
     // Set of definition ids mapped to this lexem, keyed by id for direct lookup.
     $sharedDefinitionIds = array();
     foreach (LexemDefinitionMap::get_all_by_lexemId($this->id) as $ownMap) {
         $sharedDefinitionIds[$ownMap->definitionId] = true;
     }

     // Walk every form of the requested inflection across all of this lexem's models.
     foreach ($this->getLexemModels() as $lexemModel) {
         $forms = InflectedForm::get_all_by_lexemModelId_inflectionId($lexemModel->id, $inflId);
         foreach ($forms as $form) {
             foreach (Lexem::get_all_by_formNoAccent($form->formNoAccent) as $l) {
                 // A candidate qualifies when any of its models has type 'T', or has
                 // the requested type together with one of the requested numbers.
                 $acceptable = false;
                 foreach ($l->getLexemModels() as $candidateModel) {
                     if ($candidateModel->modelType == 'T'
                         || ($candidateModel->modelType == $modelType
                             && in_array($candidateModel->modelNumber, $modelNumbers))
                     ) {
                         $acceptable = true;
                         break;
                     }
                 }
                 if (!$acceptable) {
                     continue;
                 }

                 // A single definition not shared with this lexem is enough to keep the candidate.
                 $hasOwnDefinitions = false;
                 foreach (LexemDefinitionMap::get_all_by_lexemId($l->id) as $candidateMap) {
                     if (!isset($sharedDefinitionIds[$candidateMap->definitionId])) {
                         $hasOwnDefinitions = true;
                         break;
                     }
                 }
                 if ($hasOwnDefinitions) {
                     continue;
                 }

                 FlashMessage::add("Am șters automat lexemul {$l->formNoAccent}.", 'info');
                 $l->delete();
             }
         }
     }
 }
Ejemplo n.º 4
0
 /**
  * Deletes lexems derived from this one that carry no definitions of their own.
  *
  * For every inflected form of this lexem with the given inflection, each lexem
  * whose unaccented form matches is examined. It is deleted when its model is
  * acceptable and every definition mapped to it is also mapped to this lexem.
  *
  * Arguments for participles: 'A', ($adjectiveModel).
  * Arguments for long infinitives: 'F', ('107', '113').
  *
  * @param mixed  $inflId       Inflection id used to select this lexem's inflected forms.
  * @param string $modelType    Model type a candidate lexem must have (type 'T' is always accepted).
  * @param array  $modelNumbers Model numbers accepted together with $modelType.
  */
 private function _deleteDependentModels($inflId, $modelType, $modelNumbers)
 {
     // Forms of this lexem for the requested inflection.
     $forms = Model::factory('InflectedForm')
         ->where('lexemId', $this->id)
         ->where('inflectionId', $inflId)
         ->find_many();

     // Set of definition ids mapped to this lexem, keyed by id for direct lookup.
     $sharedDefinitionIds = array();
     foreach (LexemDefinitionMap::get_all_by_lexemId($this->id) as $ownMap) {
         $sharedDefinitionIds[$ownMap->definitionId] = true;
     }

     foreach ($forms as $form) {
         foreach (Lexem::get_all_by_formNoAccent($form->formNoAccent) as $candidate) {
             // Only lexems of type 'T', or of the requested type and numbers, may be deleted.
             $eligible = $candidate->modelType == 'T'
                 || ($candidate->modelType == $modelType
                     && in_array($candidate->modelNumber, $modelNumbers));
             if (!$eligible) {
                 continue;
             }

             // A single definition not shared with this lexem is enough to keep the candidate.
             $hasOwnDefinitions = false;
             foreach (LexemDefinitionMap::get_all_by_lexemId($candidate->id) as $candidateMap) {
                 if (!isset($sharedDefinitionIds[$candidateMap->definitionId])) {
                     $hasOwnDefinitions = true;
                     break;
                 }
             }

             if (!$hasOwnDefinitions) {
                 $candidate->delete();
             }
         }
     }
 }