<?php // This is nowhere near perfect, but it's a decent approximation for starters. // Gives each lexem a frequency between 0.00 and 1.00 // Stop words defined in stringUtil.php get 1.00 // Other lexems get frequencies distributed uniformly between 0.01 and 1.00 based on their percentile rankings in the full text index. require_once '../phplib/util.php'; ini_set('max_execution_time', '3600'); ini_set('memory_limit', '256M'); assert_options(ASSERT_BAIL, 1); log_scriptLog('Running rebuildLexemFrequencies.php.'); log_scriptLog('Setting frequency to 1.00 for manual stop words'); foreach (StringUtil::$STOPWORDS as $sw) { $lexems = Lexem::get_all_by_formNoAccent($sw); foreach ($lexems as $l) { $l->frequency = 1.0; $l->save(); } } log_scriptLog("Scanning full text index"); $dbResult = db_execute("select lexemId from FullTextIndex group by lexemId order by count(*)"); $numLexems = $dbResult->rowCount(); $i = 0; foreach ($dbResult as $row) { $lexem = Lexem::get_by_id($row[0]); $lexem->frequency = round($i / $numLexems + 0.005, 2); $lexem->save(); $i++; if ($i % 10000 == 0) { log_scriptLog("{$i} of {$numLexems} labeled"); }
$names = preg_split("/[-\\s,\\/()]+/", $lname); foreach ($names as $name) { if ($name == '') { continue; } if (isset($excludeChar) && $name[0] == $excludeChar) { continue; } $name = str_replace("'", '', $name); $name = str_replace("\\", '', $name); if ($verbose) { echo "\t * Process part: '{$name}'\n"; } $lexems = Lexem::get_all_by_form($name); if (!count($lexems)) { $lexems = Lexem::get_all_by_formNoAccent($name); } if ($allowInflected) { if (!count($lexems)) { $lexems = Model::factory('Lexem')->table_alias('l')->select('l.*')->join('LexemModel', 'l.id = lm.lexemId', 'lm')->join('InflectedForm', 'l.id = i.lexemModelId', 'i')->where('i.formNoAccent', $name)->find_many(); if (count($lexems)) { if ($verbose) { echo "\t\tFound inflected form {$name} for lexem {$lexems[0]->id} ({$lexems[0]->form})\n"; } } } } // procedura de refolosire a lexemului sau de regenerare if (count($lexems)) { // Reuse existing lexem. $lexem = $lexems[0];
/**
 * Deletes the lexems that depend on this one and have no definitions of their own.
 *
 * For every inflected form of this lexem with the given inflection, all lexems sharing
 * that accent-free form are inspected. A lexem is removed when it has an acceptable model
 * and every definition mapped to it is also mapped to this lexem.
 *
 * Arguments for participles: 'A', ($adjectiveModel).
 * Arguments for long infinitives: 'F', ('107', '113').
 *
 * @param mixed  $inflId       Id of the inflection whose forms are examined.
 * @param string $modelType    Model type a lexem must have, paired with $modelNumbers,
 *                             to be considered; model type 'T' is always considered.
 * @param array  $modelNumbers Model numbers accepted together with $modelType.
 */
private function _deleteDependentModels($inflId, $modelType, $modelNumbers) {
    // Index this lexem's definition ids by key so membership checks are cheap.
    $sharedDefinitionIds = array();
    foreach (LexemDefinitionMap::get_all_by_lexemId($this->id) as $ownMap) {
        $sharedDefinitionIds[$ownMap->definitionId] = true;
    }

    // Walk every form of the requested inflection, across all of this lexem's models.
    foreach ($this->getLexemModels() as $ownModel) {
        $forms = InflectedForm::get_all_by_lexemModelId_inflectionId($ownModel->id, $inflId);

        foreach ($forms as $form) {
            // Every lexem spelled like this form is a deletion candidate.
            foreach (Lexem::get_all_by_formNoAccent($form->formNoAccent) as $l) {
                // A candidate qualifies if any of its models is of type 'T', or has the
                // requested type together with one of the requested numbers.
                $qualifies = false;
                foreach ($l->getLexemModels() as $candidateModel) {
                    $type = $candidateModel->modelType;
                    if (($type == 'T') ||
                        (($type == $modelType) && in_array($candidateModel->modelNumber, $modelNumbers))) {
                        $qualifies = true;
                        break;
                    }
                }
                if (!$qualifies) {
                    continue;
                }

                // Spare the candidate if it is mapped to any definition this lexem lacks.
                $hasOwnDefinition = false;
                foreach (LexemDefinitionMap::get_all_by_lexemId($l->id) as $candidateMap) {
                    if (!isset($sharedDefinitionIds[$candidateMap->definitionId])) {
                        $hasOwnDefinition = true;
                        break;
                    }
                }
                if ($hasOwnDefinition) {
                    continue;
                }

                FlashMessage::add("Am șters automat lexemul {$l->formNoAccent}.", 'info');
                $l->delete();
            }
        }
    }
}
/**
 * Deletes the lexems that depend on this one and have no definitions of their own.
 *
 * For every inflected form of this lexem with the given inflection, all lexems sharing
 * that accent-free form are inspected. A lexem is removed when its model is acceptable
 * and every definition mapped to it is also mapped to this lexem.
 *
 * Arguments for participles: 'A', ($adjectiveModel).
 * Arguments for long infinitives: 'F', ('107', '113').
 *
 * @param mixed  $inflId       Id of the inflection whose forms are examined.
 * @param string $modelType    Model type a lexem must have, paired with $modelNumbers,
 *                             to be considered; model type 'T' is always considered.
 * @param array  $modelNumbers Model numbers accepted together with $modelType.
 */
private function _deleteDependentModels($inflId, $modelType, $modelNumbers) {
    // Fetch this lexem's forms for the requested inflection.
    $formQuery = Model::factory('InflectedForm')
        ->where('lexemId', $this->id)
        ->where('inflectionId', $inflId);
    $forms = $formQuery->find_many();

    // Index this lexem's definition ids by key so membership checks are cheap.
    $sharedDefinitionIds = array();
    foreach (LexemDefinitionMap::get_all_by_lexemId($this->id) as $ownMap) {
        $sharedDefinitionIds[$ownMap->definitionId] = true;
    }

    foreach ($forms as $form) {
        // Every lexem spelled like this form is a deletion candidate.
        foreach (Lexem::get_all_by_formNoAccent($form->formNoAccent) as $candidate) {
            // Only model type 'T', or the requested type with one of the requested
            // numbers, makes a candidate eligible for deletion.
            $type = $candidate->modelType;
            $eligible = ($type == 'T') ||
                (($type == $modelType) && in_array($candidate->modelNumber, $modelNumbers));
            if (!$eligible) {
                continue;
            }

            // Spare the candidate if it is mapped to any definition this lexem lacks.
            $hasOwnDefinition = false;
            foreach (LexemDefinitionMap::get_all_by_lexemId($candidate->id) as $candidateMap) {
                if (!isset($sharedDefinitionIds[$candidateMap->definitionId])) {
                    $hasOwnDefinition = true;
                    break;
                }
            }

            if (!$hasOwnDefinition) {
                $candidate->delete();
            }
        }
    }
}