} } }
// (The three braces above close scopes opened before this excerpt.)

// Sometimes the extra indicates the part of speech.
// Split $extra on '.' and accept it only when every non-empty piece, with its
// trailing '.' restored, occurs in the internalRep of at least one definition.
if (!$found) {
  // explode() replaces the POSIX-regex split(), which was deprecated in
  // PHP 5.3 and removed in PHP 7.0. The delimiter is a literal dot, so no
  // regular expression is needed and the resulting array is the same.
  $parts = explode('.', $extra);
  // A single element means $extra contained no '.', so there is nothing to check.
  if (count($parts) > 1) {
    $allPartsFound = true;
    foreach ($parts as $part) {
      $part = trim($part);
      if ($part) {
        $part .= '.';
        // Now look this part up in every definition.
        // |= turns the flag into an int; only its truthiness is used below.
        $anyDef = false;
        foreach ($defs as $def) {
          $anyDef |= text_contains($def->internalRep, $part);
        }
        if (!$anyDef) {
          $allPartsFound = false;
        }
      }
    }
    if ($allPartsFound) {
      $found = true;
    }
  }
}

// The extra information is redundant: clear it and persist the lexem.
// (This block is closed after the end of this excerpt.)
if ($found) {
  //print "Removing {$l->form}\t$extra\t$l->extra\n";
  $l->extra = '';
  $l->save();
$lexems[] = $lexem;
} // closes a scope opened before this excerpt

// Now associate every lexem with every definition
foreach ($defs as $defAssoc) {
  foreach ($lexems as $lexemAssoc) {
    LexemDefinitionMap::associate($lexemAssoc->id, $defAssoc->id);
  }
}
} // closes a scope opened before this excerpt

// For every definition, look at the text between the first two '@' characters
// of internalRep. If that text has no hyphen and no space, and it equals
// $normText with the hyphens removed (after accent removal and lowercasing),
// re-insert the hyphens at the positions they have in $normText.
foreach ($defs as $fixDef) {
  $fixFirstAt = strpos($fixDef->internalRep, '@');
  $fixSecondAt = strpos($fixDef->internalRep, '@', $fixFirstAt + 1);
  // The representation is expected to start with '@' and contain a second '@'.
  assert($fixFirstAt === 0);
  assert($fixSecondAt !== false);
  // Text strictly between the two '@' characters, trimmed.
  $fixText = trim(substr($fixDef->internalRep, $fixFirstAt + 1, $fixSecondAt - $fixFirstAt - 1));
  if (!text_contains($fixText, '-') && !text_contains($fixText, ' ') && str_replace('-', '', $normText) == text_unicodeToLower(text_removeAccents($fixText))) {
    // Walk through every '-' in $normText and insert one at the same offset in $fixText.
    // NOTE(review): assumes text_insert() takes a character offset compatible
    // with mb_strpos() -- confirm against its definition.
    $prevPos = 0;
    while (($pos = mb_strpos($normText, '-', $prevPos)) !== false) {
      $fixText = text_insert($fixText, '-', $pos);
      $prevPos = $pos + 1;
    }
    // Splice the hyphenated text back between the two '@' characters. Any
    // whitespace removed by trim() above is not restored.
    $fixDef->internalRep = substr($fixDef->internalRep, 0, $fixFirstAt + 1) . $fixText . substr($fixDef->internalRep, $fixSecondAt);
    $fixDef->htmlRep = text_htmlize($fixDef->internalRep);
    $fixDef->save();
    print " [{$fixDef->internalRep}]\n";
  }
}

// Count the split, remove the original lexem $l and flag that a hyphenation was handled.
$split++;
$l->delete();
$foundHyphenation = true;
} // closes a scope opened before this excerpt
// Tail of an inner loop body (the loop header is above this excerpt).
// Each pass looks at one accented form and checks that the accent position and
// the accent-stripped form agree with those seen on earlier passes.
if ($accentCount) {
  $newPos = mb_strpos($accented, "'");
  if ($position !== false && $position != $newPos) {
    // Accent positions disagree: give up on this lexem.
    // Fixed: this was "$break;", a no-op read of an undefined variable, so
    // the reset below was immediately overwritten by "$position = $newPos".
    $position = false;
    break;
  }
  $position = $newPos;
}
$newForm = str_replace("'", "", $accented);
if ($form !== false && $form != $newForm) {
  // Accent-stripped forms disagree: give up on this lexem.
  // Fixed: was "$break;" (same no-op as above).
  $position = false;
  break;
}
$form = $newForm;
} // end of the inner loop opened before this excerpt

// The lexem itself must not already carry an accent mark.
assert(!text_contains($lexem->form, "'"));
if ($position !== false) {
  // Only two families are repaired:
  //   - form == $form . "re" with model F107 or F113
  //   - form == $form . "t"  with model A2
  // (&& binds tighter than ||, so the condition reads as those two alternatives.)
  if ($lexem->form == $form . "re" && $lexem->modelType == 'F' && ($lexem->modelNumber == 107 || $lexem->modelNumber == 113) || $lexem->form == $form . "t" && $lexem->modelType == 'A' && $lexem->modelNumber == 2) {
    // Insert the accent mark at the agreed position, then persist and rebuild the paradigm.
    $lexem->form = mb_substr($lexem->form, 0, $position) . "'" . mb_substr($lexem->form, $position);
    //print "[{$lexem->form}] [$form] [$position]\n";
    $lexem->save();
    $lexem->regenerateParadigm();
    $fixed++;
  }
}

// Progress report every 1000 lexems.
$seen++;
if ($seen % 1000 == 0) {
  print "Seen: {$seen} lexems, fixed: {$fixed}\n";
}
} // end of the outer loop opened before this excerpt

print "Seen: {$seen} lexems, fixed: {$fixed}\n";
// Decide what to do with lexem $l based on its definitions $defs:
//   $matchingLexicon - some definition's lexicon equals the lexem's unaccented
//                      form (treating 'î' and 'â' as the same letter)
//   $biology         - some definition contains a word listed in $biologyTerms
//   $appears         - the unaccented form occurs as a word in some definition
$matchingLexicon = false;
$biology = false;
$appears = false;
foreach ($defs as $def) {
  if (str_replace('î', 'â', $def->lexicon) == str_replace('î', 'â', $l->unaccented)) {
    $matchingLexicon = true;
  }
  $rep = text_unicodeToLower($def->internalRep);
  // Drop the characters $ @ % . , ( ) ; : before tokenizing. A single ''
  // replacement is applied to every search value, same as nine empty strings.
  $rep = str_replace(array('$', '@', '%', '.', ',', '(', ')', ';', ':'), '', $rep);
  // preg_split() replaces the POSIX-regex split(), which was deprecated in
  // PHP 5.3 and removed in PHP 7.0. The character class is unchanged: split
  // on a single space, newline or tab.
  $words = preg_split("/[ \n\t]/", $rep);
  foreach ($words as $word) {
    // |= keeps the flags truthy once set (they become ints, used only as booleans).
    $biology |= in_array($word, $biologyTerms);
    $appears |= $l->unaccented == $word;
  }
}

// Act only on single-word lexems that have no matching lexicon and whose
// definitions mention a biology term.
if (!$matchingLexicon && !text_contains($l->form, ' ') && $biology) {
  if ($appears) {
    // The form is used as a word inside a definition: keep it, but move it
    // to model I2 with no restriction and no accent, then rebuild its paradigm.
    print "Changing {$l->id} {$l->form} to I2\n";
    $l->modelType = 'I';
    $l->modelNumber = '2';
    $l->restriction = '';
    $l->noAccent = true;
    $l->save();
    $l->regenerateParadigm();
  } else {
    print "DELETING {$l->id} {$l->form}\n";
    $l->delete();
  }
  // Counts both converted and deleted lexems.
  $removed++;
}
} // closes a scope opened before this excerpt
// Test a single model: skip model types MF and VT, skip the temporary model,
// and run only for model id 130.
// NOTE(review): the hard-coded "== 130" restricts the run to one model; it
// looks like a debugging filter -- confirm whether it should remain.
if ($model->modelType != 'MF' && $model->modelType != 'VT' && $model->id != $tempModel->id && $model->id == 130) {
  print "Testing model " . $model->getName() . " (id = " . $model->id . ")\n";
  $lexems = Lexem::loadByModelId($model->id);
  // Maps inflection id => transforms; filled from the first usable lexem.
  $transfMap = array();
  foreach ($lexems as $lexem) {
    if (array_key_exists($lexem->id, $knownBadLexems)) {
      print " Skipping known bad lexem " . $lexem->id . " (" . $lexem->unaccented . ")\n";
      continue;
    }
    // Only lexems flagged isLoc are considered.
    if (!$lexem->isLoc) {
      continue;
    }
    $wls = WordList::loadByLexemId($lexem->id);
    // Ignore the lexem as soon as one of its word forms starts with a byte of
    // value 0 (ord() also yields 0 for an empty string) or contains an
    // apostrophe; the loop stops at the first such form.
    $ignore = false;
    for ($i = 0; $i < count($wls) && !$ignore; $i++) {
      $ignore = ord($wls[$i]->form) == 0 || text_contains($wls[$i]->form, "'");
    }
    if ($ignore) {
      print " Ignoring lexem " . $lexem->id . " (" . $lexem->unaccented . "): paradigm contains accents or null characters\n";
    }
    if (!$ignore) {
      if (!count($transfMap)) {
        // Create model_descriptions by comparing the first lexem to its
        // existing wordlists.
        print " Using lexem '" . $lexem->unaccented . "' as exponent\n";
        foreach ($wls as $wl) {
          $transforms = text_extractTransforms($lexem->unaccented, $wl->form);
          $transfMap[$wl->inflectionId] = $transforms;
        }
        // Dump the transformation table
        // (The blocks opened above continue past the end of this excerpt.)
        print " Transforms:\n";
/**
 * Saves a lexem, honoring two hand-maintained tables.
 *
 * 1. Skip table: when the lexem's unaccented form, model type and model number
 *    match an entry (and, if the entry names a marker, the lexem's extra field
 *    contains it), the function returns without saving.
 * 2. Override table: when the lexem's form and model type match an entry, the
 *    lexem is optionally renamed (recomputing unaccented and reverse) and its
 *    model type, model number and restriction are overwritten.
 *
 * Every lexem not caught by the skip table is persisted through $l->save().
 *
 * @param object $l Lexem-like object exposing form, unaccented, reverse, extra,
 *                  modelType, modelNumber, restriction and a save() method.
 * @return void
 */
function saveLexemWithExceptions($l) {
  // Columns: unaccented form, model type, model number,
  // marker that must appear in extra ('' means no marker is required).
  $skipTable = array(
    array('administratoare', 'MF', '66', ''),
    array('ăllalt', 'P', '23', ''),
    array('ălalalt', 'P', '23', ''),
    array('beși', 'V', '319', ''),
    array('câtea', 'F', '151', ''),
    array('celalalt', 'P', '31', ''),
    array('cellalt', 'P', '31', ''),
    array('greață', 'F', '38', ''),
    array('istalalt', 'P', '63', ''),
    array('istălalt', 'P', '63', ''),
    array('înși', 'MF', '4', ''),
    array('mielea', 'P', '74', ''),
    array('oară', 'MF', '28', ''),
    array('oare', 'MF', '66', ''),
    array('scrabble', 'N', '76', ''),
    array('voi', 'N', '67', 'pron.'),
  );

  foreach ($skipTable as $entry) {
    list($skipForm, $skipType, $skipNumber, $skipMarker) = $entry;
    if ($l->unaccented != $skipForm || $l->modelType != $skipType || $l->modelNumber != $skipNumber) {
      continue;
    }
    if (!$skipMarker || text_contains($l->extra, $skipMarker)) {
      // Known exception: leave the lexem unsaved.
      return;
    }
  }

  // Columns: form to match, model type to match, replacement form ('' keeps
  // the current form), new model type, new model number, new restriction.
  $overrideTable = array(
    array("at'ât", 'I', "at'âta", 'P', '20', ''),
    array("fi", 'I', '', 'V', '339', ''),
    array("la", 'I', '', 'VT', '99', ''),
    array("mult", 'I', '', 'P', '76', ''),
    array("r'umpe", 'VT', '', 'VT', "657'", ''),
    array("cor'umpe", 'VT', '', 'VT', "657'", ''),
    array("întrer'umpe", 'VT', '', 'VT', "657'", ''),
    array("ir'umpe", 'V', '', 'V', "657'", ''),
    array("tot", 'I', '', 'P', '98', ''),
  );

  foreach ($overrideTable as $entry) {
    list($matchForm, $matchType, $newForm, $newType, $newNumber, $newRestriction) = $entry;
    if ($l->form != $matchForm || $l->modelType != $matchType) {
      continue;
    }
    if ($newForm) {
      // Rename the lexem and keep its derived fields in sync.
      $l->form = $newForm;
      $l->unaccented = str_replace("'", '', $l->form);
      $l->reverse = text_reverse($l->unaccented);
    }
    $l->modelType = $newType;
    $l->modelNumber = $newNumber;
    $l->restriction = $newRestriction;
  }

  $l->save();
}
// Load every form of the current model from dmlr_models, ordered by inflection id.
// NOTE(review): model_no goes through addslashes() but model_type is
// interpolated unescaped -- confirm that modelType can only hold trusted values.
$query = "select form, infl_id, variant, is_baseform from dmlr_models " . "where model_type = '{$model->modelType}' " . "and model_no = '" . addslashes($model->number) . "' order by infl_id";
$dmlrDbResult = logged_query($query);
$results = db_getArray($dmlrDbResult);
$baseForm = null;
// (This loop continues past the end of the excerpt.)
foreach ($results as $row) {
  $form = $row['form'];
  $variant = $row['variant'];
  $inflId = $row['infl_id'];
  $isBaseForm = $row['is_baseform'];
  // A row flagged as base form after a base form has already been chosen is fatal.
  if ($baseForm && $isBaseForm) {
    die("Incorrect baseform for {$model->modelType}{$model->number}\n");
  }
  // The first row's form becomes the base form.
  // NOTE(review): $isBaseForm is not checked here, so the first row is used
  // even when it is not flagged as the base form -- confirm this is intended.
  if (!$baseForm) {
    $baseForm = $form;
  }
  // Warn when exactly one of the base form and the current form carries an accent mark (').
  if (text_contains($baseForm, "'") ^ text_contains($form, "'")) {
    print "Incomplete accents for {$baseForm} => {$form}\n";
  }
  //print "$baseForm=>$form\n";
  // Abort on any character outside the listed lowercase alphabet plus the accent mark.
  if (!text_validateAlphabet($form, "aăâbcdefghiîjklmnopqrsștțuvwxyz'")) {
    die("Illegal characters in form {$form}\n");
  }
  // The third argument is true only for model type 'P'.
  $transforms = text_extractTransforms($baseForm, $form, $model->modelType == 'P');
  assert(count($transforms) >= 2);
  // Split off the last transform: it indicates the accent shift
  $accentShift = array_pop($transforms);
  // Unless the shift is UNKNOWN_ACCENT_SHIFT or NO_ACCENT_SHIFT, the element
  // before it is popped as the accented vowel; otherwise that vowel is empty.
  if ($accentShift != UNKNOWN_ACCENT_SHIFT && $accentShift != NO_ACCENT_SHIFT) {
    $accentedVowel = array_pop($transforms);
  } else {
    $accentedVowel = '';
  }