Esempio n. 1
0
         }
     }
 }
 // Sometimes the extra indicates the part of speech: treat $extra as a
 // dot-separated list of pieces and mark the lexem as found only when every
 // non-empty piece (with its trailing dot restored) occurs in at least one
 // definition's internalRep.
 if (!$found) {
     // explode() on a literal dot replaces split('\\.', ...): the POSIX-regex
     // split() was deprecated in PHP 5.3 and removed in PHP 7.0, and the
     // pattern only ever matched a literal dot.
     $parts = explode('.', $extra);
     if (count($parts) > 1) {
         $allPartsFound = true;
         foreach ($parts as $part) {
             $part = trim($part);
             if ($part) {
                 // Put back the dot that the split consumed.
                 $part .= '.';
                 // Now look this part up in every definition.
                 $anyDef = false;
                 foreach ($defs as $def) {
                     $anyDef |= text_contains($def->internalRep, $part);
                 }
                 if (!$anyDef) {
                     $allPartsFound = false;
                 }
             }
         }
         if ($allPartsFound) {
             $found = true;
         }
     }
 }
 if ($found) {
     //print "Removing {$l->form}\t$extra\t$l->extra\n";
     $l->extra = '';
     $l->save();
Esempio n. 2
0
             $lexems[] = $lexem;
         }
         // Now associate every lexem with every definition
         foreach ($defs as $defAssoc) {
             foreach ($lexems as $lexemAssoc) {
                 LexemDefinitionMap::associate($lexemAssoc->id, $defAssoc->id);
             }
         }
     }
     // For every definition, take the text between the first two '@' markers
     // of internalRep and, when it equals $normText apart from the hyphens,
     // re-insert those hyphens and persist the definition.
     // NOTE(review): $normText is set outside this view -- presumably the
     // lowercased, accent-free hyphenated form; confirm.
     foreach ($defs as $fixDef) {
         $fixFirstAt = strpos($fixDef->internalRep, '@');
         $fixSecondAt = strpos($fixDef->internalRep, '@', $fixFirstAt + 1);
         // The representation is expected to open with an '@...@' span.
         assert($fixFirstAt === 0);
         assert($fixSecondAt !== false);
         // Text strictly between the two '@' markers, trimmed.
         $fixText = trim(substr($fixDef->internalRep, $fixFirstAt + 1, $fixSecondAt - $fixFirstAt - 1));
         // Only touch texts that contain no hyphen and no space, and whose
         // lowercased, accent-stripped form equals $normText without hyphens.
         if (!text_contains($fixText, '-') && !text_contains($fixText, ' ') && str_replace('-', '', $normText) == text_unicodeToLower(text_removeAccents($fixText))) {
             $prevPos = 0;
             // Walk the hyphens of $normText left to right and pass each
             // character offset to text_insert() so $fixText gets a hyphen too.
             // NOTE(review): text_insert() is not visible here; presumably it
             // inserts the string at the given character offset -- confirm.
             while (($pos = mb_strpos($normText, '-', $prevPos)) !== false) {
                 $fixText = text_insert($fixText, '-', $pos);
                 $prevPos = $pos + 1;
             }
             // Splice the hyphenated text back between the two '@' markers,
             // refresh the HTML representation and save.
             $fixDef->internalRep = substr($fixDef->internalRep, 0, $fixFirstAt + 1) . $fixText . substr($fixDef->internalRep, $fixSecondAt);
             $fixDef->htmlRep = text_htmlize($fixDef->internalRep);
             $fixDef->save();
             print "    [{$fixDef->internalRep}]\n";
         }
     }
     $split++;
     $l->delete();
     $foundHyphenation = true;
 }
Esempio n. 3
0
        // When $accentCount is truthy, locate the first apostrophe in
        // $accented and compare its character offset with the one remembered
        // in $position.
        if ($accentCount) {
            $newPos = mb_strpos($accented, "'");
            if ($position !== false && $position != $newPos) {
                $position = false;
                // NOTE(review): `$break;` merely evaluates a variable named
                // $break; it is not a `break` statement. Execution continues
                // and the assignment right after this if overwrites the false
                // just stored in $position. `break;` was probably intended --
                // confirm against the enclosing loop, which is outside this
                // view.
                $break;
            }
            $position = $newPos;
        }
        // Strip the apostrophes and compare the bare form with the one
        // remembered in $form.
        $newForm = str_replace("'", "", $accented);
        if ($form !== false && $form != $newForm) {
            $position = false;
            // NOTE(review): same no-op `$break;` as above; control falls
            // through to the assignment below instead of leaving the loop.
            $break;
        }
        $form = $newForm;
    }
    // The stored lexem form is expected to carry no apostrophe yet.
    assert(!text_contains($lexem->form, "'"));
    if ($position !== false) {
        // Only two shapes are fixed: the lexem form is $form followed by "re"
        // with model F107 or F113, or $form followed by "t" with model A2.
        // (&& binds tighter than ||, so the two alternatives group as written.)
        if ($lexem->form == $form . "re" && $lexem->modelType == 'F' && ($lexem->modelNumber == 107 || $lexem->modelNumber == 113) || $lexem->form == $form . "t" && $lexem->modelType == 'A' && $lexem->modelNumber == 2) {
            // Insert the apostrophe at character offset $position.
            $lexem->form = mb_substr($lexem->form, 0, $position) . "'" . mb_substr($lexem->form, $position);
            //print "[{$lexem->form}] [$form] [$position]\n";
            $lexem->save();
            $lexem->regenerateParadigm();
            $fixed++;
        }
    }
    $seen++;
    // Progress report every 1000 lexems.
    if ($seen % 1000 == 0) {
        print "Seen: {$seen} lexems, fixed: {$fixed}\n";
    }
}
print "Seen: {$seen} lexems, fixed: {$fixed}\n";
Esempio n. 4
0
    // Per-lexem flags computed from the lexem's definitions:
    //   $matchingLexicon - some definition's lexicon equals the lexem's
    //                      unaccented form once every 'î' is replaced by 'â';
    //   $biology         - some word of some definition is in $biologyTerms;
    //   $appears         - the unaccented form occurs as a word in some
    //                      definition.
    $matchingLexicon = false;
    $biology = false;
    $appears = false;
    foreach ($defs as $def) {
        if (str_replace('î', 'â', $def->lexicon) == str_replace('î', 'â', $l->unaccented)) {
            $matchingLexicon = true;
        }
        // Lowercase the definition and drop formatting marks and punctuation
        // before cutting it into words.
        $rep = text_unicodeToLower($def->internalRep);
        $rep = str_replace(array('$', '@', '%', '.', ',', '(', ')', ';', ':'), '', $rep);
        // preg_split() replaces split(), which was removed in PHP 7.0; the
        // character class (space, newline, tab) is unchanged.
        $words = preg_split("/[ \n\t]/", $rep);
        foreach ($words as $word) {
            // Logical OR keeps the flags real booleans (the bitwise |= turned
            // them into integers).
            $biology = $biology || in_array($word, $biologyTerms);
            $appears = $appears || $l->unaccented == $word;
        }
    }
    // Act only on single-word lexems that no definition is filed under and
    // whose definitions mention a biology term.
    if (!$matchingLexicon && !text_contains($l->form, ' ') && $biology) {
        if ($appears) {
            // The form is used inside a definition: keep the lexem but move
            // it to model I2 with no restriction and no accent.
            print "Changing {$l->id} {$l->form} to I2\n";
            $l->modelType = 'I';
            $l->modelNumber = '2';
            $l->restriction = '';
            $l->noAccent = true;
            $l->save();
            $l->regenerateParadigm();
        } else {
            print "DELETING {$l->id} {$l->form}\n";
            $l->delete();
        }
        // Counts both the reassigned and the deleted lexems.
        $removed++;
    }
}
 if ($model->modelType != 'MF' && $model->modelType != 'VT' && $model->id != $tempModel->id && $model->id == 130) {
     print "Testing model " . $model->getName() . " (id = " . $model->id . ")\n";
     $lexems = Lexem::loadByModelId($model->id);
     $transfMap = array();
     foreach ($lexems as $lexem) {
         if (array_key_exists($lexem->id, $knownBadLexems)) {
             print "  Skipping known bad lexem " . $lexem->id . " (" . $lexem->unaccented . ")\n";
             continue;
         }
         if (!$lexem->isLoc) {
             continue;
         }
         $wls = WordList::loadByLexemId($lexem->id);
         $ignore = false;
         for ($i = 0; $i < count($wls) && !$ignore; $i++) {
             $ignore = ord($wls[$i]->form) == 0 || text_contains($wls[$i]->form, "'");
         }
         if ($ignore) {
             print "  Ignoring lexem " . $lexem->id . " (" . $lexem->unaccented . "): paradigm contains accents or null characters\n";
         }
         if (!$ignore) {
             if (!count($transfMap)) {
                 // Create model_descriptions by comparing the first lexem to its
                 // existing wordlists.
                 print "  Using lexem '" . $lexem->unaccented . "' as exponent\n";
                 foreach ($wls as $wl) {
                     $transforms = text_extractTransforms($lexem->unaccented, $wl->form);
                     $transfMap[$wl->inflectionId] = $transforms;
                 }
                 // Dump the transformation table
                 print "  Transforms:\n";
Esempio n. 6
0
/**
 * Saves a lexem unless it is one of the hard-coded exceptions, after applying
 * any hard-coded override for its form and model type.
 *
 * A lexem whose unaccented form, model type and model number match a row of
 * the skip table (and whose extra contains the row's marker, when the row has
 * one) is left alone: the function returns without calling save().
 * Otherwise every row of the override table matching the lexem's form and
 * model type rewrites its model type, model number and restriction -- and,
 * when the row supplies a replacement form, the form together with the
 * derived unaccented and reverse fields -- before save() is called.
 *
 * @param object $l Lexem-like object exposing form, unaccented, reverse,
 *                  extra, modelType, modelNumber, restriction and save().
 * @return void
 */
function saveLexemWithExceptions($l)
{
    // Row layout: unaccented form, model type, model number, text that must
    // appear in the extra ('' = no requirement).
    $skipTable = array(
        array('administratoare', 'MF', '66', ''),
        array('ăllalt', 'P', '23', ''),
        array('ălalalt', 'P', '23', ''),
        array('beși', 'V', '319', ''),
        array('câtea', 'F', '151', ''),
        array('celalalt', 'P', '31', ''),
        array('cellalt', 'P', '31', ''),
        array('greață', 'F', '38', ''),
        array('istalalt', 'P', '63', ''),
        array('istălalt', 'P', '63', ''),
        array('înși', 'MF', '4', ''),
        array('mielea', 'P', '74', ''),
        array('oară', 'MF', '28', ''),
        array('oare', 'MF', '66', ''),
        array('scrabble', 'N', '76', ''),
        array('voi', 'N', '67', 'pron.'),
    );
    foreach ($skipTable as $row) {
        list($rowForm, $rowType, $rowNumber, $rowExtra) = $row;
        if ($l->unaccented != $rowForm || $l->modelType != $rowType || $l->modelNumber != $rowNumber) {
            continue;
        }
        if (!$rowExtra || text_contains($l->extra, $rowExtra)) {
            // Known exception: leave the lexem unsaved.
            return;
        }
    }

    // Row layout: accented form, current model type, replacement form
    // ('' = keep the form), new model type, new model number, new restriction.
    $overrideTable = array(
        array("at'ât", 'I', "at'âta", 'P', '20', ''),
        array("fi", 'I', '', 'V', '339', ''),
        array("la", 'I', '', 'VT', '99', ''),
        array("mult", 'I', '', 'P', '76', ''),
        array("r'umpe", 'VT', '', 'VT', "657'", ''),
        array("cor'umpe", 'VT', '', 'VT', "657'", ''),
        array("întrer'umpe", 'VT', '', 'VT', "657'", ''),
        array("ir'umpe", 'V', '', 'V', "657'", ''),
        array("tot", 'I', '', 'P', '98', ''),
    );
    foreach ($overrideTable as $row) {
        list($matchForm, $matchType, $newForm, $newType, $newNumber, $newRestriction) = $row;
        if ($l->form != $matchForm || $l->modelType != $matchType) {
            continue;
        }
        if ($newForm) {
            // Replace the form and recompute the fields derived from it.
            $l->form = $newForm;
            $l->unaccented = str_replace("'", '', $l->form);
            $l->reverse = text_reverse($l->unaccented);
        }
        $l->modelType = $newType;
        $l->modelNumber = $newNumber;
        $l->restriction = $newRestriction;
    }

    $l->save();
}
Esempio n. 7
0
 // Load every row of dmlr_models for this model type and number, ordered by
 // inflection id. Both interpolated values now go through addslashes();
 // previously only the model number was escaped while the model type was
 // inserted into the SQL string raw.
 $query = "select form, infl_id, variant, is_baseform from dmlr_models " . "where model_type = '" . addslashes($model->modelType) . "' " . "and model_no = '" . addslashes($model->number) . "' order by infl_id";
 $dmlrDbResult = logged_query($query);
 $results = db_getArray($dmlrDbResult);
 // No base form seen yet.
 $baseForm = null;
 foreach ($results as $row) {
     $form = $row['form'];
     $variant = $row['variant'];
     $inflId = $row['infl_id'];
     $isBaseForm = $row['is_baseform'];
     if ($baseForm && $isBaseForm) {
         die("Incorrect baseform for {$model->modelType}{$model->number}\n");
     }
     if (!$baseForm) {
         $baseForm = $form;
     }
     if (text_contains($baseForm, "'") ^ text_contains($form, "'")) {
         print "Incomplete accents for {$baseForm} => {$form}\n";
     }
     //print "$baseForm=>$form\n";
     if (!text_validateAlphabet($form, "aăâbcdefghiîjklmnopqrsștțuvwxyz'")) {
         die("Illegal characters in form {$form}\n");
     }
     $transforms = text_extractTransforms($baseForm, $form, $model->modelType == 'P');
     assert(count($transforms) >= 2);
     // Split off the last transform: it indicates the accent shift
     $accentShift = array_pop($transforms);
     if ($accentShift != UNKNOWN_ACCENT_SHIFT && $accentShift != NO_ACCENT_SHIFT) {
         $accentedVowel = array_pop($transforms);
     } else {
         $accentedVowel = '';
     }