Пример #1
0
                foreach ($defs as $defAssoc) {
                    foreach ($lexems as $lexemAssoc) {
                        LexemDefinitionMap::associate($lexemAssoc->id, $defAssoc->id);
                    }
                }
            }
            foreach ($defs as $fixDef) {
                $fixFirstAt = strpos($fixDef->internalRep, '@');
                $fixSecondAt = strpos($fixDef->internalRep, '@', $fixFirstAt + 1);
                assert($fixFirstAt === 0);
                assert($fixSecondAt !== false);
                $fixText = trim(substr($fixDef->internalRep, $fixFirstAt + 1, $fixSecondAt - $fixFirstAt - 1));
                if (!text_contains($fixText, '-') && !text_contains($fixText, ' ') && str_replace('-', '', $normText) == text_unicodeToLower(text_removeAccents($fixText))) {
                    $prevPos = 0;
                    while (($pos = mb_strpos($normText, '-', $prevPos)) !== false) {
                        $fixText = text_insert($fixText, '-', $pos);
                        $prevPos = $pos + 1;
                    }
                    $fixDef->internalRep = substr($fixDef->internalRep, 0, $fixFirstAt + 1) . $fixText . substr($fixDef->internalRep, $fixSecondAt);
                    $fixDef->htmlRep = text_htmlize($fixDef->internalRep);
                    $fixDef->save();
                    print "    [{$fixDef->internalRep}]\n";
                }
            }
            $split++;
            $l->delete();
            $foundHyphenation = true;
        }
    }
    //print "NOT OK: {$l->unaccented}\n";
}
Пример #2
0
/**
 * Recursively parses a word field into a list of lexems.
 *
 * The field is broken down in this order, each step recursing on the pieces:
 *   1. Split at the first '/' that is not nested inside [...] or (...).
 *   2. If the field ends with ']', the text from the last '[' onwards is
 *      peeled off and attached (via appendExtra) to the last resulting lexem.
 *   3. If the field ends with ')', the text from the last '(' onwards is
 *      peeled off, run through parseModel(), and attached to the last lexem;
 *      certain part-of-speech markers in it force model I1 on all results.
 *   4. Split on commas.
 *   5. Otherwise the field is a single word: hyphens are stripped, an
 *      apostrophe is inserted at the position of the first lowercase
 *      character, the word is lowercased and a Lexem is created from it.
 *
 * @param string $word      Raw word field.
 * @param mixed  $modelType Model type passed on to Lexem::create().
 * @param mixed  $modelNo   Model number passed on to Lexem::create().
 * @param mixed  $restr     Restriction passed on to Lexem::create().
 *
 * @return array List of lexem objects as returned by Lexem::create().
 */
function parseWordField($word, $modelType, $modelNo, $restr)
{
    $word = trim($word);
    // Look for a slash not included in brackets
    $len = mb_strlen($word);
    $parCount = 0;
    $i = 0;
    $found = false;
    while ($i < $len && !$found) {
        $c = text_getCharAt($word, $i);
        if ($c == '[' || $c == '(') {
            $parCount++;
        } elseif ($c == ']' || $c == ')') {
            $parCount--;
        }
        if ($c == '/' && !$parCount) {
            $found = true;
        } else {
            $i++;
        }
    }
    if ($found) {
        // $i is the position of the top-level slash; parse both sides independently.
        $r1 = parseWordField(mb_substr($word, 0, $i), $modelType, $modelNo, $restr);
        $r2 = parseWordField(mb_substr($word, $i + 1), $modelType, $modelNo, $restr);
        return array_merge($r1, $r2);
    }
    if (text_endsWith($word, ']')) {
        // Trailing [...] group: keep it (brackets included) as extra text for the last lexem.
        $pos = mb_strrpos($word, '[');
        assert($pos !== false);
        $extra = mb_substr($word, $pos);
        $results = parseWordField(mb_substr($word, 0, $pos), $modelType, $modelNo, $restr);
        assert(count($results));
        appendExtra($results[count($results) - 1], $extra);
        return $results;
    }
    if (text_endsWith($word, ')')) {
        // Trailing (...) group: may carry a model and/or a part-of-speech marker.
        $pos = mb_strrpos($word, '(');
        assert($pos !== false);
        $extra = mb_substr($word, $pos);
        $results = parseWordField(mb_substr($word, 0, $pos), $modelType, $modelNo, $restr);
        assert(count($results));
        // See if $extra contains a model number. If so, use it on the last model.
        list($modelType, $modelNo, $restr) = parseModel($extra);
        if ($modelType && $modelNo) {
            $results[count($results) - 1]->modelType = $modelType;
            $results[count($results) - 1]->modelNumber = $modelNo;
            $results[count($results) - 1]->restriction = $restr;
        }
        appendExtra($results[count($results) - 1], $extra);
        // If $extra dictates a part of speech, apply it to all the lexems
        if (text_contains($extra, 's.f.inv.') || text_contains($extra, 's.f. în expr.') || text_contains($extra, 's.m.inv.') || text_contains($extra, 's.n.inv.') || text_contains($extra, 'adj.inv.') || text_contains($extra, 'adv.') || text_contains($extra, 'conj.') || text_contains($extra, 'prep.') || text_contains($extra, 'interj.')) {
            foreach ($results as $l) {
                $l->modelType = 'I';
                $l->modelNumber = '1';
                $l->restriction = '';
            }
        }
        return $results;
    }
    // explode() replaces split(), which was removed in PHP 7.0; for a literal
    // ',' delimiter both return the same array.
    $parts = explode(',', $word);
    if (count($parts) >= 2) {
        $results = array();
        foreach ($parts as $part) {
            $results = array_merge($results, parseWordField($part, $modelType, $modelNo, $restr));
        }
        return $results;
    }
    // Single word. Remember the hyphenated form as extra text, then drop the hyphens.
    $extra = text_contains($word, '-') ? $word : '';
    $word = str_replace('-', '', $word);
    $len = mb_strlen($word);
    $found = false;
    // Insert an apostrophe at the position of the first lowercase character.
    // NOTE(review): presumably the input is upper case with the stressed letter
    // in lower case, and the apostrophe is the accent mark - confirm.
    for ($i = 0; $i < $len && !$found; $i++) {
        $c = text_getCharAt($word, $i);
        if (text_isLowercase($c)) {
            $found = true;
            $word = text_insert($word, "'", $i);
        }
    }
    $word = text_unicodeToLower($word);
    $l = Lexem::create($word, $modelType, $modelNo, $restr);
    appendExtra($l, $extra);
    $l->isLoc = true;
    return array($l);
}