Example #1
0
/**
 * Normalize a lexicon name into its internal form.
 *
 * The name is processed in this order:
 *  1. it is passed through text_shorthandToUnicode();
 *  2. acute-accented vowels are rewritten as an apostrophe followed by the
 *     same vowel without the acute accent;
 *  3. surrounding whitespace and HTML tags are removed;
 *  4. it is passed through text_unicodeToLower();
 *  5. anything that looks like an HTML entity (an ampersand up to the next
 *     semicolon) is removed;
 *  6. spaces, digits and the ASCII punctuation listed below are removed.
 *     The apostrophe is not in that list, so the markers inserted in step 2
 *     survive.
 *
 * @param string $name Raw lexicon name.
 * @return string The normalized name.
 */
function internalizeLexicon($name)
{
    $name = text_shorthandToUnicode($name);

    // Both lists are positional: $accented[n] is replaced by $marked[n].
    $accented = array('á', 'Á', 'ắ', 'Ắ', 'ấ', 'Ấ', 'é', 'É', 'í', 'Í', 'î́', 'Î́', 'ó', 'Ó', 'ú', 'Ú', 'ý', 'Ý');
    $marked = array("'a", "'A", "'ă", "'Ă", "'â", "'Â", "'e", "'E", "'i", "'I", "'î", "'Î", "'o", "'O", "'u", "'U", "'y", "'Y");
    $name = str_replace($accented, $marked, $name);

    $name = trim($name);
    $name = strip_tags($name);
    $name = text_unicodeToLower($name);

    // Strip HTML escape codes
    $name = preg_replace("/&[^;]+;/", "", $name);

    // Strip all illegal characters. Each of them is a single ASCII byte, and
    // ASCII bytes never occur inside a multi-byte UTF-8 sequence, so one
    // byte-wise str_replace() removes them safely without walking the string
    // character by character.
    $illegal = str_split(' !@#$%^&*()-_+=\\|[]{},.<>/?;:"`~0123456789');
    return str_replace($illegal, '', $name);
}
Example #2
0
/**
 * Re-case the leading characters of $word so that each one follows the case
 * of the character at the same position in $like.
 *
 * Only the first min(length of $word, length of $like) positions are visited;
 * whatever follows them in $word is left as it was. A position is upper-cased
 * when text_isUppercase() accepts the character of $like at that position and
 * lower-cased otherwise.
 *
 * @param string $word Text whose letter case is adjusted.
 * @param string $like Text that supplies the case pattern.
 * @return string $word with the case pattern applied.
 **/
function matchCase($word, $like)
{
    $limit = min(mb_strlen($word), mb_strlen($like));
    $pos = 0;
    while ($pos < $limit) {
        $current = text_getCharAt($word, $pos);
        $recased = text_isUppercase(text_getCharAt($like, $pos))
            ? text_unicodeToUpper($current)
            : text_unicodeToLower($current);

        // Splice the re-cased character back in at the same position.
        $head = mb_substr($word, 0, $pos);
        $tail = mb_substr($word, $pos + 1);
        $word = $head . $recased . $tail;

        $pos++;
    }
    return $word;
}
Example #3
0
<?php

// Script: for every definition id listed in defIds.txt, load the definition,
// walk its internal representation and rebuild it in $newRep. Every
// |text|ref| group found is printed, and a character obtained from readChar()
// decides what is appended to $newRep for it.
require_once '../../phplib/util.php';
$defIds = file('defIds.txt');
foreach ($defIds as $i => $defId) {
    $defId = trim($defId);
    // Skip blank lines (any falsy id, including "0", is skipped as well).
    if (!$defId) {
        continue;
    }
    $def = Definition::load($defId);
    // Progress line "<position>/<total> <id>", followed by the current text.
    print $i + 1 . "/" . count($defIds) . " {$defId}\n";
    print "{$def->internalRep}\n";
    $newRep = '';
    $len = mb_strlen($def->internalRep);
    // NOTE: this loop reuses $i, overwriting the foreach key. That is harmless
    // for the progress line above because foreach reassigns $i on every
    // iteration.
    for ($i = 0; $i < $len; $i++) {
        $c = text_getCharAt($def->internalRep, $i);
        if ($c == '|') {
            // A group has the shape |text|ref|: $i is the opening bar, $mid
            // the middle one and $close the closing one.
            // NOTE(review): neither mb_strpos() result is checked for false,
            // so an unterminated group is not handled here.
            $mid = mb_strpos($def->internalRep, '|', $i + 1);
            $close = mb_strpos($def->internalRep, '|', $mid + 1);
            $text = mb_substr($def->internalRep, $i + 1, $mid - $i - 1);
            $ref = mb_substr($def->internalRep, $mid + 1, $close - $mid - 1);
            print "|{$text}|{$ref}|\n";
            // Continue scanning after the closing bar (the for loop adds 1).
            $i = $close;
            // readChar() is defined elsewhere; presumably it reads one
            // character typed by the operator — confirm.
            $c = readChar();
            if ($c == 'k') {
                // 'k': append the whole group unchanged.
                $newRep .= "|{$text}|{$ref}|";
            } else {
                // 'd': append only the text, dropping the bars and the ref.
                // Any other answer appends nothing for this group.
                if ($c == 'd') {
                    $newRep .= $text;
                }
            }
Example #4
0
/**
 * Convert HTML entities and a few typographic quirks in $s.
 *
 * First a fixed table of literal replacements is applied, in table order:
 * selected named/numeric entities become plain or backslash-escaped
 * characters, and a space is inserted after a comma that is directly followed
 * by c, C, s, S, t or T. The remaining text is then scanned with a small state
 * machine: every "&#...;" sequence is handed to processSequence() and replaced
 * by its return value, while everything else is copied through unchanged. That
 * includes an ampersand not followed by '#', and an "&#..." run that is cut
 * short by another ampersand or by the end of the string.
 *
 * @param string $s Text to convert.
 * @return string The converted text.
 */
function myConvert($s)
{
    $map = array(
        '&lt;' => '<',
        '&lt' => '<',
        '&gt;' => '>',
        '&#x113;' => 'ē',
        '&#37;' => '\\%',
        '&#x25;' => '\\%',
        '&#x0025;' => '\\%',
        '&#x7e;' => '\\~',
        '&#x27;' => "\\'",
        '&#39;' => "\\'",
        '&rsquo;' => "\\'",
        '&#xB4;' => "\\'",
        '&#x301;' => "\\'",
        '&#8220;' => '"',
        '&#8221;' => '"',
        '&#8222;' => '"',
        '&#x2a;' => '\\*',
        '&#x2A;' => '\\*',
        '&#x002A;' => '\\*',
        '&#x002a;' => '\\*',
        '&#9674;' => '*',
        '&#9830;' => '**',
        '&#064;' => '\\@',
        ',c' => ', c',
        ',C' => ', C',
        ',s' => ', s',
        ',S' => ', S',
        ',t' => ', t',
        ',T' => ', T',
        '&#8211;' => '-',
        '&#8212;' => '-'
    );
    $s = str_replace(array_keys($map), array_values($map), $s);

    // Scanner states: 0 = normal text, 1 = just saw '&', 2 = inside "&#...".
    $state = 0;
    // Characters of a possible entity collected so far (empty in state 0).
    $pending = '';
    $out = '';
    $total = mb_strlen($s);
    for ($pos = 0; $pos < $total; $pos++) {
        $ch = text_getCharAt($s, $pos);
        switch ($state) {
            case 0:
                if ($ch == '&') {
                    $pending = $ch;
                    $state = 1;
                } else {
                    $out .= $ch;
                }
                break;

            case 1:
                if ($ch == '#') {
                    $pending .= $ch;
                    $state = 2;
                } elseif ($ch == '&') {
                    // The previous '&' was plain text; this one may start an entity.
                    $out .= $pending;
                    $pending = $ch;
                } else {
                    // Not a numeric entity: emit the '&' and this character as-is.
                    $out .= $pending . $ch;
                    $state = 0;
                    $pending = '';
                }
                break;

            case 2:
                if ($ch == ';') {
                    // Complete "&#...;" sequence: let processSequence() translate it.
                    $out .= processSequence($pending . $ch);
                    $pending = '';
                    $state = 0;
                } elseif ($ch == '&') {
                    // Unterminated sequence: emit it verbatim and start over.
                    $out .= $pending;
                    $pending = $ch;
                    $state = 1;
                } else {
                    $pending .= $ch;
                }
                break;

            default:
                print "ERROR!\n";
                exit(1);
        }
    }

    // Whatever is still pending at the end of the input is emitted verbatim.
    return $out . $pending;
}
Example #5
0
         $ambiguousLexems = true;
     } else {
         $noLexems = true;
     }
 }
 $rep = $def->internalRep;
 $len = mb_strlen($rep);
 $newRep = '';
 $prevC = '';
 $curInflection = 0;
 //print "Examining {$def->internalRep}\n";
 for ($i = 0; $i < $len; $i++) {
     $c = text_getCharAt($rep, $i);
     if (!text_isUnicodeLetter($prevC) && $c == '-' && $i <= MAX_LEN) {
         $j = $i + 1;
         while (text_isUnicodeLetter(text_getCharAt($rep, $j))) {
             $j++;
         }
         $chunk = mb_substr($rep, $i, $j - $i);
         if ($chunk != '-') {
             $suffix = mb_substr($chunk, 1);
             //print "{$def->id} [{$def->lexicon}] $i [$chunk]\n";
             if ($lexem) {
                 $matchingForm = null;
                 foreach ($inflections as $inflId) {
                     $wls = WordList::loadByLexemIdInflectionId($lexem->id, $inflId);
                     foreach ($wls as $wl) {
                         if (matchesWithAccent($wl->form, $suffix)) {
                             $matchingForm = $wl->form;
                             //print "Matching [{$wl->form}] to [$chunk]\n";
                         }
Example #6
0
/**
 * Parse a word field into a list of lexem objects.
 *
 * The field is decomposed recursively; the first rule that applies wins:
 *  1. It is split at the first '/' that is not nested inside [...] or (...),
 *     and the results of both halves are concatenated.
 *  2. A trailing [...] group is cut off and attached, through appendExtra(),
 *     to the last lexem obtained from the remainder.
 *  3. A trailing (...) group is cut off and attached likewise. The group is
 *     also handed to parseModel(); when that yields both a model type and a
 *     model number, they (and the restriction) replace those of the last
 *     lexem. If the group contains one of a fixed set of part-of-speech
 *     markers, every lexem gets model type 'I', number '1' and an empty
 *     restriction.
 *  4. It is split at commas and each part is parsed on its own.
 *  5. Otherwise one lexem is created: hyphens are removed (the hyphenated
 *     original is kept as extra), text_insert() is called with an apostrophe
 *     at the position of the first lowercase character, and the word is passed
 *     through text_unicodeToLower() before Lexem::create() is called.
 *
 * @param string $word      The raw word field.
 * @param mixed  $modelType Model type passed on to Lexem::create().
 * @param mixed  $modelNo   Model number passed on to Lexem::create().
 * @param mixed  $restr     Restriction passed on to Lexem::create().
 * @return array The lexems, in the order they appear in the field.
 */
function parseWordField($word, $modelType, $modelNo, $restr)
{
    $word = trim($word);

    // Look for a slash not included in brackets
    $len = mb_strlen($word);
    $parCount = 0;
    $slashPos = -1;
    for ($i = 0; $i < $len; $i++) {
        $c = text_getCharAt($word, $i);
        if ($c == '[' || $c == '(') {
            $parCount++;
        } elseif ($c == ']' || $c == ')') {
            $parCount--;
        } elseif ($c == '/' && !$parCount) {
            $slashPos = $i;
            break;
        }
    }
    if ($slashPos >= 0) {
        $r1 = parseWordField(mb_substr($word, 0, $slashPos), $modelType, $modelNo, $restr);
        $r2 = parseWordField(mb_substr($word, $slashPos + 1), $modelType, $modelNo, $restr);
        return array_merge($r1, $r2);
    }

    if (text_endsWith($word, ']')) {
        $pos = mb_strrpos($word, '[');
        assert($pos !== false);
        $extra = mb_substr($word, $pos);
        $results = parseWordField(mb_substr($word, 0, $pos), $modelType, $modelNo, $restr);
        assert(count($results));
        appendExtra($results[count($results) - 1], $extra);
        return $results;
    }

    if (text_endsWith($word, ')')) {
        $pos = mb_strrpos($word, '(');
        assert($pos !== false);
        $extra = mb_substr($word, $pos);
        $results = parseWordField(mb_substr($word, 0, $pos), $modelType, $modelNo, $restr);
        assert(count($results));
        $lastIdx = count($results) - 1;

        // See if $extra contains a model number. If so, use it on the last model.
        list($modelType, $modelNo, $restr) = parseModel($extra);
        if ($modelType && $modelNo) {
            $results[$lastIdx]->modelType = $modelType;
            $results[$lastIdx]->modelNumber = $modelNo;
            $results[$lastIdx]->restriction = $restr;
        }
        appendExtra($results[$lastIdx], $extra);

        // If $extra dictates a part of speech, apply it to all the lexems
        $invariableMarkers = array('s.f.inv.', 's.f. în expr.', 's.m.inv.', 's.n.inv.', 'adj.inv.', 'adv.', 'conj.', 'prep.', 'interj.');
        $isInvariable = false;
        foreach ($invariableMarkers as $marker) {
            if (text_contains($extra, $marker)) {
                $isInvariable = true;
                break;
            }
        }
        if ($isInvariable) {
            foreach ($results as $l) {
                $l->modelType = 'I';
                $l->modelNumber = '1';
                $l->restriction = '';
            }
        }
        return $results;
    }

    // explode() replaces split(), which was removed in PHP 7.0; for a literal
    // ',' delimiter the two produce the same parts.
    $parts = explode(',', $word);
    if (count($parts) >= 2) {
        $results = array();
        foreach ($parts as $part) {
            $results = array_merge($results, parseWordField($part, $modelType, $modelNo, $restr));
        }
        return $results;
    }

    // Single word: remember the hyphenated spelling, then drop the hyphens.
    $extra = text_contains($word, '-') ? $word : '';
    $word = str_replace('-', '', $word);

    // Only the first lowercase character gets an apostrophe.
    $len = mb_strlen($word);
    for ($i = 0; $i < $len; $i++) {
        if (text_isLowercase(text_getCharAt($word, $i))) {
            $word = text_insert($word, "'", $i);
            break;
        }
    }

    $word = text_unicodeToLower($word);
    $l = Lexem::create($word, $modelType, $modelNo, $restr);
    appendExtra($l, $extra);
    $l->isLoc = true;
    return array($l);
}