示例#1
0
/**
 * Read an ISO-8859-1 HTML export and normalise it into one lowercase UTF-8 line.
 *
 * Steps, in order:
 *  - read at most 100000000 bytes from the file;
 *  - convert the bytes from ISO-8859-1 to UTF-8;
 *  - turn line breaks into spaces and decode a fixed list of HTML entities;
 *  - drop empty <span style=""> wrappers and <font class="fontN"> tags;
 *  - collapse runs of spaces and lowercase the text via text_unicodeToLower().
 *
 * @param string $fileName Path of the file to read.
 * @return string The normalised text.
 * @throws RuntimeException When the file cannot be opened or read.
 */
function readAndFormatFile($fileName)
{
    $fp = fopen($fileName, 'r');
    if ($fp === false) {
        // Without this guard the failure would cascade into fread()/fclose()
        // being called on `false`.
        throw new RuntimeException("Cannot open file: {$fileName}");
    }
    $data = fread($fp, 100000000);
    fclose($fp);
    if ($data === false) {
        throw new RuntimeException("Cannot read file: {$fileName}");
    }

    $data = mb_convert_encoding($data, 'UTF-8', 'ISO-8859-1');

    // Applied sequentially by str_replace(), so order matters: '&amp;' is
    // decoded before the other entities.
    $replacements = array(
        "\n" => ' ',
        "\r" => ' ',
        '&amp;' => '&',
        '&quot;' => '"',
        '&#7855;' => 'ă',
        '<span style="">&nbsp;</span>' => '',
        '&#258;' => 'Ă',
        '&#259;' => 'ă',
        '&#350;' => 'Ș',
        // &#351; is the lowercase letter; it used to be mapped to the
        // uppercase one, which only went unnoticed because of the final
        // lowercasing step.
        '&#351;' => 'ș',
        '&#354;' => 'Ț',
        '&#355;' => 'ț',
    );
    $data = str_replace(array_keys($replacements), array_values($replacements), $data);

    // Remove spans that contain nothing but (non-breaking) spaces.
    $data = preg_replace('/<span style="">(&nbsp;| )*<\\/span>/', '', $data);
    // Remove <font class="fontN"> wrappers, keeping their content.
    $data = preg_replace('/<font class="font\\d+">/', '', $data);
    $data = str_replace('</font>', '', $data);
    // Collapse multiple spaces
    $data = preg_replace('/ +/', ' ', $data);

    return text_unicodeToLower($data);
}
示例#2
0
/**
 * Reduce a lexicon entry to its internal form: lowercase letters only, with
 * accented vowels written as an apostrophe followed by the plain vowel.
 *
 * Steps, in order: expand shorthand via text_shorthandToUnicode(), rewrite
 * accented vowels to the explicit "'x" notation, trim, strip HTML tags,
 * lowercase via text_unicodeToLower(), drop HTML entities, and finally
 * remove every space, digit and ASCII punctuation character listed below.
 * The apostrophe is deliberately not in that list, so accent marks survive.
 *
 * @param string $name Raw lexicon text.
 * @return string The internalised form.
 */
function internalizeLexicon($name)
{
    $name = text_shorthandToUnicode($name);
    $name = str_replace(array('á', 'Á', 'ắ', 'Ắ', 'ấ', 'Ấ', 'é', 'É', 'í', 'Í', 'î́', 'Î́', 'ó', 'Ó', 'ú', 'Ú', 'ý', 'Ý'), array("'a", "'A", "'ă", "'Ă", "'â", "'Â", "'e", "'E", "'i", "'I", "'î", "'Î", "'o", "'O", "'u", "'U", "'y", "'Y"), $name);
    $name = trim($name);
    $name = strip_tags($name);
    $name = text_unicodeToLower($name);
    // Strip HTML escape codes
    $name = preg_replace("/&[^;]+;/", "", $name);
    // Strip all illegal characters. Every one of them is a single ASCII byte,
    // so removing them byte-wise cannot split a multi-byte UTF-8 character;
    // one str_replace() pass replaces the former character-by-character loop.
    $illegal = ' !@#$%^&*()-_+=\\|[]{},.<>/?;:"`~0123456789';
    return str_replace(str_split($illegal), '', $name);
}
示例#3
0
             // Collect the lexem so it can be linked to the definitions below.
             $lexems[] = $lexem;
         }
         // Now associate every lexem with every definition
         foreach ($defs as $defAssoc) {
             foreach ($lexems as $lexemAssoc) {
                 LexemDefinitionMap::associate($lexemAssoc->id, $defAssoc->id);
             }
         }
     }
     // For each definition, try to copy the hyphens of $normText into the
     // definition's headword, i.e. the text between the first two '@' markers
     // of its internal representation.
     foreach ($defs as $fixDef) {
         $fixFirstAt = strpos($fixDef->internalRep, '@');
         $fixSecondAt = strpos($fixDef->internalRep, '@', $fixFirstAt + 1);
         // The representation is expected to start with '@' and to contain a
         // second, closing '@'.
         assert($fixFirstAt === 0);
         assert($fixSecondAt !== false);
         $fixText = trim(substr($fixDef->internalRep, $fixFirstAt + 1, $fixSecondAt - $fixFirstAt - 1));
         // Only rewrite a headword that has no hyphen and no space yet, and
         // that equals $normText without its hyphens once accents are removed
         // and the text is lowercased.
         if (!text_contains($fixText, '-') && !text_contains($fixText, ' ') && str_replace('-', '', $normText) == text_unicodeToLower(text_removeAccents($fixText))) {
             // Insert a '-' into the headword at the offset of every '-' found
             // in $normText.
             // NOTE(review): $pos is a character offset (mb_strpos); presumably
             // text_insert() also works on character offsets - confirm.
             $prevPos = 0;
             while (($pos = mb_strpos($normText, '-', $prevPos)) !== false) {
                 $fixText = text_insert($fixText, '-', $pos);
                 $prevPos = $pos + 1;
             }
             // Splice the hyphenated headword back between the two '@'
             // markers, regenerate the HTML form, save and log the result.
             $fixDef->internalRep = substr($fixDef->internalRep, 0, $fixFirstAt + 1) . $fixText . substr($fixDef->internalRep, $fixSecondAt);
             $fixDef->htmlRep = text_htmlize($fixDef->internalRep);
             $fixDef->save();
             print "    [{$fixDef->internalRep}]\n";
         }
     }
     // Count the processed lexem, delete $l and record that hyphenation
     // information was found.
     $split++;
     $l->delete();
     $foundHyphenation = true;
 }
示例#4
0
     // $extra counts as found when it equals the concatenation of model type,
     // model number and restriction of any homonym of $l.
     $homonyms = $l->loadHomonyms();
     foreach ($homonyms as $h) {
         // `|=` is a bitwise OR, so $found becomes an integer (0 or 1) here.
         $found |= $extra == $h->modelType . $h->modelNumber . $h->restriction;
     }
 }
 // Next, look for $extra verbatim inside any definition of the lexem.
 if (!$found) {
     $defs = Definition::loadByLexemId($l->id);
     foreach ($defs as $def) {
         $found |= text_contains($def->internalRep, $extra);
     }
 }
 // Sometimes the extra contains more hyphenation information than
 // the definitions, but some of that information is obvious and can
 // be deleted.
 if (!$found) {
     $lower = text_unicodeToLower($extra);
     foreach ($defs as $def) {
         // Build a pattern for a hyphenated word: a run of letters and dashes
         // containing at least one dash with something on both sides,
         // delimited on each side by a character that is neither a letter
         // nor a dash.
         // NOTE(review): the pattern is used without the `u` modifier, so the
         // multi-byte letters in $letterSet are matched byte by byte - confirm
         // this is intended.
         $letterSet = 'A-Za-zăâîșțĂÂÎȘȚ';
         $letter = "[{$letterSet}]";
         $letterOrDash = "[-{$letterSet}]";
         $other = "[^-{$letterSet}]";
         $regexp = "{$other}({$letterOrDash}+-{$letterOrDash}+){$other}";
         $matches = array();
         // $result (the match count) is not read in the visible code.
         $result = preg_match_all("/{$regexp}/", $def->internalRep, $matches);
         // $extra counts as found when its lowercase form contains any
         // hyphenated word captured from the definition.
         foreach ($matches[1] as $match) {
             $found |= text_contains($lower, $match);
         }
     }
 }
 // Sometimes the extra indicates the part of speech
 if (!$found) {
示例#5
0
/**
 * Copy the letter-case pattern of $like onto $word, position by position.
 *
 * Only the first min(length of $word, length of $like) characters are
 * touched: a character becomes uppercase when the character at the same
 * position in $like is uppercase, and lowercase otherwise. Whatever follows
 * in $word is returned unchanged.
 **/
function matchCase($word, $like)
{
    $limit = min(mb_strlen($word), mb_strlen($like));
    $pos = 0;
    while ($pos < $limit) {
        $current = text_getCharAt($word, $pos);
        $model = text_getCharAt($like, $pos);
        $recased = text_isUppercase($model)
            ? text_unicodeToUpper($current)
            : text_unicodeToLower($current);
        // Splice the re-cased character back into place.
        $head = mb_substr($word, 0, $pos);
        $tail = mb_substr($word, $pos + 1);
        $word = $head . $recased . $tail;
        $pos++;
    }
    return $word;
}
示例#6
0
/**
 * Produce a simplified form of $s for comparisons.
 *
 * The text is trimmed, lowercased via text_unicodeToLower(), stripped of the
 * characters @ $ ^ ( ) % . and of all ASCII digits, and finally relieved of a
 * single trailing hyphen, if there is one.
 */
function simplifyText($s)
{
    $lowered = text_unicodeToLower(trim($s));
    $simplified = preg_replace("/[@\$^0-9()%.]/", "", $lowered);
    if (!text_endsWith($simplified, '-')) {
        return $simplified;
    }
    // Drop the final '-' (a single byte, so byte-wise substr() is safe).
    return substr($simplified, 0, strlen($simplified) - 1);
}
// Walk every lexem whose model type is 'T', ordered by lexem_neaccentuat.
// NOTE(review): mysql_query()/mysql_fetch_assoc() and split() (used below)
// were removed in PHP 7.0; this script only runs on PHP 5.
$dbResult = mysql_query("select * from lexems where lexem_model_type = 'T' " . "order by lexem_neaccentuat");
// Counters; only $seen is incremented in the visible part of the script.
$seen = 0;
$removed = 0;
// Words that flag a definition as biology-related when they occur in its text.
$biologyTerms = array('plantă', 'pom', 'arbore', 'arbust', 'bot', 'zool', 'mamifer', 'animal');
while (($dbRow = mysql_fetch_assoc($dbResult)) != null) {
    $l = Lexem::createFromDbRow($dbRow);
    $seen++;
    $defs = Definition::loadByLexemId($l->id);
    // Flags accumulated over all definitions of this lexem.
    $matchingLexicon = false;
    $biology = false;
    $appears = false;
    foreach ($defs as $def) {
        // The definition's lexicon matches the lexem's unaccented form when
        // the two are equal after replacing every 'î' with 'â'.
        if (str_replace('î', 'â', $def->lexicon) == str_replace('î', 'â', $l->unaccented)) {
            $matchingLexicon = true;
        }
        // Lowercase the definition, remove formatting and punctuation
        // characters, then split it into words on space, newline or tab.
        $rep = text_unicodeToLower($def->internalRep);
        $rep = str_replace(array('$', '@', '%', '.', ',', '(', ')', ';', ':'), array('', '', '', '', '', '', '', '', ''), $rep);
        $words = split("[ \n\t]", $rep);
        foreach ($words as $word) {
            // `|=` is a bitwise OR, so both flags become integers (0 or 1).
            $biology |= in_array($word, $biologyTerms);
            $appears |= $l->unaccented == $word;
        }
    }
    // Candidates: single-word forms whose definitions contain a biology term
    // and none of whose definitions has a matching lexicon.
    if (!$matchingLexicon && !text_contains($l->form, ' ') && $biology) {
        if ($appears) {
            // The unaccented form occurs as a word in a definition: switch
            // the lexem to model type 'I', number '2', with no restriction.
            print "Changing {$l->id} {$l->form} to I2\n";
            $l->modelType = 'I';
            $l->modelNumber = '2';
            $l->restriction = '';
            $l->noAccent = true;
            $l->save();
示例#8
0
/**
 * Tell whether $form ends with $suffix, reconciling accent notation.
 *
 * $suffix is lowercased first. An accent is "explicit" when written as an
 * apostrophe; $GLOBALS['text_accented'] and $GLOBALS['text_explicitAccent']
 * are the parallel lists used to convert between the two notations.
 *
 *  - Both carry an accent: $form is converted to the accented-letter notation
 *    and compared against the lowercased $suffix.
 *  - Only one carries an accent: the apostrophes are removed from that one
 *    before comparing.
 *  - Neither carries an accent: a plain ends-with test.
 */
function matchesWithAccent($form, $suffix)
{
    $suffix = text_unicodeToLower($suffix);
    $explicitSuffix = str_replace($GLOBALS['text_accented'], $GLOBALS['text_explicitAccent'], $suffix);
    $formMarked = strpos($form, "'") !== false;
    $suffixMarked = strpos($explicitSuffix, "'") !== false;

    if (!$formMarked) {
        // Compare against the suffix with its accent mark (if any) removed.
        $tail = $suffixMarked ? str_replace("'", "", $explicitSuffix) : $suffix;
        return text_endsWith($form, $tail);
    }

    if ($suffixMarked) {
        // Both are accented: compare in the accented-letter notation.
        $implicitForm = str_replace($GLOBALS['text_explicitAccent'], $GLOBALS['text_accented'], $form);
        return text_endsWith($implicitForm, $suffix);
    }

    // Only the form is accented: ignore its accent mark.
    return text_endsWith(str_replace("'", "", $form), $suffix);
}
示例#9
0
/**
 * Parse a word field into a list of lexems.
 *
 * The field is decomposed recursively, in this order of precedence:
 *  1. A '/' outside any brackets or parentheses separates alternatives; both
 *     sides are parsed and the results concatenated.
 *  2. A trailing "[...]" group is passed to appendExtra() for the last lexem
 *     obtained from the text before it.
 *  3. A trailing "(...)" group is handled likewise; in addition, a model
 *     found in it by parseModel() overrides the model of the last lexem, and
 *     certain part-of-speech markers force model I1 on all lexems.
 *  4. Comma-separated parts are parsed one by one.
 *  5. A plain word: if it contains hyphens, the hyphenated spelling is kept
 *     as the extra and the hyphens are removed; an apostrophe is inserted at
 *     the position of the first lowercase character; the word is lowercased
 *     and turned into a lexem flagged isLoc.
 *
 * @param string $word      Raw word field.
 * @param string $modelType Model type given to created lexems.
 * @param string $modelNo   Model number given to created lexems.
 * @param string $restr     Restriction given to created lexems.
 * @return array List of lexems produced by Lexem::create().
 */
function parseWordField($word, $modelType, $modelNo, $restr)
{
    $word = trim($word);
    // Look for a slash not included in brackets
    $len = mb_strlen($word);
    $parCount = 0;
    $i = 0;
    $found = false;
    while ($i < $len && !$found) {
        $c = text_getCharAt($word, $i);
        if ($c == '[' || $c == '(') {
            $parCount++;
        } elseif ($c == ']' || $c == ')') {
            $parCount--;
        }
        if ($c == '/' && !$parCount) {
            $found = true;
        } else {
            $i++;
        }
    }
    if ($found) {
        // $i is the position of the top-level slash.
        $r1 = parseWordField(mb_substr($word, 0, $i), $modelType, $modelNo, $restr);
        $r2 = parseWordField(mb_substr($word, $i + 1), $modelType, $modelNo, $restr);
        return array_merge($r1, $r2);
    }
    if (text_endsWith($word, ']')) {
        $pos = mb_strrpos($word, '[');
        assert($pos !== false);
        // $extra keeps the surrounding brackets.
        $extra = mb_substr($word, $pos);
        $results = parseWordField(mb_substr($word, 0, $pos), $modelType, $modelNo, $restr);
        assert(count($results));
        appendExtra($results[count($results) - 1], $extra);
        return $results;
    }
    if (text_endsWith($word, ')')) {
        $pos = mb_strrpos($word, '(');
        assert($pos !== false);
        // $extra keeps the surrounding parentheses.
        $extra = mb_substr($word, $pos);
        $results = parseWordField(mb_substr($word, 0, $pos), $modelType, $modelNo, $restr);
        assert(count($results));
        // See if $extra contains a model number. If so, use it on the last model.
        list($modelType, $modelNo, $restr) = parseModel($extra);
        if ($modelType && $modelNo) {
            $results[count($results) - 1]->modelType = $modelType;
            $results[count($results) - 1]->modelNumber = $modelNo;
            $results[count($results) - 1]->restriction = $restr;
        }
        appendExtra($results[count($results) - 1], $extra);
        // If $extra dictates a part of speech, apply it to all the lexems
        $invariableMarkers = array('s.f.inv.', 's.f. în expr.', 's.m.inv.', 's.n.inv.', 'adj.inv.', 'adv.', 'conj.', 'prep.', 'interj.');
        $isInvariable = false;
        foreach ($invariableMarkers as $marker) {
            if (text_contains($extra, $marker)) {
                $isInvariable = true;
                break;
            }
        }
        if ($isInvariable) {
            foreach ($results as $l) {
                $l->modelType = 'I';
                $l->modelNumber = '1';
                $l->restriction = '';
            }
        }
        return $results;
    }
    // explode() replaces split(), which was deprecated in PHP 5.3 and removed
    // in PHP 7.0; for a literal ',' delimiter the two are equivalent.
    $parts = explode(',', $word);
    if (count($parts) >= 2) {
        $results = array();
        foreach ($parts as $part) {
            $results = array_merge($results, parseWordField($part, $modelType, $modelNo, $restr));
        }
        return $results;
    }
    // Plain word: remember the hyphenated spelling, then work without hyphens.
    $extra = text_contains($word, '-') ? $word : '';
    $word = str_replace('-', '', $word);
    $len = mb_strlen($word);
    $found = false;
    // Insert an apostrophe at the position of the first lowercase character.
    for ($i = 0; $i < $len && !$found; $i++) {
        $c = text_getCharAt($word, $i);
        if (text_isLowercase($c)) {
            $found = true;
            $word = text_insert($word, "'", $i);
        }
    }
    $word = text_unicodeToLower($word);
    $l = Lexem::create($word, $modelType, $modelNo, $restr);
    appendExtra($l, $extra);
    $l->isLoc = true;
    return array($l);
}