/**
 * Build the internal form of a lexicon name.
 *
 * The name is passed through text_shorthandToUnicode(), acute-accented vowels
 * are rewritten as an apostrophe followed by the unaccented letter, the result
 * is trimmed, stripped of tags, lowercased via text_unicodeToLower() and
 * cleared of HTML escape codes. Finally every space, digit and punctuation
 * character listed in $illegalChars is dropped.
 *
 * @param string $name Raw lexicon name.
 * @return string The name with only the permitted characters left.
 */
function internalizeLexicon($name) {
  // Characters that are never allowed in the internal form.
  $illegalChars = ' !@#$%^&*()-_+=\\|[]{},.<>/?;:"`~0123456789';

  // Accented letters and their "apostrophe + letter" replacements, pairwise.
  $accented = array('á', 'Á', 'ắ', 'Ắ', 'ấ', 'Ấ', 'é', 'É', 'í', 'Í', 'î́', 'Î́', 'ó', 'Ó', 'ú', 'Ú', 'ý', 'Ý');
  $marked = array("'a", "'A", "'ă", "'Ă", "'â", "'Â", "'e", "'E", "'i", "'I", "'î", "'Î", "'o", "'O", "'u", "'U", "'y", "'Y");

  $name = str_replace($accented, $marked, text_shorthandToUnicode($name));
  $name = strip_tags(trim($name));
  $name = text_unicodeToLower($name);

  // Drop anything that looks like an HTML escape code (&...;).
  $name = preg_replace("/&[^;]+;/", "", $name);

  // Keep only the characters that are not in the illegal set.
  $kept = '';
  $total = mb_strlen($name);
  for ($pos = 0; $pos < $total; $pos++) {
    $ch = text_getCharAt($name, $pos);
    if (strstr($illegalChars, $ch) !== FALSE) {
      continue;
    }
    $kept .= $ch;
  }

  return $kept;
}
/**
 * Re-case $word, position by position, so that it follows $like.
 *
 * For each index shared by both strings, the character of $word at that index
 * is replaced by its uppercase form when text_isUppercase() reports the
 * character of $like at the same index as uppercase, and by its lowercase
 * form otherwise. Characters of $word beyond the length of $like are left
 * untouched.
 *
 * @param string $word The word whose case is adjusted.
 * @param string $like The word supplying the case pattern.
 * @return string $word with the adjusted case.
 **/
function matchCase($word, $like) {
  $limit = min(mb_strlen($word), mb_strlen($like));

  $pos = 0;
  while ($pos < $limit) {
    // Read from the (possibly already modified) word, as each step rewrites it in place.
    $current = text_getCharAt($word, $pos);
    $pattern = text_getCharAt($like, $pos);

    $recased = text_isUppercase($pattern)
      ? text_unicodeToUpper($current)
      : text_unicodeToLower($current);

    $word = mb_substr($word, 0, $pos) . $recased . mb_substr($word, $pos + 1);
    $pos++;
  }

  return $word;
}
<?php require_once '../../phplib/util.php'; $defIds = file('defIds.txt'); foreach ($defIds as $i => $defId) { $defId = trim($defId); if (!$defId) { continue; } $def = Definition::load($defId); print $i + 1 . "/" . count($defIds) . " {$defId}\n"; print "{$def->internalRep}\n"; $newRep = ''; $len = mb_strlen($def->internalRep); for ($i = 0; $i < $len; $i++) { $c = text_getCharAt($def->internalRep, $i); if ($c == '|') { $mid = mb_strpos($def->internalRep, '|', $i + 1); $close = mb_strpos($def->internalRep, '|', $mid + 1); $text = mb_substr($def->internalRep, $i + 1, $mid - $i - 1); $ref = mb_substr($def->internalRep, $mid + 1, $close - $mid - 1); print "|{$text}|{$ref}|\n"; $i = $close; $c = readChar(); if ($c == 'k') { $newRep .= "|{$text}|{$ref}|"; } else { if ($c == 'd') { $newRep .= $text; } }
/**
 * Convert a string by applying a fixed replacement table and then resolving
 * "&#...;" sequences.
 *
 * First, every key of $map is replaced by its value using str_replace().
 * Then the string is scanned one character at a time with a three-state
 * machine: text starting with "&#" and running up to the next ";" is
 * collected and handed to processSequence(); everything else is copied to the
 * output unchanged. A sequence that is still open when the input ends is
 * appended verbatim.
 *
 * @param string $s The text to convert.
 * @return string The converted text.
 */
function myConvert($s) {
  // NOTE(review): several keys below are repeated, and two keys appear as a
  // bare quote between quotes, which does not parse. Presumably the keys were
  // originally HTML entities that were decoded at some point -- verify against
  // the original file before relying on this table.
  $map = array('<' => '<', '<' => '<', '>' => '>', 'ē' => 'ē', '%' => '\\%', '%' => '\\%', '%' => '\\%', '~' => '\\~', ''' => "\\'", ''' => "\\'", '’' => "\\'", '´' => "\\'", '́' => "\\'", '“' => '"', '”' => '"', '„' => '"', '*' => '\\*', '*' => '\\*', '*' => '\\*', '*' => '\\*', '◊' => '*', '♦' => '**', '@' => '\\@', ',c' => ', c', ',C' => ', C', ',s' => ', s', ',S' => ', S', ',t' => ', t', ',T' => ', T', '–' => '-', '—' => '-');
  // str_replace applies the pairs in array order, so later pairs see the
  // output of earlier ones.
  $s = str_replace(array_keys($map), array_values($map), $s);
  $len = mb_strlen($s);
  $state = 0; // 0 = normal, 1 = after &, 2 = after &# (collecting up to ;)
  $chunk = ''; // Pending text of a possible "&#...;" sequence, not yet emitted
  $result = '';
  for ($i = 0; $i < $len; $i++) {
    $char = text_getCharAt($s, $i);
    if ($state == 0) {
      if ($char == '&') {
        // Possible start of a sequence: hold the ampersand back.
        $chunk = $char;
        $state = 1;
      } else {
        $result .= $char;
        $chunk = '';
      }
    } else {
      if ($state == 1) {
        if ($char == '#') {
          $chunk .= $char;
          $state = 2;
        } else {
          if ($char == '&') {
            // Emit the previous ampersand and start over with this one.
            $result .= $chunk;
            $chunk = $char;
          } else {
            // Not a "&#" sequence: emit the held ampersand and this character.
            $result .= $chunk;
            $result .= $char;
            $state = 0;
            $chunk = '';
          }
        }
      } else {
        if ($state == 2) {
          if ($char == ';') {
            // Sequence complete: let processSequence() produce its replacement.
            $chunk .= $char;
            $result .= processSequence($chunk);
            $chunk = '';
            $state = 0;
          } else {
            if ($char == '&') {
              // Unterminated sequence: emit it as-is and restart at this ampersand.
              $result .= $chunk;
              $chunk = $char;
              $state = 1;
            } else {
              $chunk .= $char;
            }
          }
        } else {
          // Unknown state: report it and abort the script.
          print "ERROR!\n";
          exit(1);
        }
      }
    }
  }
  // Flush whatever is still pending at end of input.
  $result .= $chunk;
  return $result;
}
$ambiguousLexems = true; } else { $noLexems = true; } } $rep = $def->internalRep; $len = mb_strlen($rep); $newRep = ''; $prevC = ''; $curInflection = 0; //print "Examining {$def->internalRep}\n"; for ($i = 0; $i < $len; $i++) { $c = text_getCharAt($rep, $i); if (!text_isUnicodeLetter($prevC) && $c == '-' && $i <= MAX_LEN) { $j = $i + 1; while (text_isUnicodeLetter(text_getCharAt($rep, $j))) { $j++; } $chunk = mb_substr($rep, $i, $j - $i); if ($chunk != '-') { $suffix = mb_substr($chunk, 1); //print "{$def->id} [{$def->lexicon}] $i [$chunk]\n"; if ($lexem) { $matchingForm = null; foreach ($inflections as $inflId) { $wls = WordList::loadByLexemIdInflectionId($lexem->id, $inflId); foreach ($wls as $wl) { if (matchesWithAccent($wl->form, $suffix)) { $matchingForm = $wl->form; //print "Matching [{$wl->form}] to [$chunk]\n"; }
/**
 * Parse a raw word field into a list of lexems.
 *
 * The field is handled recursively, trying these cases in order:
 *  1. A slash outside any () or [] splits the field in two; both halves are
 *     parsed and their results merged.
 *  2. A trailing [...] group is cut off, the remainder is parsed, and the
 *     group is attached to the last resulting lexem via appendExtra().
 *  3. A trailing (...) group is treated the same way; in addition, if
 *     parseModel() finds a model in it, that model is set on the last lexem,
 *     and if it contains one of the listed part-of-speech markers, every
 *     resulting lexem is set to model type 'I', number '1', no restriction.
 *  4. A comma-separated list is parsed part by part.
 *  5. Otherwise the field is a single word: hyphens are removed (the
 *     hyphenated original is kept as extra), an apostrophe is inserted before
 *     the first lowercase letter, the word is lowercased, and a lexem is
 *     created with Lexem::create() and flagged isLoc.
 *
 * @param string $word      The raw word field.
 * @param string $modelType Model type passed to Lexem::create().
 * @param string $modelNo   Model number passed to Lexem::create().
 * @param string $restr     Restriction passed to Lexem::create().
 * @return array The lexems produced from the field.
 */
function parseWordField($word, $modelType, $modelNo, $restr) {
  $word = trim($word);

  // Look for a slash not included in brackets
  $len = mb_strlen($word);
  $parCount = 0;
  $i = 0;
  $found = false;
  while ($i < $len && !$found) {
    $c = text_getCharAt($word, $i);
    if ($c == '[' || $c == '(') {
      $parCount++;
    } else if ($c == ']' || $c == ')') {
      $parCount--;
    }
    if ($c == '/' && !$parCount) {
      $found = true;
    } else {
      $i++;
    }
  }

  if ($found) {
    // $i is the position of the top-level slash.
    $r1 = parseWordField(mb_substr($word, 0, $i), $modelType, $modelNo, $restr);
    $r2 = parseWordField(mb_substr($word, $i + 1), $modelType, $modelNo, $restr);
    return array_merge($r1, $r2);
  }

  if (text_endsWith($word, ']')) {
    $pos = mb_strrpos($word, '[');
    assert($pos !== false);
    $extra = mb_substr($word, $pos);
    $results = parseWordField(mb_substr($word, 0, $pos), $modelType, $modelNo, $restr);
    assert(count($results));
    appendExtra($results[count($results) - 1], $extra);
    return $results;
  }

  if (text_endsWith($word, ')')) {
    $pos = mb_strrpos($word, '(');
    assert($pos !== false);
    $extra = mb_substr($word, $pos);
    $results = parseWordField(mb_substr($word, 0, $pos), $modelType, $modelNo, $restr);
    assert(count($results));
    $last = count($results) - 1;

    // See if $extra contains a model number. If so, use it on the last model.
    list($modelType, $modelNo, $restr) = parseModel($extra);
    if ($modelType && $modelNo) {
      $results[$last]->modelType = $modelType;
      $results[$last]->modelNumber = $modelNo;
      $results[$last]->restriction = $restr;
    }
    appendExtra($results[$last], $extra);

    // If $extra dictates a part of speech, apply it to all the lexems.
    // NOTE(review): the second marker below contains a line break exactly as
    // it appears in the file -- confirm the intended whitespace character.
    if (text_contains($extra, 's.f.inv.') ||
        text_contains($extra, 's.f. 
în expr.') ||
        text_contains($extra, 's.m.inv.') ||
        text_contains($extra, 's.n.inv.') ||
        text_contains($extra, 'adj.inv.') ||
        text_contains($extra, 'adv.') ||
        text_contains($extra, 'conj.') ||
        text_contains($extra, 'prep.') ||
        text_contains($extra, 'interj.')) {
      foreach ($results as $l) {
        $l->modelType = 'I';
        $l->modelNumber = '1';
        $l->restriction = '';
      }
    }
    return $results;
  }

  // explode() replaces split(), which was removed in PHP 7.0; for a literal
  // comma delimiter the two produce the same pieces.
  $parts = explode(',', $word);
  if (count($parts) >= 2) {
    $results = array();
    foreach ($parts as $part) {
      $results = array_merge($results, parseWordField($part, $modelType, $modelNo, $restr));
    }
    return $results;
  }

  // Single word: remember the hyphenated spelling, then work on the plain one.
  $extra = text_contains($word, '-') ? $word : '';
  $word = str_replace('-', '', $word);

  // Insert an apostrophe before the first lowercase letter, if there is one.
  $len = mb_strlen($word);
  $found = false;
  for ($i = 0; $i < $len && !$found; $i++) {
    $c = text_getCharAt($word, $i);
    if (text_isLowercase($c)) {
      $found = true;
      $word = text_insert($word, "'", $i);
    }
  }

  $word = text_unicodeToLower($word);
  $l = Lexem::create($word, $modelType, $modelNo, $restr);
  appendExtra($l, $extra);
  $l->isLoc = true;
  return array($l);
}