foreach ($defs as $defAssoc) { foreach ($lexems as $lexemAssoc) { LexemDefinitionMap::associate($lexemAssoc->id, $defAssoc->id); } } } foreach ($defs as $fixDef) { $fixFirstAt = strpos($fixDef->internalRep, '@'); $fixSecondAt = strpos($fixDef->internalRep, '@', $fixFirstAt + 1); assert($fixFirstAt === 0); assert($fixSecondAt !== false); $fixText = trim(substr($fixDef->internalRep, $fixFirstAt + 1, $fixSecondAt - $fixFirstAt - 1)); if (!text_contains($fixText, '-') && !text_contains($fixText, ' ') && str_replace('-', '', $normText) == text_unicodeToLower(text_removeAccents($fixText))) { $prevPos = 0; while (($pos = mb_strpos($normText, '-', $prevPos)) !== false) { $fixText = text_insert($fixText, '-', $pos); $prevPos = $pos + 1; } $fixDef->internalRep = substr($fixDef->internalRep, 0, $fixFirstAt + 1) . $fixText . substr($fixDef->internalRep, $fixSecondAt); $fixDef->htmlRep = text_htmlize($fixDef->internalRep); $fixDef->save(); print " [{$fixDef->internalRep}]\n"; } } $split++; $l->delete(); $foundHyphenation = true; } } //print "NOT OK: {$l->unaccented}\n"; }
/**
 * Recursively parses a "word" field into a flat list of lexems.
 *
 * The field is decomposed in this order; the first rule that applies wins:
 *   1. At the first '/' that is not nested inside () or []: both halves are
 *      parsed separately and their results concatenated.
 *   2. A trailing "[...]" group is stripped; the remainder is parsed and the
 *      group is handed to appendExtra() for the LAST resulting lexem.
 *   3. A trailing "(...)" group is stripped likewise. If parseModel() finds a
 *      model type and number in it, they (and the restriction) override the
 *      LAST lexem. If the group contains one of the markers listed below,
 *      ALL resulting lexems are forced to model I1 with no restriction.
 *   4. At commas: every part is parsed separately.
 *   5. Otherwise the field is a single word: hyphens are removed, an
 *      apostrophe is inserted before the first lowercase character, the word
 *      is lowercased and a lexem is created from it. If the word contained a
 *      hyphen, the original hyphenated form is passed to appendExtra().
 *
 * @param string $word      Raw field text.
 * @param mixed  $modelType Model type forwarded to Lexem::create().
 * @param mixed  $modelNo   Model number forwarded to Lexem::create().
 * @param mixed  $restr     Restriction forwarded to Lexem::create().
 * @return array List of objects produced by Lexem::create().
 */
function parseWordField($word, $modelType, $modelNo, $restr)
{
    $word = trim($word);

    // Look for a slash not included in brackets.
    $len = mb_strlen($word);
    $depth = 0;
    $slashPos = -1;
    for ($i = 0; $i < $len; $i++) {
        $c = text_getCharAt($word, $i);
        if ($c == '[' || $c == '(') {
            $depth++;
        } elseif ($c == ']' || $c == ')') {
            $depth--;
        }
        if ($c == '/' && !$depth) {
            $slashPos = $i;
            break;
        }
    }

    if ($slashPos >= 0) {
        $left = parseWordField(mb_substr($word, 0, $slashPos), $modelType, $modelNo, $restr);
        $right = parseWordField(mb_substr($word, $slashPos + 1), $modelType, $modelNo, $restr);
        return array_merge($left, $right);
    }

    // Trailing [...] group: attach it to the last lexem of whatever precedes it.
    if (text_endsWith($word, ']')) {
        $pos = mb_strrpos($word, '[');
        assert($pos !== false);
        $extra = mb_substr($word, $pos);
        $results = parseWordField(mb_substr($word, 0, $pos), $modelType, $modelNo, $restr);
        assert(count($results));
        appendExtra($results[count($results) - 1], $extra);
        return $results;
    }

    // Trailing (...) group: may carry a model override and/or a part of speech.
    if (text_endsWith($word, ')')) {
        $pos = mb_strrpos($word, '(');
        assert($pos !== false);
        $extra = mb_substr($word, $pos);
        $results = parseWordField(mb_substr($word, 0, $pos), $modelType, $modelNo, $restr);
        assert(count($results));
        $lastIndex = count($results) - 1;

        // See if $extra contains a model number. If so, use it on the last model.
        // Separate locals are used so the function's own parameters are not clobbered.
        list($extraType, $extraNo, $extraRestr) = parseModel($extra);
        if ($extraType && $extraNo) {
            $results[$lastIndex]->modelType = $extraType;
            $results[$lastIndex]->modelNumber = $extraNo;
            $results[$lastIndex]->restriction = $extraRestr;
        }
        appendExtra($results[$lastIndex], $extra);

        // If $extra dictates a part of speech, apply it to all the lexems.
        // NOTE(review): the second marker contains a line break between "s.f. "
        // and "în expr." in this file. It is preserved exactly as found; confirm
        // whether a plain space was intended.
        $invariableMarkers = array(
            's.f.inv.',
            's.f. 
în expr.',
            's.m.inv.',
            's.n.inv.',
            'adj.inv.',
            'adv.',
            'conj.',
            'prep.',
            'interj.',
        );
        $isInvariable = false;
        foreach ($invariableMarkers as $marker) {
            if (text_contains($extra, $marker)) {
                $isInvariable = true;
                break;
            }
        }
        if ($isInvariable) {
            foreach ($results as $lexem) {
                $lexem->modelType = 'I';
                $lexem->modelNumber = '1';
                $lexem->restriction = '';
            }
        }
        return $results;
    }

    // Comma-separated alternatives. explode() replaces split(), which was
    // removed in PHP 7; the delimiter is a literal comma, so no regex is needed.
    $parts = explode(',', $word);
    if (count($parts) >= 2) {
        $results = array();
        foreach ($parts as $part) {
            $results = array_merge($results, parseWordField($part, $modelType, $modelNo, $restr));
        }
        return $results;
    }

    // Single word. Keep the hyphenated form (if any) as extra information.
    $extra = text_contains($word, '-') ? $word : '';
    $word = str_replace('-', '', $word);

    // Insert an apostrophe before the first lowercase character.
    // NOTE(review): presumably the input is upper-case with the stressed letter
    // in lower-case, making the apostrophe an accent mark — confirm.
    $len = mb_strlen($word);
    for ($i = 0; $i < $len; $i++) {
        if (text_isLowercase(text_getCharAt($word, $i))) {
            $word = text_insert($word, "'", $i);
            break;
        }
    }

    $word = text_unicodeToLower($word);
    $lexem = Lexem::create($word, $modelType, $modelNo, $restr);
    appendExtra($lexem, $extra);
    $lexem->isLoc = true;
    return array($lexem);
}