/**
 * Reads a file and returns its contents as normalized, lowercased UTF-8 text.
 *
 * At most 100,000,000 bytes are read. The bytes are converted from ISO-8859-1
 * to UTF-8, line breaks become spaces, a fixed list of characters and markup
 * snippets is substituted, empty <span style=""> elements and <font> wrappers
 * are removed, runs of spaces are collapsed and the result is passed through
 * text_unicodeToLower().
 *
 * @param string $fileName Path of the file to read.
 * @return string The normalized text. If the file cannot be opened or read,
 *                the same pipeline is applied to an empty string.
 */
function readAndFormatFile($fileName) {
  // Never call fread()/fclose() on a failed handle; an unreadable file is
  // treated as empty input (fopen() already reports the failure itself).
  $data = '';
  $fp = fopen($fileName, 'r');
  if ($fp !== false) {
    $contents = fread($fp, 100000000);
    fclose($fp);
    if ($contents !== false) {
      $data = $contents;
    }
  }

  $data = mb_convert_encoding($data, 'UTF-8', 'ISO-8859-1');

  // NOTE(review): several pairs below are identical as written ('&' => '&',
  // '"' => '"', 'Ă' => 'Ă', 'ă' => 'ă') and 'ş' maps to the uppercase 'Ș'.
  // The search strings may have been HTML entities originally - confirm
  // against the upstream file. The final lowercase pass presumably hides the
  // 'ş' => 'Ș' mismatch.
  $data = str_replace(
    array("\n", "\r", '&', '"', 'ắ', '<span style=""> </span>', 'Ă', 'ă', 'Ş', 'ş', 'Ţ', 'ţ'),
    array(' ', ' ', '&', '"', 'ă', '', 'Ă', 'ă', 'Ș', 'Ș', 'Ț', 'ț'),
    $data);

  // Drop <span style=""> elements that contain nothing but the listed blanks.
  $data = preg_replace('/<span style="">( | )*<\\/span>/', '', $data);

  // Drop <font class="fontN"> wrappers, keeping the text they enclose.
  $data = preg_replace('/<font class="font\\d+">/', '', $data);
  $data = str_replace('</font>', '', $data);

  // Collapse multiple spaces
  $data = preg_replace('/ +/', ' ', $data);

  $data = text_unicodeToLower($data);
  return $data;
}
/**
 * Reduces a lexicon name to its internal form.
 *
 * The name is run through text_shorthandToUnicode(), acute-accented vowels are
 * rewritten as an apostrophe followed by the plain vowel, the result is
 * trimmed, stripped of tags, lowercased and stripped of HTML escape codes.
 * Finally every space, digit and listed punctuation character is removed.
 *
 * @param string $name The lexicon name to convert.
 * @return string The converted name.
 */
function internalizeLexicon($name) {
  $name = text_shorthandToUnicode($name);

  // Parallel lists: each accented letter and its apostrophe-prefixed form.
  $accentedLetters = array('á', 'Á', 'ắ', 'Ắ', 'ấ', 'Ấ', 'é', 'É', 'í', 'Í', 'î́', 'Î́', 'ó', 'Ó', 'ú', 'Ú', 'ý', 'Ý');
  $explicitForms = array("'a", "'A", "'ă", "'Ă", "'â", "'Â", "'e", "'E", "'i", "'I", "'î", "'Î", "'o", "'O", "'u", "'U", "'y", "'Y");
  $name = str_replace($accentedLetters, $explicitForms, $name);

  $name = text_unicodeToLower(strip_tags(trim($name)));

  // Strip HTML escape codes
  $name = preg_replace("/&[^;]+;/", "", $name);

  // Strip all illegal characters
  $illegal = ' !@#$%^&*()-_+=\\|[]{},.<>/?;:"`~0123456789';
  $kept = '';
  $total = mb_strlen($name);
  $pos = 0;
  while ($pos < $total) {
    $ch = text_getCharAt($name, $pos);
    if (strstr($illegal, $ch) === FALSE) {
      $kept .= $ch;
    }
    $pos++;
  }
  return $kept;
}
$lexems[] = $lexem; } // Now associate every lexem with every definition foreach ($defs as $defAssoc) { foreach ($lexems as $lexemAssoc) { LexemDefinitionMap::associate($lexemAssoc->id, $defAssoc->id); } } } foreach ($defs as $fixDef) { $fixFirstAt = strpos($fixDef->internalRep, '@'); $fixSecondAt = strpos($fixDef->internalRep, '@', $fixFirstAt + 1); assert($fixFirstAt === 0); assert($fixSecondAt !== false); $fixText = trim(substr($fixDef->internalRep, $fixFirstAt + 1, $fixSecondAt - $fixFirstAt - 1)); if (!text_contains($fixText, '-') && !text_contains($fixText, ' ') && str_replace('-', '', $normText) == text_unicodeToLower(text_removeAccents($fixText))) { $prevPos = 0; while (($pos = mb_strpos($normText, '-', $prevPos)) !== false) { $fixText = text_insert($fixText, '-', $pos); $prevPos = $pos + 1; } $fixDef->internalRep = substr($fixDef->internalRep, 0, $fixFirstAt + 1) . $fixText . substr($fixDef->internalRep, $fixSecondAt); $fixDef->htmlRep = text_htmlize($fixDef->internalRep); $fixDef->save(); print " [{$fixDef->internalRep}]\n"; } } $split++; $l->delete(); $foundHyphenation = true; }
$homonyms = $l->loadHomonyms(); foreach ($homonyms as $h) { $found |= $extra == $h->modelType . $h->modelNumber . $h->restriction; } } if (!$found) { $defs = Definition::loadByLexemId($l->id); foreach ($defs as $def) { $found |= text_contains($def->internalRep, $extra); } } // Sometimes the extra contains more hyphenation information than // the definitions, but some of that information is obvious and can // be deleted. if (!$found) { $lower = text_unicodeToLower($extra); foreach ($defs as $def) { $letterSet = 'A-Za-zăâîșțĂÂÎȘȚ'; $letter = "[{$letterSet}]"; $letterOrDash = "[-{$letterSet}]"; $other = "[^-{$letterSet}]"; $regexp = "{$other}({$letterOrDash}+-{$letterOrDash}+){$other}"; $matches = array(); $result = preg_match_all("/{$regexp}/", $def->internalRep, $matches); foreach ($matches[1] as $match) { $found |= text_contains($lower, $match); } } } // Sometimes the extra indicates the part of speech if (!$found) {
/**
 * Recases $word position by position so that it follows the casing of $like.
 *
 * Only the first min(length of $word, length of $like) characters are
 * touched: a character becomes uppercase when text_isUppercase() accepts the
 * character of $like at the same position, and lowercase otherwise. Any
 * remaining tail of $word is returned unchanged.
 *
 * @param string $word The word whose letter case is adjusted.
 * @param string $like The word whose letter case serves as the template.
 * @return string $word with the adjusted letter case.
 **/
function matchCase($word, $like) {
  $limit = min(mb_strlen($word), mb_strlen($like));
  $idx = 0;
  while ($idx < $limit) {
    $current = text_getCharAt($word, $idx);
    $template = text_getCharAt($like, $idx);
    $recased = text_isUppercase($template)
      ? text_unicodeToUpper($current)
      : text_unicodeToLower($current);
    // Splice the recased character back in at the same position.
    $word = mb_substr($word, 0, $idx) . $recased . mb_substr($word, $idx + 1);
    $idx++;
  }
  return $word;
}
/**
 * Produces a simplified form of $s.
 *
 * The text is trimmed and lowercased via text_unicodeToLower(), then every
 * '@', '$', '^', digit, parenthesis, '%' and '.' is removed. If the result
 * ends with a hyphen, that single trailing hyphen is dropped.
 *
 * @param string $s The text to simplify.
 * @return string The simplified text.
 */
function simplifyText($s) {
  $lowered = text_unicodeToLower(trim($s));
  $stripped = preg_replace("/[@\$^0-9()%.]/", "", $lowered);
  if (!text_endsWith($stripped, '-')) {
    return $stripped;
  }
  // The hyphen is a single byte, so byte-wise truncation is sufficient.
  return substr($stripped, 0, strlen($stripped) - 1);
}
$dbResult = mysql_query("select * from lexems where lexem_model_type = 'T' " . "order by lexem_neaccentuat"); $seen = 0; $removed = 0; $biologyTerms = array('plantă', 'pom', 'arbore', 'arbust', 'bot', 'zool', 'mamifer', 'animal'); while (($dbRow = mysql_fetch_assoc($dbResult)) != null) { $l = Lexem::createFromDbRow($dbRow); $seen++; $defs = Definition::loadByLexemId($l->id); $matchingLexicon = false; $biology = false; $appears = false; foreach ($defs as $def) { if (str_replace('î', 'â', $def->lexicon) == str_replace('î', 'â', $l->unaccented)) { $matchingLexicon = true; } $rep = text_unicodeToLower($def->internalRep); $rep = str_replace(array('$', '@', '%', '.', ',', '(', ')', ';', ':'), array('', '', '', '', '', '', '', '', ''), $rep); $words = split("[ \n\t]", $rep); foreach ($words as $word) { $biology |= in_array($word, $biologyTerms); $appears |= $l->unaccented == $word; } } if (!$matchingLexicon && !text_contains($l->form, ' ') && $biology) { if ($appears) { print "Changing {$l->id} {$l->form} to I2\n"; $l->modelType = 'I'; $l->modelNumber = '2'; $l->restriction = ''; $l->noAccent = true; $l->save();
/**
 * Tells whether $form ends with $suffix when the two may mark accents
 * differently.
 *
 * $suffix is lowercased first. An accent counts as "marked" when an apostrophe
 * is present: in $form as given, and in $suffix after its accented letters
 * have been rewritten through $GLOBALS['text_accented'] and
 * $GLOBALS['text_explicitAccent'] (presumably parallel lists of accented
 * letters and their apostrophe-prefixed equivalents - confirm).
 *
 *  - both marked:      $form is converted to accented letters and compared
 *                      against the lowercased $suffix;
 *  - only $form:       the apostrophes are dropped from $form;
 *  - only $suffix:     the apostrophes are dropped from the rewritten suffix;
 *  - neither marked:   the strings are compared as they are.
 *
 * @param string $form   The word form to test.
 * @param string $suffix The ending to look for.
 * @return mixed Whatever text_endsWith() returns for the reconciled pair.
 */
function matchesWithAccent($form, $suffix) {
  $suffix = text_unicodeToLower($suffix);
  $suffixExpl = str_replace($GLOBALS['text_accented'], $GLOBALS['text_explicitAccent'], $suffix);
  $formMarked = strstr($form, "'") !== false;
  $suffixMarked = strstr($suffixExpl, "'") !== false;

  if ($formMarked && $suffixMarked) {
    $formImpl = str_replace($GLOBALS['text_explicitAccent'], $GLOBALS['text_accented'], $form);
    return text_endsWith($formImpl, $suffix);
  }

  if ($formMarked) {
    // Only the form marks an accent: ignore the mark.
    return text_endsWith(str_replace("'", "", $form), $suffix);
  }

  if ($suffixMarked) {
    // Only the suffix marks an accent: ignore the mark.
    return text_endsWith($form, str_replace("'", "", $suffixExpl));
  }

  // No accents
  return text_endsWith($form, $suffix);
}
/**
 * Parses a word field into a list of lexems.
 *
 * The field is handled recursively, in this order:
 *  1. A '/' outside of any brackets or parentheses splits the field in two
 *     alternatives; the results of both halves are concatenated.
 *  2. A trailing [...] group is appended, via appendExtra(), to the last lexem
 *     parsed from the text preceding it.
 *  3. A trailing (...) group is handed to parseModel(); if it yields a model
 *     type and number they (and the restriction) replace those of the last
 *     lexem. The group is also appended as extra. If it contains one of the
 *     listed part-of-speech markers, every lexem is set to model I1 with an
 *     empty restriction.
 *  4. A comma-separated field is parsed part by part.
 *  5. Otherwise a single lexem is created: hyphens are removed (the hyphenated
 *     spelling is kept as extra), an apostrophe is inserted before the first
 *     character accepted by text_isLowercase(), the word is lowercased and
 *     passed to Lexem::create(). The lexem's isLoc flag is set.
 *
 * @param string $word      The raw word field.
 * @param mixed  $modelType Model type forwarded to Lexem::create().
 * @param mixed  $modelNo   Model number forwarded to Lexem::create().
 * @param mixed  $restr     Restriction forwarded to Lexem::create().
 * @return array The lexems created from the field.
 */
function parseWordField($word, $modelType, $modelNo, $restr) {
  $word = trim($word);

  // Look for a slash not included in brackets
  $len = mb_strlen($word);
  $parCount = 0;
  $i = 0;
  $found = false;
  while ($i < $len && !$found) {
    $c = text_getCharAt($word, $i);
    if ($c == '[' || $c == '(') {
      $parCount++;
    } else {
      if ($c == ']' || $c == ')') {
        $parCount--;
      }
    }
    if ($c == '/' && !$parCount) {
      $found = true;
    } else {
      $i++;
    }
  }

  if ($found) {
    $r1 = parseWordField(mb_substr($word, 0, $i), $modelType, $modelNo, $restr);
    $r2 = parseWordField(mb_substr($word, $i + 1), $modelType, $modelNo, $restr);
    return array_merge($r1, $r2);
  }

  if (text_endsWith($word, ']')) {
    $pos = mb_strrpos($word, '[');
    assert($pos !== false);
    $extra = mb_substr($word, $pos);
    $results = parseWordField(mb_substr($word, 0, $pos), $modelType, $modelNo, $restr);
    assert(count($results));
    appendExtra($results[count($results) - 1], $extra);
    return $results;
  }

  if (text_endsWith($word, ')')) {
    $pos = mb_strrpos($word, '(');
    assert($pos !== false);
    $extra = mb_substr($word, $pos);
    $results = parseWordField(mb_substr($word, 0, $pos), $modelType, $modelNo, $restr);
    assert(count($results));

    // See if $extra contains a model number. If so, use it on the last model.
    list($modelType, $modelNo, $restr) = parseModel($extra);
    if ($modelType && $modelNo) {
      $results[count($results) - 1]->modelType = $modelType;
      $results[count($results) - 1]->modelNumber = $modelNo;
      $results[count($results) - 1]->restriction = $restr;
    }
    appendExtra($results[count($results) - 1], $extra);

    // If $extra dictates a part of speech, apply it to all the lexems.
    // NOTE(review): the second literal below spans a line break, reproduced
    // exactly as it appears in the source - confirm the intended separator.
    if (text_contains($extra, 's.f.inv.') ||
        text_contains($extra, 's.f. 
în expr.') ||
        text_contains($extra, 's.m.inv.') ||
        text_contains($extra, 's.n.inv.') ||
        text_contains($extra, 'adj.inv.') ||
        text_contains($extra, 'adv.') ||
        text_contains($extra, 'conj.') ||
        text_contains($extra, 'prep.') ||
        text_contains($extra, 'interj.')) {
      foreach ($results as $l) {
        $l->modelType = 'I';
        $l->modelNumber = '1';
        $l->restriction = '';
      }
    }
    return $results;
  }

  // explode() replaces the removed split(); the delimiter is a literal comma,
  // so the resulting parts are the same.
  $parts = explode(',', $word);
  if (count($parts) >= 2) {
    $results = array();
    foreach ($parts as $part) {
      $results = array_merge($results, parseWordField($part, $modelType, $modelNo, $restr));
    }
    return $results;
  }

  // Single word: remember the hyphenated spelling, then work on the bare word.
  $extra = text_contains($word, '-') ? $word : '';
  $word = str_replace('-', '', $word);

  // Insert an apostrophe before the first lowercase character.
  $len = mb_strlen($word);
  $found = false;
  for ($i = 0; $i < $len && !$found; $i++) {
    $c = text_getCharAt($word, $i);
    if (text_isLowercase($c)) {
      $found = true;
      $word = text_insert($word, "'", $i);
    }
  }

  $word = text_unicodeToLower($word);
  $l = Lexem::create($word, $modelType, $modelNo, $restr);
  appendExtra($l, $extra);
  $l->isLoc = true;
  return array($l);
}