/**
 * Strips redundant |text|reference| markers out of a string.
 *
 * The input is scanned byte by byte. Every '|' advances a three-state cycle:
 * outside a marker -> inside the text part -> inside the reference part ->
 * outside again. When a marker closes, it is replaced by its bare text if any
 * of the redundancy checks below succeeds; otherwise the marker is emitted
 * unchanged as |text|reference|.
 *
 * Side effects: increments the globals $dropped (marker replaced by its text)
 * and $kept (marker preserved).
 *
 * @param string $s Text possibly containing |text|reference| markers.
 * @return string The text with redundant markers collapsed.
 */
function cleanupReferences($s) {
  global $dropped, $kept;

  $out = '';
  $label = '';
  $target = '';
  $state = 0; // 0 = outside a marker; 1 = reading text; 2 = reading reference
  $length = strlen($s);

  for ($pos = 0; $pos < $length; $pos++) {
    $c = $s[$pos];

    // Ordinary character: route it to whichever buffer the state selects.
    if ($c !== '|') {
      if ($state === 0) {
        $out .= $c;
      } elseif ($state === 1) {
        $label .= $c;
      } else {
        $target .= $c;
      }
      continue;
    }

    // A bar seen while reading the reference closes the marker.
    if ($state === 2) {
      $simpleLabel = simplifyText($label);
      $simpleTarget = simplifyText($target);

      // NOTE(review): the last two checks compare against the raw reference,
      // not the simplified one -- confirm whether that is intended.
      $redundant = $simpleLabel == $simpleTarget
        || $simpleTarget == ''
        || isInflectedForm($simpleLabel, $simpleTarget)
        || text_endsWith($simpleLabel, ' ' . $target)
        || text_startsWith($simpleLabel, $target . ' ');

      if ($redundant) {
        $out .= $label;
        $dropped++;
      } else {
        $out .= "|{$label}|{$target}|";
        $kept++;
      }

      $label = '';
      $target = '';
    }

    $state = ($state + 1) % 3;
  }

  // Bars must come in complete groups of three.
  assert($state == 0);
  return $out;
}
<?php require_once '../../phplib/util.php'; assert_options(ASSERT_BAIL, 1); debug_off(); $dbResult = mysql_query("select * from lexems where lexem_extra != ''"); $seen = 0; $removed = 0; while (($dbRow = mysql_fetch_assoc($dbResult)) != null) { $l = Lexem::createFromDbRow($dbRow); $seen++; $extra = $l->extra; if (text_startsWith($extra, '[') && text_endsWith($extra, ']')) { $extra = mb_substr($extra, 1, mb_strlen($extra) - 2); } if (text_startsWith($extra, '(') && text_endsWith($extra, ')')) { $extra = mb_substr($extra, 1, mb_strlen($extra) - 2); } // Sometimes the extra is just the model number $found = $extra == $l->modelType . $l->modelNumber . $l->restriction; // Sometimes the extra refers to a homonym's model if (!$found) { $homonyms = $l->loadHomonyms(); foreach ($homonyms as $h) { $found |= $extra == $h->modelType . $h->modelNumber . $h->restriction; } } if (!$found) { $defs = Definition::loadByLexemId($l->id); foreach ($defs as $def) { $found |= text_contains($def->internalRep, $extra);
/**
 * Turns a raw form string into an array of normalized forms.
 *
 * Two literal inputs are handled as hard-coded exceptions and expand directly
 * into two variants each. Any other form has its hyphens removed, its
 * acute-accented vowels rewritten as an apostrophe followed by the plain
 * vowel, and surrounding whitespace trimmed, and is then handed to
 * normalizeFormRecursively(), whose result is returned as is.
 *
 * @param string $form The raw form.
 * @return array The two-element variant list for the special cases; otherwise
 *               whatever normalizeFormRecursively() returns.
 */
function normalizeForm($form) {
  // Exception 1: "staro-s/ș-ti" plus any ending yields both the s and the ș spelling.
  if (text_startsWith($form, 'staro-s/ș-ti')) {
    $ending = mb_substr($form, 12); // 12 = character length of the prefix
    return array('starosti' . $ending, 'staroști' . $ending);
  }

  // Exception 2: optional leading "a".
  if ($form == "[a]iastalaltă (-tă-)") {
    return array("aiastalaltă", "iastalaltă");
  }

  $substitutions = array(
    '-' => '',
    'á' => "'a",
    'é' => "'e",
    'í' => "'i",
    'ó' => "'o",
    'ú' => "'u",
  );
  $cleaned = str_replace(array_keys($substitutions), array_values($substitutions), $form);

  return normalizeFormRecursively(trim($cleaned));
}
/**
 * Returns a list containing the next token and the new stream position.
 *
 * Reads from the global buffer $data (length $dataLen), starting at $pos and
 * skipping leading whitespace first. Tokens are:
 *   - opening tags, with the attributes stripped (e.g. '<div class="x">' => '<div>');
 *   - closing tags;
 *   - comments, which are consumed up to the closing '-->' and then truncated
 *     like any other tag;
 *   - text, running up to the next '<' or the end of the data, trimmed.
 * If we reach the end of the data, the token is empty and the returned
 * position is $dataLen.
 *
 * A tag or comment that is still open when the data ends is treated as if it
 * were closed there, instead of reading past the end of the buffer.
 *
 * @param int $pos Offset in $data at which to start reading.
 * @return array array(token, new position)
 **/
function getNextToken($pos) {
  global $data;
  global $dataLen;

  while ($pos < $dataLen && ctype_space($data[$pos])) {
    $pos++;
  }
  if ($pos >= $dataLen) {
    return array('', $dataLen);
  }

  $result = '';

  // If we hit a '<' sign, parse tag.
  if ($data[$pos] == '<') {
    // $pos < $dataLen holds on entry, so at least one character is consumed.
    // The bounds check stops the scan if the tag is never terminated.
    $done = false;
    while (!$done && $pos < $dataLen) {
      $result .= $data[$pos];
      // Comments may contain '>' themselves, so they only end at '-->'.
      $done = text_startsWith($result, '<!--')
        ? text_endsWith($result, '-->')
        : $data[$pos] == '>';
      $pos++;
    }

    // Strip the attributes: keep everything up to the first whitespace or '>'.
    // An unterminated tag has no closing '>', hence the length check.
    $resultLen = strlen($result);
    $tagEnd = 1;
    while ($tagEnd < $resultLen && !ctype_space($result[$tagEnd]) && $result[$tagEnd] != '>') {
      $tagEnd++;
    }
    $result = substr($result, 0, $tagEnd) . '>';
    return array($result, $pos);
  }

  // Parse text to the next '<' sign or EOF.
  while ($pos < $dataLen && $data[$pos] != '<') {
    $result .= $data[$pos];
    $pos++;
  }
  return array(trim($result), $pos);
}