<?php require_once '../../phplib/util.php'; assert_options(ASSERT_BAIL, 1); debug_off(); $dbResult = mysql_query("select * from lexems where lexem_extra != ''"); $seen = 0; $removed = 0; while (($dbRow = mysql_fetch_assoc($dbResult)) != null) { $l = Lexem::createFromDbRow($dbRow); $seen++; $extra = $l->extra; if (text_startsWith($extra, '[') && text_endsWith($extra, ']')) { $extra = mb_substr($extra, 1, mb_strlen($extra) - 2); } if (text_startsWith($extra, '(') && text_endsWith($extra, ')')) { $extra = mb_substr($extra, 1, mb_strlen($extra) - 2); } // Sometimes the extra is just the model number $found = $extra == $l->modelType . $l->modelNumber . $l->restriction; // Sometimes the extra refers to a homonym's model if (!$found) { $homonyms = $l->loadHomonyms(); foreach ($homonyms as $h) { $found |= $extra == $h->modelType . $h->modelNumber . $h->restriction; } } if (!$found) { $defs = Definition::loadByLexemId($l->id); foreach ($defs as $def) { $found |= text_contains($def->internalRep, $extra);
$otherLexems = Lexem::loadByForm($form); if (count($otherLexems)) { print "REMOVING -UL FROM: {$l->form}\n"; foreach ($otherLexems as $otherLexem) { foreach ($defs as $def) { LexemDefinitionMap::associate($otherLexem->id, $def->id); } } $l->delete(); $split++; continue; } } // Split the word in two, if it leads to two existing lexems, and if both // have at least three letters. if (mb_strlen($l->unaccented) >= 8 && (text_endsWith($l->form, 'lui') || text_endsWith($l->form, 'ei') || text_endsWith($l->form, 'ii') || text_endsWith($l->form, 'elor') || text_endsWith($l->form, 'ilor') || text_endsWith($l->form, 'asă') || text_endsWith($l->form, 'scă'))) { $len = mb_strlen($l->unaccented); for ($splitPoint = 3; $splitPoint <= $len - 3; $splitPoint++) { $word1 = mb_substr($l->unaccented, 0, $splitPoint); $word2 = mb_substr($l->unaccented, $splitPoint); $l1 = getNouns($word1); $l2 = getNouns($word2); if (count($l1) && count($l2)) { print "[{$l->form}] [{$word1}] [{$word2}]\n"; $all = array_merge($l1, $l2); foreach ($defs as $def) { foreach ($all as $newLexem) { LexemDefinitionMap::associate($newLexem->id, $def->id); } // Also fix the definition if it is missing a hyphen. $firstAt = strpos($def->internalRep, '@');
require_once '../phplib/ads/adsModule.php'; require_once '../phplib/ads/diverta/divertaAdsModule.php'; $opts = getopt('s:'); if (count($opts) != 1) { print "Usage: fixDivertaBooks -s <start-id>\n"; exit; } // Resolve some ambiguities automatically. List the form that is alphabetically first and specify what to return $PREFERRED_FORMS = array('a' => 'a', 'al' => 'al', 'carte' => 'carte', 'cartea' => 'cartea', 'clasa' => 'clasa', 'fara' => 'fără', 'i' => 'i', 'ii' => 'ii', 'in' => 'în', 'la' => 'la', 'mai' => 'mai', 'mare' => 'mare', 'marea' => 'marea', 'povesti' => 'povești', 'print' => 'prinț', 'printul' => 'prințul', 's' => 's', 'sa' => 'să', 'si' => 'și', 'teste' => 'teste', 'ti' => 'ți', 'timp' => 'timp', 'top' => 'top'); $books = db_find(new DivertaBook(), "id >= {$opts['s']} order by id"); foreach ($books as $book) { print "Loaded: {$book->id} [{$book->title}] [{$book->url}]\n"; $origTitle = $book->title; // Preliminary stuff $book->title = trim($book->title); if (text_endsWith($book->title, ', ***')) { $book->title = substr($book->title, 0, -5); } switch ($book->sku) { case 'YDA00965': $book->title = 'Dicționar vizual spaniol-român'; break; case 'YHG00310': $book->title = '77 de rețete celebre și poveștile lor'; break; case 'YHU02030': $book->title = 'Zen aici și acum'; break; case 'YCV00945': $book->title = 'Bărbatul manipulator'; break;
<?php require_once '../phplib/util.php'; $INFL_LONG_INF = 50; $INFL_PART = 52; $LEXEM_EDIT_URL = 'http://dexonline.ro/admin/lexemEdit.php?lexemId='; $dbResult = db_execute("select * from Lexem where modelType in ('V', 'VT') and isLoc order by formNoAccent"); while (!$dbResult->EOF) { $verb = new Lexem(); $verb->set($dbResult->fields); $dbResult->MoveNext(); $ifs = db_find(new InflectedForm(), "lexemId = {$verb->id} and inflectionId = {$INFL_LONG_INF}"); assert(count($ifs) <= 1); if (count($ifs) == 1) { $longInfForm = $ifs[0]->formNoAccent; $longInfModelNumber = text_endsWith($longInfForm, 'are') ? '113' : '107'; $lexems = db_find(new Lexem(), "formNoAccent = '{$longInfForm}' and modelType = 'F' and modelNumber = '{$longInfModelNumber}'"); if (count($lexems) != 1) { print "I {$longInfForm} are " . count($lexems) . " lexeme corespunzătoare\n"; } foreach ($lexems as $longInf) { if (!$longInf->isLoc) { print "I {$longInf->formNoAccent} nu este în LOC {$LEXEM_EDIT_URL}{$longInf->id}\n"; $longInf->isLoc = 1; $longInf->save(); } } } if ($verb->modelType == 'VT') { $ifs = db_find(new InflectedForm(), "lexemId = {$verb->id} and inflectionId = {$INFL_PART}"); $pm = ParticipleModel::loadByVerbModel($verb->modelNumber);
/**
 * Reduces a string to a simplified form.
 *
 * The input is trimmed and lowercased (via text_unicodeToLower), then every
 * occurrence of the characters @ $ ^ ( ) % . and of the digits 0-9 is removed.
 * Finally, if the outcome ends with a hyphen, that one trailing hyphen is
 * dropped.
 *
 * @param string $s The text to simplify.
 * @return string The simplified text.
 */
function simplifyText($s) {
  $lowered = text_unicodeToLower(trim($s));
  $stripped = preg_replace("/[@\$^0-9()%.]/", "", $lowered);

  // Nothing more to do unless there is a dangling hyphen at the end.
  if (!text_endsWith($stripped, '-')) {
    return $stripped;
  }

  return substr($stripped, 0, strlen($stripped) - 1);
}
<?php
/**
 * Maintenance script: finds every Definition whose internal representation
 * ends with the "Notă: Definiția este preluată din Dicționar enciclopedic ..."
 * attribution, cuts that note off, regenerates the HTML representation,
 * sets sourceId to 25 and saves the definition. Prints the number of
 * definitions modified.
 */

// MySQL regular expression matching the trailing note (optionally preceded by
// a space and/or an opening parenthesis).
define('NOTE', ' ?[(]?Notă: Definiția este preluată din Dicționar enciclopedic vol\\. .*, Editura Enciclopedică, ....[)]\\.?$');

$query = "select * from Definition where InternalRep rlike '" . NOTE . "'";
$dbResult = mysql_query($query);
$count = 0;

while ($dbRow = mysql_fetch_assoc($dbResult)) {
  $def = Definition::createFromDbRow($dbRow);

  $pos = strpos($def->internalRep, 'Notă:');
  if ($pos === false) {
    // The SQL match may be looser than this byte-exact search (e.g. a
    // case-insensitive collation). Without this guard, substr() below would
    // receive false as its length and wipe the whole definition.
    continue;
  }

  // Keep everything before the note, then drop the opening parenthesis (if
  // any) and the surrounding whitespace.
  $rep = substr($def->internalRep, 0, $pos);
  if (text_endsWith($rep, '(')) {
    $rep = substr($rep, 0, -1);
  }
  $rep = trim($rep);

  $def->internalRep = $rep;
  // Fixed: this used to assign to the undefined variable $definition, so the
  // regenerated HTML never reached the object being saved.
  $def->htmlRep = text_htmlize($def->internalRep);
  // NOTE(review): 25 is presumably the id of the "Dicționar enciclopedic"
  // source -- confirm against the sources table.
  $def->sourceId = 25;
  $def->save();
  $count++;
}

print "{$count} definitions modified.\n";
/**
 * Tells whether $form ends with $suffix, reconciling accent notations.
 *
 * The suffix is lowercased first. An accent is considered present when the
 * string contains an apostrophe once accented characters
 * ($GLOBALS['text_accented']) have been rewritten in their explicit form
 * ($GLOBALS['text_explicitAccent']); the form itself is only checked for a
 * literal apostrophe.
 *
 *  - Both accented: the form is converted to accented characters and compared
 *    against the lowercased suffix.
 *  - Only one accented: the apostrophes are removed from that side before
 *    comparing.
 *  - Neither accented: plain suffix comparison.
 *
 * @param string $form   The form to test.
 * @param string $suffix The suffix to look for.
 * @return bool Result of text_endsWith() on the reconciled strings.
 */
function matchesWithAccent($form, $suffix) {
  $lowerSuffix = text_unicodeToLower($suffix);
  $explicitSuffix = str_replace($GLOBALS['text_accented'], $GLOBALS['text_explicitAccent'], $lowerSuffix);

  $accentInForm = strpos($form, "'") !== false;
  $accentInSuffix = strpos($explicitSuffix, "'") !== false;

  // No accents anywhere: compare as they are.
  if (!$accentInForm && !$accentInSuffix) {
    return text_endsWith($form, $lowerSuffix);
  }

  // Only the suffix carries an accent: ignore it.
  if (!$accentInForm) {
    $bareSuffix = str_replace("'", "", $explicitSuffix);
    return text_endsWith($form, $bareSuffix);
  }

  // Only the form carries an accent: ignore it.
  if (!$accentInSuffix) {
    $bareForm = str_replace("'", "", $form);
    return text_endsWith($bareForm, $lowerSuffix);
  }

  // Both carry accents: the accent positions must agree.
  $accentedForm = str_replace($GLOBALS['text_explicitAccent'], $GLOBALS['text_accented'], $form);
  return text_endsWith($accentedForm, $lowerSuffix);
}
/**
 * Returns a list containing the next token and the new stream position.
 * If we reach the end of the file, the token is empty.
 * Tokens are opening tags (without the attributes), closing tags or text.
 *
 * Reads from the globals $data (the input string) and $dataLen (its length).
 * Leading whitespace is skipped. Comments are read up to the closing '-->'.
 * A tag or comment that is still open when the input ends is returned
 * truncated (with a '>' appended) and the position is set to the end of the
 * input, rather than reading past the end of $data.
 *
 * @param int $pos Offset in $data at which to start reading.
 * @return array Two elements: the token string and the offset following it.
 **/
function getNextToken($pos) {
  global $data;
  global $dataLen;

  while ($pos < $dataLen && ctype_space($data[$pos])) {
    $pos++;
  }
  if ($pos >= $dataLen) {
    return array('', $dataLen);
  }

  $result = '';

  // If we hit a '<' sign, parse tag.
  if ($data[$pos] == '<') {
    do {
      $result .= $data[$pos];
      // Comments may contain '>' characters, so they only end at '-->'.
      $done = text_startsWith($result, '<!--')
        ? text_endsWith($result, '-->')
        : $data[$pos] == '>';
      $pos++;
      // The bounds check prevents an endless loop on an unterminated tag.
    } while (!$done && $pos < $dataLen);

    // Strip the attributes: keep everything up to the first whitespace or '>'.
    $resultLen = strlen($result);
    $tagEnd = 1;
    while ($tagEnd < $resultLen && !ctype_space($result[$tagEnd]) && $result[$tagEnd] != '>') {
      $tagEnd++;
    }
    $result = substr($result, 0, $tagEnd) . '>';
    return array($result, $pos);
  }

  // Parse text to the next '<' sign or EOF.
  while ($pos < $dataLen && $data[$pos] != '<') {
    $result .= $data[$pos];
    $pos++;
  }
  return array(trim($result), $pos);
}
$d->lexicon = text_extractLexicon($d); $d->status = ST_ACTIVE; if (array_key_exists($d->lexicon, $existing)) { $skipped++; } else { // Handle some special cases where the lexicon needs to be adjusted. if (preg_match("/^@[^@,]+ /", $d->internalRep)) { $pos = strpos($d->internalRep, '@', 1); $text = substr($d->internalRep, 1, $pos - 1); $parts = split(' ', $text); assert(count($parts) >= 2); if ($parts[count($parts) - 1] == 'II.') { $d->lexicon = text_internalizeWordName($parts[0]); $d->status = ST_PENDING; } else { if (count($parts) == 2 && text_endsWith($parts[1], '/')) { // Use the first part only, because the second one is the pronunciation $d->lexicon = text_internalizeWordName($parts[0]); //print "Using [{$d->lexicon}] for " . mb_substr($d->internalRep, 0, 50) . "\n"; } else { $d->status = ST_PENDING; } } } if ($d->lexicon) { $lexems = Lexem::loadByUnaccented($d->lexicon); if (!count($lexems)) { $lexem = Lexem::create($d->lexicon, 'T', '1', ''); $lexem->save(); $lexem->id = db_getLastInsertedId(); $lexem->regenerateParadigm();