print "Examining " . mysql_num_rows($dbResult) . " definitions.\n"; $count = 0; $dropped = 0; $kept = 0; while (($dbRow = mysql_fetch_assoc($dbResult)) != null) { $d = new Definition(); $d->populateFromDbRow($dbRow); $count++; if ($count % 10000 == 0) { print "{$count} definitions examined, {$dropped} dropped, {$kept} kept\n"; } $newRep = cleanupReferences($d->internalRep); if ($newRep != $d->internalRep) { //print "{$d->internalRep}\n{$newRep}\n"; $d->internalRep = $newRep; $d->htmlRep = text_htmlize($d->internalRep); $d->save(); } } print "{$count} definitions examined, {$dropped} dropped, {$kept} kept\n"; function cleanupReferences($s) { global $dropped; global $kept; $result = ''; $text = ''; $ref = ''; $mode = 0; // 0 = not between bars; 1 = text; 2 = reference for ($i = 0; $i < strlen($s); $i++) { $char = $s[$i];
} } foreach ($defs as $fixDef) { $fixFirstAt = strpos($fixDef->internalRep, '@'); $fixSecondAt = strpos($fixDef->internalRep, '@', $fixFirstAt + 1); assert($fixFirstAt === 0); assert($fixSecondAt !== false); $fixText = trim(substr($fixDef->internalRep, $fixFirstAt + 1, $fixSecondAt - $fixFirstAt - 1)); if (!text_contains($fixText, '-') && !text_contains($fixText, ' ') && str_replace('-', '', $normText) == text_unicodeToLower(text_removeAccents($fixText))) { $prevPos = 0; while (($pos = mb_strpos($normText, '-', $prevPos)) !== false) { $fixText = text_insert($fixText, '-', $pos); $prevPos = $pos + 1; } $fixDef->internalRep = substr($fixDef->internalRep, 0, $fixFirstAt + 1) . $fixText . substr($fixDef->internalRep, $fixSecondAt); $fixDef->htmlRep = text_htmlize($fixDef->internalRep); $fixDef->save(); print " [{$fixDef->internalRep}]\n"; } } $split++; $l->delete(); $foundHyphenation = true; } } //print "NOT OK: {$l->unaccented}\n"; } print "Seen {$seen} lexems, split {$split} lexems.\n"; function getNouns($word) { $lexems = Lexem::searchWordlists($word, true);
<?php
/**
 * One-off maintenance script.
 *
 * Selects every definition whose internal representation matches the NOTE
 * pattern, cuts the text at the "Notă:" marker (dropping an opening
 * parenthesis left dangling right before it), regenerates the HTML
 * representation, sets sourceId to 25 and saves the definition.
 */

define('NOTE', ' ?[(]?Notă: Definiția este preluată din Dicționar enciclopedic vol\\. .*, Editura Enciclopedică, ....[)]\\.?$');

$query = "select * from Definition where InternalRep rlike '" . NOTE . "'";
$dbResult = mysql_query($query);
$count = 0;

while ($dbRow = mysql_fetch_assoc($dbResult)) {
  $def = Definition::createFromDbRow($dbRow);

  $pos = strpos($def->internalRep, 'Notă:');
  if ($pos === false) {
    // The SQL match and this byte-wise search can disagree (for example on
    // letter case). Without this guard substr() would be given a false length
    // and the whole definition text would be wiped.
    continue;
  }

  // Keep only the text that precedes the note.
  $rep = substr($def->internalRep, 0, $pos);
  if (text_endsWith($rep, '(')) {
    // The note was parenthesized; drop the orphaned opening parenthesis.
    $rep = substr($rep, 0, -1);
  }
  $rep = trim($rep);

  $def->internalRep = $rep;
  // Fixed: the original assigned to the undefined $definition, so the saved
  // record kept its stale HTML representation.
  $def->htmlRep = text_htmlize($def->internalRep);
  $def->sourceId = 25;
  $def->save();
  $count++;
}

print "{$count} definitions modified.\n";
<?php
// The HTML generated for spaced text has changed, so the stored HTML of the
// affected definitions has to be regenerated. Only rows whose HTML actually
// differs are saved and counted.
//
// NOTE(review): the original remark on this script talks about definitions
// containing % signs, but the LIKE pattern below selects rows containing a
// '|' character -- confirm which marker was intended.

$selectSql = 'select * from Definition where InternalRep like "%|%"';
$rows = mysql_query($selectSql);
$converted = 0;

while ($record = mysql_fetch_assoc($rows)) {
  $definition = Definition::createFromDbRow($record);
  $freshHtml = text_htmlize($definition->internalRep);

  if ($freshHtml === $definition->htmlRep) {
    // Stored HTML is already up to date; nothing to write.
    continue;
  }

  $definition->htmlRep = $freshHtml;
  $definition->save();
  $converted++;
}

print "{$converted} definitions converted.\n";
<?php // We want the internalRep field to contain ONLY Unicode. // TODO: Fix all tables, not just definitions. require_once "../../phplib/util.php"; ini_set('max_execution_time', '3600'); $GLOBALS['htmlEntities'] = array(); $dbResult = mysql_query("select * from Definition"); $numRows = mysql_num_rows($dbResult); $i = 0; $changed = 0; while ($dbRow = mysql_fetch_assoc($dbResult)) { $def = new Definition(); $def->populateFromDbRow($dbRow); $newInternalRep = myConvert($def->internalRep); $newHtmlRep = text_htmlize($newInternalRep); $internalRepChanged = $newInternalRep != $def->internalRep; $htmlRepChanged = $newHtmlRep != $def->htmlRep; if ($internalRepChanged || $htmlRepChanged) { // write a custom query so we don't update the ModDate field // (also for speed) $query = sprintf("update Definition set InternalRep = '%s', " . "HtmlRep = '%s' where Id = '%d'", addslashes($newInternalRep), addslashes($newHtmlRep), $def->id); mysql_query($query); $changed++; print "Changed definition " . $def->id; if ($htmlRepChanged) { print " (HtmlRep has changed)"; } print "\n"; } $i++;
<?php
// After migrating the entire schema to UTF8, some definitions were
// converted badly. This script rewrites them from data.txt, where every line
// has the form "<definition id>|||||<internal representation>".

require_once "../../phplib/util.php";

$data = file('data.txt');
if ($data === false) {
  // file() returns false when the file cannot be read; iterating over that
  // would only produce warnings.
  print "Cannot read data.txt\n";
  exit(1);
}

$i = 0;
$numLines = count($data);

foreach ($data as $line) {
  $i++;

  // explode() splits on the literal separator. It replaces the POSIX-regex
  // split(), which was deprecated in PHP 5.3 and removed in PHP 7.
  $components = explode('|||||', $line);
  if (count($components) < 2) {
    // Without the separator there is no text to apply; the original code
    // would have overwritten a definition with an empty string here.
    print "Skipping malformed line {$i}/{$numLines}\n";
    continue;
  }

  $id = $components[0];
  $internalRep = $components[1];

  // Normalize the badly converted dash sequences to '-' and drop backslashes.
  // The replacements are applied in the listed order.
  $internalRep = str_replace(
    array(chr(0x96), 'â' . chr(0x80) . chr(0x93), '\\'),
    array('-', '-', ''),
    $internalRep
  );
  $internalRep = trim($internalRep);

  print "Patching definition {$i}/{$numLines} ({$id})\n";
  $def = Definition::load($id);
  if (!$def) {
    // NOTE(review): assumes load() yields a falsy value for an unknown id --
    // confirm against Definition::load.
    print "Definition {$id} could not be loaded, skipping\n";
    continue;
  }

  $def->internalRep = $internalRep;
  $def->htmlRep = text_htmlize($internalRep);
  // The lexicon is derived from the definition, so recompute it after the
  // text has changed.
  $def->lexicon = text_extractLexicon($def);
  $def->save();
}
} else { $newRep .= $chunk; } } else { $newRep .= $chunk; } $i = $j - 1; } else { $newRep .= $c; } $prevC = $c; } if ($newRep != $rep) { //print "Rep: {$rep}\nNew rep: {$newRep}\n"; $def->internalRep = $newRep; $def->htmlRep = text_htmlize($newRep); $def->save(); } } /********************************************************/ /** * */ function matchesWithAccent($form, $suffix) { $suffix = text_unicodeToLower($suffix); $suffixExpl = str_replace($GLOBALS['text_accented'], $GLOBALS['text_explicitAccent'], $suffix); $formHasAccent = strstr($form, "'") !== false; $suffixHasAccent = strstr($suffixExpl, "'") !== false; if ($formHasAccent && $suffixHasAccent) { $formImpl = str_replace($GLOBALS['text_explicitAccent'], $GLOBALS['text_accented'], $form);
<?php
// Regenerates the htmlDef column for every row of the words table whose def
// column contains a '<' or a '>' character, reporting progress every 50 rows.

require_once "../../phplib/util.php";

$selectSql = "select * from words where def like '%<%' or def like '%>%'";
$rows = logged_query($selectSql);
$processed = 0;

while ($record = mysql_fetch_assoc($rows)) {
  $html = text_htmlize($record['def']);

  $updateSql = sprintf(
    "update words set htmlDef = '%s' where counter = %s",
    addslashes($html),
    $record['counter']
  );
  logged_query($updateSql);

  $processed++;
  if ($processed % 50 === 0) {
    echo "Processed {$processed} definitions.\n";
  }
}

echo "Done! Processed {$processed} definitions.\n";
$lines = file($fileName); $linesSeen = 0; $skipped = 0; $existing = loadExistingMapByLexicon(); //$existing = array(); print "Importing " . count($lines) . " lines\n"; foreach ($lines as $count => $line) { $line = trim($line); $line = str_replace(array(chr(0x96), chr(0x84), chr(0x93), chr(228), chr(0xd) . ' ()', chr(146), chr(246), chr(160), chr(180), chr(239), chr(251), chr(252), chr(244), chr(234), chr(224), chr(145), chr(235), chr(199), chr(241), chr(154), chr(230), chr(201), chr(196), chr(171), chr(187), chr(211), chr(167), chr(151), chr(249), chr(0xd) . chr(0) . ' ()', chr(214), '\\', chr(168), '<', ' I.@ ', ' S. M.@ ', ' S. F.@ ', ' S. N.@ ', ' VB.@ ', ' LOC@ ', '; -~A@ '), array('-', '"', '"', ':a', '', "'", ':o', '', "'", ':i', '^u', ':u', '^o', '^e', '`a', ',', ':e', ',c', '~n', '', ',T', "'E", ':A', '\\201c', '\\201e', "'O", '\\00a7', '\\00b6', '`u', '', ':O', ',', '\\00a8', '< ', '@ I. ', '@ $s. m.$ ', '@ $s. f.$ ', '@ $s. n.$ ', '@ $vb.$ ', '@ $loc.$ ', ', -~A@ '), $line); verify_alpha($line, $count); $line = text_internalizeDefinition($line); $d = new Definition(); $d->userId = $radu->id; $d->sourceId = $mdnSrc->id; $d->internalRep = $line; $d->htmlRep = text_htmlize($line); $d->lexicon = text_extractLexicon($d); $d->status = ST_ACTIVE; if (array_key_exists($d->lexicon, $existing)) { $skipped++; } else { // Handle some special cases where the lexicon needs to be adjusted. if (preg_match("/^@[^@,]+ /", $d->internalRep)) { $pos = strpos($d->internalRep, '@', 1); $text = substr($d->internalRep, 1, $pos - 1); $parts = split(' ', $text); assert(count($parts) >= 2); if ($parts[count($parts) - 1] == 'II.') { $d->lexicon = text_internalizeWordName($parts[0]); $d->status = ST_PENDING; } else {