Example #1
0
// Walk every row of $dbResult, run cleanupReferences() over the definition's
// internal representation and, when the text actually changed, regenerate the
// HTML representation and persist the definition.
print "Examining " . mysql_num_rows($dbResult) . " definitions.\n";
$count = 0;
// $dropped and $kept are declared `global` inside cleanupReferences();
// presumably that function increments them -- only reported here.
$dropped = 0;
$kept = 0;
while (($dbRow = mysql_fetch_assoc($dbResult)) != null) {
    $d = new Definition();
    $d->populateFromDbRow($dbRow);
    $count++;
    // Progress report every 10000 rows.
    if ($count % 10000 == 0) {
        print "{$count} definitions examined, {$dropped} dropped, {$kept} kept\n";
    }
    $newRep = cleanupReferences($d->internalRep);
    // Compare as strings: a loose `!=` compares numeric-looking strings by
    // value (e.g. "010" == "10"), which could hide a real textual change.
    if (strcmp($newRep, $d->internalRep) !== 0) {
        //print "{$d->internalRep}\n{$newRep}\n";
        $d->internalRep = $newRep;
        $d->htmlRep = text_htmlize($d->internalRep);
        $d->save();
    }
}
print "{$count} definitions examined, {$dropped} dropped, {$kept} kept\n";
function cleanupReferences($s)
{
    global $dropped;
    global $kept;
    $result = '';
    $text = '';
    $ref = '';
    $mode = 0;
    // 0 = not between bars; 1 = text; 2 = reference
    for ($i = 0; $i < strlen($s); $i++) {
        $char = $s[$i];
Example #2
0
                }
            }
            foreach ($defs as $fixDef) {
                $fixFirstAt = strpos($fixDef->internalRep, '@');
                $fixSecondAt = strpos($fixDef->internalRep, '@', $fixFirstAt + 1);
                assert($fixFirstAt === 0);
                assert($fixSecondAt !== false);
                $fixText = trim(substr($fixDef->internalRep, $fixFirstAt + 1, $fixSecondAt - $fixFirstAt - 1));
                if (!text_contains($fixText, '-') && !text_contains($fixText, ' ') && str_replace('-', '', $normText) == text_unicodeToLower(text_removeAccents($fixText))) {
                    $prevPos = 0;
                    while (($pos = mb_strpos($normText, '-', $prevPos)) !== false) {
                        $fixText = text_insert($fixText, '-', $pos);
                        $prevPos = $pos + 1;
                    }
                    $fixDef->internalRep = substr($fixDef->internalRep, 0, $fixFirstAt + 1) . $fixText . substr($fixDef->internalRep, $fixSecondAt);
                    $fixDef->htmlRep = text_htmlize($fixDef->internalRep);
                    $fixDef->save();
                    print "    [{$fixDef->internalRep}]\n";
                }
            }
            $split++;
            $l->delete();
            $foundHyphenation = true;
        }
    }
    //print "NOT OK: {$l->unaccented}\n";
}
print "Seen {$seen} lexems, split {$split} lexems.\n";
function getNouns($word)
{
    $lexems = Lexem::searchWordlists($word, true);
Example #3
0
<?php

// Strips the trailing "Notă: Definiția este preluată din ..." attribution from
// every matching definition, regenerates its HTML and reassigns it to source 25.
//
// NOTE is a MySQL RLIKE pattern anchored at the end of InternalRep; the note may
// optionally be wrapped in parentheses and followed by a period.
define('NOTE', ' ?[(]?Notă: Definiția este preluată din Dicționar enciclopedic vol\\. .*, Editura Enciclopedică, ....[)]\\.?$');
$query = "select * from Definition where InternalRep rlike '" . NOTE . "'";
$dbResult = mysql_query($query);
$count = 0;
while ($dbRow = mysql_fetch_assoc($dbResult)) {
    $def = Definition::createFromDbRow($dbRow);
    $pos = strpos($def->internalRep, 'Notă:');
    if ($pos === false) {
        // The database matched the row (its collation may be more lenient than
        // PHP's byte-wise search) but the marker is not found here. Truncating
        // at `false` would blank the whole definition, so leave it untouched.
        print "Skipping definition {$def->id}: note marker not found.\n";
        continue;
    }
    // Keep only the text that precedes the note.
    $rep = substr($def->internalRep, 0, $pos);
    // Drop the opening parenthesis when the note was parenthesized.
    if (text_endsWith($rep, '(')) {
        $rep = substr($rep, 0, -1);
    }
    $rep = trim($rep);
    $def->internalRep = $rep;
    // Fixed: this used to assign to the undefined variable $definition, so the
    // stored HTML was never refreshed to match the new internal representation.
    $def->htmlRep = text_htmlize($def->internalRep);
    $def->sourceId = 25;
    $def->save();
    $count++;
}
print "{$count} definitions modified.\n";
Example #4
0
<?php

// The HTML generated for spaced text has changed, so affected definitions must
// have their HtmlRep regenerated from InternalRep.
// NOTE(review): the original comment spoke of "%" signs, but the LIKE pattern
// below selects rows containing a "|" character -- confirm which was intended.
$converted = 0;
$rows = mysql_query('select * from Definition where InternalRep like "%|%"');
for ($row = mysql_fetch_assoc($rows); $row; $row = mysql_fetch_assoc($rows)) {
    $definition = Definition::createFromDbRow($row);
    $freshHtml = text_htmlize($definition->internalRep);
    // Nothing to do when the stored HTML is already up to date.
    if ($freshHtml === $definition->htmlRep) {
        continue;
    }
    $definition->htmlRep = $freshHtml;
    $definition->save();
    $converted++;
}
print $converted . " definitions converted.\n";
Example #5
0
<?php

// We want the internalRep field to contain ONLY Unicode.
// TODO: Fix all tables, not just definitions.
require_once "../../phplib/util.php";
ini_set('max_execution_time', '3600');
$GLOBALS['htmlEntities'] = array();
$dbResult = mysql_query("select * from Definition");
$numRows = mysql_num_rows($dbResult);
$i = 0;
$changed = 0;
while ($dbRow = mysql_fetch_assoc($dbResult)) {
    $def = new Definition();
    $def->populateFromDbRow($dbRow);
    $newInternalRep = myConvert($def->internalRep);
    $newHtmlRep = text_htmlize($newInternalRep);
    $internalRepChanged = $newInternalRep != $def->internalRep;
    $htmlRepChanged = $newHtmlRep != $def->htmlRep;
    if ($internalRepChanged || $htmlRepChanged) {
        // write a custom query so we don't update the ModDate field
        // (also for speed)
        $query = sprintf("update Definition set InternalRep = '%s', " . "HtmlRep = '%s' where Id = '%d'", addslashes($newInternalRep), addslashes($newHtmlRep), $def->id);
        mysql_query($query);
        $changed++;
        print "Changed definition " . $def->id;
        if ($htmlRepChanged) {
            print " (HtmlRep has changed)";
        }
        print "\n";
    }
    $i++;
Example #6
0
<?php

// After migrating the entire schema to UTF8, some definitions were
// converted badly.
require_once "../../phplib/util.php";
// Re-applies definition texts from data.txt. Each line is expected to look like
//   <definition id>|||||<internal representation>
// The text is cleaned of mis-converted dash sequences and backslashes, then the
// definition's internalRep, htmlRep and lexicon are rewritten and saved.
$data = file('data.txt');
if ($data === false) {
    exit("Cannot read data.txt\n");
}
$i = 0;
$numLines = count($data);
foreach ($data as $line) {
    $i++;
    // explode() on the literal separator replaces the deprecated POSIX-regex
    // split() (removed in PHP 7); the result is the same for this fixed string.
    $components = explode('|||||', $line);
    if (count($components) < 2) {
        // Without the separator there is no replacement text; saving would
        // overwrite the definition with an empty string.
        print "Skipping malformed line {$i}/{$numLines}\n";
        continue;
    }
    $id = $components[0];
    $internalRep = $components[1];
    // Byte 0x96 and the sequence "â" 0x80 0x93 are both replaced by a plain hyphen.
    $internalRep = str_replace(chr(0x96), '-', $internalRep);
    $internalRep = str_replace('â' . chr(0x80) . chr(0x93), '-', $internalRep);
    // Strip all backslashes from the text.
    $internalRep = str_replace('\\', '', $internalRep);
    $internalRep = trim($internalRep);
    print "Patching definition {$i}/{$numLines} ({$id})\n";
    $def = Definition::load($id);
    if (!$def) {
        // NOTE(review): assumes Definition::load() yields a falsy value for an
        // unknown id -- confirm against its implementation.
        print "Definition {$id} not found, skipping\n";
        continue;
    }
    $def->internalRep = $internalRep;
    $def->htmlRep = text_htmlize($internalRep);
    $def->lexicon = text_extractLexicon($def);
    $def->save();
}
Example #7
0
                } else {
                    $newRep .= $chunk;
                }
            } else {
                $newRep .= $chunk;
            }
            $i = $j - 1;
        } else {
            $newRep .= $c;
        }
        $prevC = $c;
    }
    if ($newRep != $rep) {
        //print "Rep: {$rep}\nNew rep: {$newRep}\n";
        $def->internalRep = $newRep;
        $def->htmlRep = text_htmlize($newRep);
        $def->save();
    }
}
/********************************************************/
/**
 * 
 */
function matchesWithAccent($form, $suffix)
{
    $suffix = text_unicodeToLower($suffix);
    $suffixExpl = str_replace($GLOBALS['text_accented'], $GLOBALS['text_explicitAccent'], $suffix);
    $formHasAccent = strstr($form, "'") !== false;
    $suffixHasAccent = strstr($suffixExpl, "'") !== false;
    if ($formHasAccent && $suffixHasAccent) {
        $formImpl = str_replace($GLOBALS['text_explicitAccent'], $GLOBALS['text_accented'], $form);
Example #8
0
<?php

require_once "../../phplib/util.php";
// Regenerate the htmlDef column for every row of `words` whose def contains
// an angle bracket, reporting progress every 50 rows.
$rows = logged_query("select * from words where def like '%<%' or def like '%>%'");
$processed = 0;
for (;;) {
    $record = mysql_fetch_assoc($rows);
    if (!$record) {
        break;
    }
    $html = text_htmlize($record['def']);
    $updateSql = sprintf(
        "update words set htmlDef = '%s' where counter = %s",
        addslashes($html),
        $record['counter']
    );
    logged_query($updateSql);
    $processed++;
    if (0 == $processed % 50) {
        echo "Processed " . $processed . " definitions.\n";
    }
}
echo "Done! Processed " . $processed . " definitions.\n";
Example #9
0
$lines = file($fileName);
$linesSeen = 0;
$skipped = 0;
$existing = loadExistingMapByLexicon();
//$existing = array();
print "Importing " . count($lines) . " lines\n";
foreach ($lines as $count => $line) {
    $line = trim($line);
    $line = str_replace(array(chr(0x96), chr(0x84), chr(0x93), chr(228), chr(0xd) . ' ()', chr(146), chr(246), chr(160), chr(180), chr(239), chr(251), chr(252), chr(244), chr(234), chr(224), chr(145), chr(235), chr(199), chr(241), chr(154), chr(230), chr(201), chr(196), chr(171), chr(187), chr(211), chr(167), chr(151), chr(249), chr(0xd) . chr(0) . ' ()', chr(214), '\\', chr(168), '&lt;', ' I.@ ', ' S. M.@ ', ' S. F.@ ', ' S. N.@ ', ' VB.@ ', ' LOC@ ', '; -~A@ '), array('-', '"', '"', ':a', '', "'", ':o', '', "'", ':i', '^u', ':u', '^o', '^e', '`a', ',', ':e', ',c', '~n', '', ',T', "'E", ':A', '\\201c', '\\201e', "'O", '\\00a7', '\\00b6', '`u', '', ':O', ',', '\\00a8', '< ', '@ I. ', '@ $s. m.$ ', '@ $s. f.$ ', '@ $s. n.$ ', '@ $vb.$ ', '@ $loc.$ ', ', -~A@ '), $line);
    verify_alpha($line, $count);
    $line = text_internalizeDefinition($line);
    $d = new Definition();
    $d->userId = $radu->id;
    $d->sourceId = $mdnSrc->id;
    $d->internalRep = $line;
    $d->htmlRep = text_htmlize($line);
    $d->lexicon = text_extractLexicon($d);
    $d->status = ST_ACTIVE;
    if (array_key_exists($d->lexicon, $existing)) {
        $skipped++;
    } else {
        // Handle some special cases where the lexicon needs to be adjusted.
        if (preg_match("/^@[^@,]+ /", $d->internalRep)) {
            $pos = strpos($d->internalRep, '@', 1);
            $text = substr($d->internalRep, 1, $pos - 1);
            $parts = split(' ', $text);
            assert(count($parts) >= 2);
            if ($parts[count($parts) - 1] == 'II.') {
                $d->lexicon = text_internalizeWordName($parts[0]);
                $d->status = ST_PENDING;
            } else {