Example #1
0
<?php

// Now that a Concept maps several Words to several Definitions, we
// can no longer use the first Word to compute the Lexicon field. We have to
// extract it from the definition.
require_once "../../phplib/util.php";
ini_set('max_execution_time', '3600');

// Maintenance script: walk every row of the Definition table, recompute its
// lexicon with text_extractLexicon() and store the result in the Lexicon column.
$dbResult = mysql_query("select * from Definition");
if ($dbResult === false) {
    // mysql_query() returns false on failure; stop here instead of letting
    // mysql_num_rows() / mysql_fetch_assoc() emit warnings on a non-resource.
    die("Cannot read the Definition table: " . mysql_error() . "\n");
}
$numRows = mysql_num_rows($dbResult);
$i = 0;
while ($dbRow = mysql_fetch_assoc($dbResult)) {
    $def = new Definition();
    $def->populateFromDbRow($dbRow);
    $def->lexicon = text_extractLexicon($def);
    // write a custom query so we don't update the ModDate field (also for speed)
    // Escape with mysql_real_escape_string() rather than addslashes(): it takes
    // the connection's character set into account.
    $query = sprintf(
        "update Definition set Lexicon = '%s' where Id = '%s'",
        mysql_real_escape_string($def->lexicon),
        mysql_real_escape_string($def->id)
    );
    if (mysql_query($query) === false) {
        // Report the failure but keep going, so one bad row does not abort the run.
        print "Failed to update definition {$def->id}: " . mysql_error() . "\n";
    }
    $i++;
    if ($i % 1000 == 0) {
        print "{$i}/{$numRows} definitions processed.\n";
    }
}
mysql_free_result($dbResult);
Example #2
0
// Counters initialised before the import loop.
$skipped = 0;
$linesSeen = 0;

// Lexicon-keyed map; the loop checks each new definition's lexicon against it.
$existing = loadExistingMapByLexicon();
// NOTE(review): presumably a debugging toggle that disables the check above -- confirm before removing.
//$existing = array();

echo "Importing ", count($lines), " lines\n";
foreach ($lines as $count => $line) {
    $line = trim($line);
    $line = str_replace(array(chr(0x96), chr(0x84), chr(0x93), chr(228), chr(0xd) . ' ()', chr(146), chr(246), chr(160), chr(180), chr(239), chr(251), chr(252), chr(244), chr(234), chr(224), chr(145), chr(235), chr(199), chr(241), chr(154), chr(230), chr(201), chr(196), chr(171), chr(187), chr(211), chr(167), chr(151), chr(249), chr(0xd) . chr(0) . ' ()', chr(214), '\\', chr(168), '&lt;', ' I.@ ', ' S. M.@ ', ' S. F.@ ', ' S. N.@ ', ' VB.@ ', ' LOC@ ', '; -~A@ '), array('-', '"', '"', ':a', '', "'", ':o', '', "'", ':i', '^u', ':u', '^o', '^e', '`a', ',', ':e', ',c', '~n', '', ',T', "'E", ':A', '\\201c', '\\201e', "'O", '\\00a7', '\\00b6', '`u', '', ':O', ',', '\\00a8', '< ', '@ I. ', '@ $s. m.$ ', '@ $s. f.$ ', '@ $s. n.$ ', '@ $vb.$ ', '@ $loc.$ ', ', -~A@ '), $line);
    verify_alpha($line, $count);
    $line = text_internalizeDefinition($line);
    $d = new Definition();
    $d->userId = $radu->id;
    $d->sourceId = $mdnSrc->id;
    $d->internalRep = $line;
    $d->htmlRep = text_htmlize($line);
    $d->lexicon = text_extractLexicon($d);
    $d->status = ST_ACTIVE;
    if (array_key_exists($d->lexicon, $existing)) {
        $skipped++;
    } else {
        // Handle some special cases where the lexicon needs to be adjusted.
        if (preg_match("/^@[^@,]+ /", $d->internalRep)) {
            $pos = strpos($d->internalRep, '@', 1);
            $text = substr($d->internalRep, 1, $pos - 1);
            $parts = split(' ', $text);
            assert(count($parts) >= 2);
            if ($parts[count($parts) - 1] == 'II.') {
                $d->lexicon = text_internalizeWordName($parts[0]);
                $d->status = ST_PENDING;
            } else {
                if (count($parts) == 2 && text_endsWith($parts[1], '/')) {