<?php
// Now that a Concept maps several Words to several Definitions, we
// can no longer use the first Word to compute the Lexicon field. We have to
// extract it from the definition.
//
// Maintenance script: walks every row of the Definition table, recomputes the
// lexicon with text_extractLexicon() and writes it back to the Lexicon column.
// Progress is printed every 1000 rows.

require_once "../../phplib/util.php";

// The full-table pass can take a long time; allow up to an hour.
ini_set('max_execution_time', '3600');

$dbResult = mysql_query("select * from Definition");
if ($dbResult === false) {
  // Without a result set there is nothing to iterate over; stop with the
  // server's error message instead of looping over a boolean.
  die("Cannot load definitions: " . mysql_error() . "\n");
}

$numRows = mysql_num_rows($dbResult);
$i = 0;

while ($dbRow = mysql_fetch_assoc($dbResult)) {
  $def = new Definition();
  $def->populateFromDbRow($dbRow);
  $def->lexicon = text_extractLexicon($def);

  // write a custom query so we don't update the ModDate field (also for speed)
  // Values are escaped with the connection-aware escaper rather than
  // addslashes(), which does not take the connection charset into account.
  $query = sprintf("update Definition set Lexicon = '%s' where Id = '%s'",
                   mysql_real_escape_string($def->lexicon),
                   mysql_real_escape_string($def->id));
  if (mysql_query($query) === false) {
    // Report the failure but keep going: one bad row should not abort the
    // whole pass.
    print "Failed to update definition {$def->id}: " . mysql_error() . "\n";
  }

  $i++;
  if ($i % 1000 == 0) {
    print "{$i}/{$numRows} definitions processed.\n";
  }
}

mysql_free_result($dbResult);
$linesSeen = 0; $skipped = 0; $existing = loadExistingMapByLexicon(); //$existing = array(); print "Importing " . count($lines) . " lines\n"; foreach ($lines as $count => $line) { $line = trim($line); $line = str_replace(array(chr(0x96), chr(0x84), chr(0x93), chr(228), chr(0xd) . ' ()', chr(146), chr(246), chr(160), chr(180), chr(239), chr(251), chr(252), chr(244), chr(234), chr(224), chr(145), chr(235), chr(199), chr(241), chr(154), chr(230), chr(201), chr(196), chr(171), chr(187), chr(211), chr(167), chr(151), chr(249), chr(0xd) . chr(0) . ' ()', chr(214), '\\', chr(168), '<', ' I.@ ', ' S. M.@ ', ' S. F.@ ', ' S. N.@ ', ' VB.@ ', ' LOC@ ', '; -~A@ '), array('-', '"', '"', ':a', '', "'", ':o', '', "'", ':i', '^u', ':u', '^o', '^e', '`a', ',', ':e', ',c', '~n', '', ',T', "'E", ':A', '\\201c', '\\201e', "'O", '\\00a7', '\\00b6', '`u', '', ':O', ',', '\\00a8', '< ', '@ I. ', '@ $s. m.$ ', '@ $s. f.$ ', '@ $s. n.$ ', '@ $vb.$ ', '@ $loc.$ ', ', -~A@ '), $line); verify_alpha($line, $count); $line = text_internalizeDefinition($line); $d = new Definition(); $d->userId = $radu->id; $d->sourceId = $mdnSrc->id; $d->internalRep = $line; $d->htmlRep = text_htmlize($line); $d->lexicon = text_extractLexicon($d); $d->status = ST_ACTIVE; if (array_key_exists($d->lexicon, $existing)) { $skipped++; } else { // Handle some special cases where the lexicon needs to be adjusted. if (preg_match("/^@[^@,]+ /", $d->internalRep)) { $pos = strpos($d->internalRep, '@', 1); $text = substr($d->internalRep, 1, $pos - 1); $parts = split(' ', $text); assert(count($parts) >= 2); if ($parts[count($parts) - 1] == 'II.') { $d->lexicon = text_internalizeWordName($parts[0]); $d->status = ST_PENDING; } else { if (count($parts) == 2 && text_endsWith($parts[1], '/')) {