/**
 * Migrates documents that are assigned to collections of the collection
 * role "series" (short: series collections) to Opus_Series assignments.
 *
 * Only documents that are assigned to at least one series collection are
 * considered. Several conflict cases have to be handled:
 *
 * Case 1 (document without IdentifierSerial):
 * The volume number of an Opus_Series assignment is mandatory, so documents
 * without an IdentifierSerial cannot be migrated. They are left untouched;
 * their assignment(s) to series collection(s) are not changed.
 *
 * Case 2 (document with more than one IdentifierSerial):
 * A document can carry only one volume number per series, so documents with
 * more than one IdentifierSerial value cannot be migrated. They are left
 * untouched; their assignment(s) to series collection(s) are not changed.
 *
 * Case 3 (document with exactly one IdentifierSerial):
 * Two documents of the same series must not share a volume number. If the
 * volume number has already been handed out for the target series during
 * this run, the assignment is skipped and the document stays assigned to
 * the series collection. Assignments to the root collection of the series
 * role are never migrated and are kept as well.
 *
 * On a successful assignment the link to the corresponding series
 * collection is dropped and the field IdentifierSerial is cleared.
 *
 * Counting: 'numOfConflicts' is incremented once per document for cases 1
 * and 2, and once per affected collection for volume-number clashes and
 * root-collection assignments. 'numOfDocsMigrated' is incremented once per
 * successful series assignment (a document that sits in two series
 * collections can therefore be counted twice).
 *
 * @return array an array with the keys 'numOfConflicts' and
 *               'numOfDocsMigrated'
 */
private function migrateDocuments()
{
    $numOfConflicts = 0;
    $numOfDocsMigrated = 0;

    // the id of the series role does not change while we iterate
    $seriesRoleId = $this->seriesRole->getId();

    $finder = new Opus_DocumentFinder();
    $finder->setCollectionRoleId($seriesRoleId);

    // volume numbers handed out so far in this run: collection id => list of numbers
    $serialIdsInUse = array();

    foreach ($finder->ids() as $docId) {
        $doc = new Opus_Document($docId);
        $serialIds = $doc->getIdentifierSerial();
        $numOfSerialIds = count($serialIds);

        // case 1: no volume number available
        if ($numOfSerialIds == 0) {
            $this->logger->warn("doc #{$docId} : does not have a field IdentifierSerial -- leave it untouched");
            $numOfConflicts++;
            continue;
        }

        // case 2: volume number is ambiguous
        if ($numOfSerialIds > 1) {
            $this->logger->warn("doc #{$docId} : has {$numOfSerialIds} values for field IdentifierSerial -- leave it untouched");
            $numOfConflicts++;
            continue;
        }

        // case 3: exactly one volume number
        $serialId = $serialIds[0]->getValue();
        $remainingCollections = array();

        foreach ($doc->getCollection() as $collection) {
            // collections outside of the series role are kept as they are
            // (loose comparison on purpose: ids may be int or numeric string)
            if ($collection->getRoleId() != $seriesRoleId) {
                $remainingCollections[] = $collection;
                continue;
            }

            $collectionId = $collection->getId();

            // series root collection assignment will not be migrated
            if ($collection->isRoot()) {
                $this->logger->warn("doc #{$docId} : is assigned to root collection #{$collectionId} of collection role series: leave assignment untouched");
                $remainingCollections[] = $collection;
                $numOfConflicts++;
                continue;
            }

            // Strict comparison: volume numbers are free text, so "1" and
            // "01" (or "10" and "1e1") are different numbers. A loose
            // in_array() treats such numeric strings as equal and reports
            // a conflict that does not exist.
            $alreadyInUse = isset($serialIdsInUse[$collectionId])
                && in_array($serialId, $serialIdsInUse[$collectionId], true);

            if ($alreadyInUse) {
                // conflict was found: serialId for series $collectionId already in use
                $this->logger->warn("doc #{$docId} : could not assign to series #{$collectionId}: value {$serialId} already in use");
                $this->logger->warn("doc #{$docId} : leave assignment to collection #{$collectionId} untouched");
                $remainingCollections[] = $collection;
                $numOfConflicts++;
                continue;
            }

            // no conflict: the series is addressed by the id of the collection
            $series = new Opus_Series($collectionId);
            $doc->addSeries($series)->setNumber($serialId);
            $doc->setIdentifierSerial(array());

            // mark usage of serialId for collection $collectionId
            $serialIdsInUse[$collectionId][] = $serialId;

            $this->logger->info("doc #{$docId} : assign document to series #{$collectionId} with value {$serialId}");
            $this->logger->info("doc #{$docId} : removed assignment from collection #{$collectionId}");
            $this->logger->info("doc #{$docId} : removed field IdentifierSerial with value " . $serialId);
            $numOfDocsMigrated++;
        }

        // migrated series collections are dropped by only keeping the rest
        $doc->setCollection($remainingCollections);
        // NOTE(review): presumably avoids reindexing every document on store() -- confirm
        $doc->unregisterPlugin('Opus_Document_Plugin_Index');
        $doc->store();
    }

    return array('numOfConflicts' => $numOfConflicts, 'numOfDocsMigrated' => $numOfDocsMigrated);
}
/**
 * Indexes the documents of the given ID range and prints progress to stdout.
 *
 * forceSyncMode() is called before and resetMode() after the run. Documents
 * are loaded one by one, collected into batches of ten and handed to
 * addDocumentsToIndex(); documents left over after the last full batch are
 * sent as a final, smaller batch.
 *
 * @param mixed $startId lower bound, passed through to getDocumentIds()
 * @param mixed $endId   upper bound, passed through to getDocumentIds()
 * @return float seconds elapsed between the start of the loop and the end of
 *               the last batch
 */
private function index($startId, $endId)
{
    $this->forceSyncMode();

    $docIds = $this->getDocumentIds($startId, $endId);

    $indexer = Opus_Search_Service::selectIndexingService('indexBuilder');

    // Only wipe the index when deletion was requested. The condition used
    // to be negated, which cleared the index exactly when the flag was NOT
    // set and kept stale entries when it was.
    if ($this->_deleteAllDocs) {
        $indexer->removeAllDocumentsFromIndex();
    }

    echo date('Y-m-d H:i:s') . " Start indexing of " . count($docIds) . " documents.\n";

    $batchSize = 10;
    $numOfDocs = 0;
    $runtime = microtime(true);
    $docs = array();

    foreach ($docIds as $docId) {
        // measure time for each document (covers loading the document only;
        // the actual indexing happens per batch below)
        $timeStart = microtime(true);

        $doc = new Opus_Document($docId);

        // dirty hack: disable implicit reindexing of documents in case of cache misses
        $doc->unregisterPlugin('Opus_Document_Plugin_Index');

        $docs[] = $doc;

        $timeDelta = microtime(true) - $timeStart;
        if ($timeDelta > 30) {
            echo date('Y-m-d H:i:s') . " WARNING: Indexing document {$docId} took {$timeDelta} seconds.\n";
        }

        $numOfDocs++;

        // flush a full batch and start collecting the next one
        if ($numOfDocs % $batchSize == 0) {
            $this->addDocumentsToIndex($indexer, $docs);
            $docs = array();
            $this->outputProgress($runtime, $numOfDocs);
        }
    }

    // Index leftover documents
    if (count($docs) > 0) {
        $this->addDocumentsToIndex($indexer, $docs);
        $this->outputProgress($runtime, $numOfDocs);
    }

    $runtime = microtime(true) - $runtime;
    echo PHP_EOL . date('Y-m-d H:i:s') . ' Finished indexing.' . PHP_EOL;

    // new search API doesn't track number of indexed files, but issues are kept written to log file
    echo PHP_EOL . PHP_EOL . 'Details were written to opus-console.log';

    $this->resetMode();

    return $runtime;
}
/**
 * Runs the console indexer.
 *
 * Prints the help message and exits when --help or -h is among the command
 * line arguments. Otherwise the arguments are evaluated, all published
 * document ids between $this->start and $this->end are fetched, and every
 * document is added to the index one at a time. A statistics line is printed
 * after every tenth document, and a warning for each document that took more
 * than 30 seconds. The indexer is committed once after the loop.
 *
 * @return float seconds spent in the indexing loop (the final commit is not
 *               included)
 */
public function run()
{
    global $argv, $argc;

    $helpRequested = in_array('--help', $argv) || in_array('-h', $argv);
    if ($helpRequested) {
        $this->printHelpMessage($argv);
        exit;
    }

    $this->evaluateArguments($argc, $argv);
    $this->forceSyncMode();

    $publishedIds = Opus_Document::getAllPublishedIds($this->start, $this->end);
    $indexer = new Opus_SolrSearch_Index_Indexer($this->deleteAllDocs);

    echo date('Y-m-d H:i:s') . " Start indexing of " . count($publishedIds) . " documents.\n";

    $numOfDocs = 0;
    $loopStartedAt = microtime(true);

    foreach ($publishedIds as $docId) {
        $docStartedAt = microtime(true);

        $doc = new Opus_Document($docId);
        // dirty hack: keep the document from reindexing itself implicitly on cache misses
        $doc->unregisterPlugin('Opus_Document_Plugin_Index');
        $indexer->addDocumentToEntryIndex($doc);

        $time_delta = microtime(true) - $docStartedAt;
        if ($time_delta > 30) {
            echo date('Y-m-d H:i:s') . " WARNING: Indexing document {$docId} took {$time_delta} seconds.\n";
        }

        $numOfDocs++;
        if ($numOfDocs % 10 != 0) {
            // statistics are only printed after every tenth document
            continue;
        }

        $mem_now = round(memory_get_usage() / 1024 / 1024);
        $mem_peak = round(memory_get_peak_usage() / 1024 / 1024);
        $elapsed = microtime(true) - $loopStartedAt;

        // report 'inf' while the elapsed time still rounds to zero seconds
        if (round($elapsed) == 0) {
            $doc_per_second = 'inf';
        } else {
            $doc_per_second = round($numOfDocs / $elapsed, 2);
        }
        $seconds_per_doc = round($elapsed / $numOfDocs, 2);

        echo date('Y-m-d H:i:s') . " Stats after {$numOfDocs} documents -- memory {$mem_now} MB, peak memory {$mem_peak} (MB), {$doc_per_second} docs/second, {$seconds_per_doc} seconds/doc\n";
    }

    $totalRuntime = microtime(true) - $loopStartedAt;
    echo "\n" . date('Y-m-d H:i:s') . " Finished indexing.\n";

    $indexer->commit();

    $errorCount = $indexer->getErrorFileCount();
    $totalCount = $indexer->getTotalFileCount();
    echo "\n\nErrors appeared in " . $errorCount . " of " . $totalCount . " files." . " Details were written to opus-console.log";

    $this->resetMode();

    return $totalRuntime;
}
/**
 * Pushes the given document to the Solr index right away and commits.
 *
 * @param Opus_Document $doc document to reindex
 * @return bool true when adding and committing went through; false when an
 *              Opus_SolrSearch_Exception was caught (the failure is logged)
 */
private function forceReindexing($doc)
{
    try {
        // take the index plugin off the document so it is not indexed a second time
        $doc->unregisterPlugin('Opus_Document_Plugin_Index');

        $this->indexer->addDocumentToEntryIndex($doc);
        $this->indexer->commit();

        return true;
    } catch (Opus_SolrSearch_Exception $failure) {
        $this->logger->err(
            'Could not force reindexing of document ' . $doc->getId() . ' : ' . $failure->getMessage()
        );
    }

    return false;
}