Пример #1
0
 /**
  * Migrates documents that are assigned to collections of the collection role
  * series into Opus_Series assignments and handles the conflicts that may occur.
  *
  * Only documents that are assigned to at least one collection of the
  * collection role series (short: series collection) are considered.
  *
  * Case 1 (documents without IdentifierSerial):
  * The volume number of an Opus_Series is mandatory, so documents without
  * IdentifierSerial cannot be migrated. They are left untouched and their
  * assignment(s) to series collection(s) are not changed.
  *
  * Case 2 (documents with more than one IdentifierSerial):
  * A document can have only one volume number per series, so documents with
  * more than one value for the field IdentifierSerial cannot be migrated. They
  * are left untouched and their assignment(s) to series collection(s) are not
  * changed.
  *
  * Case 3 (documents with exactly one IdentifierSerial):
  * Two documents of the same series must not share a volume number. If the
  * volume number of the document was already handed out for the series during
  * this run, the document is not assigned to the series and its assignment to
  * the series collection is kept unchanged.
  *
  * After a successful assignment to a series the link to the corresponding
  * series collection is removed, and the field IdentifierSerial is cleared.
  *
  * Assignments to the root collection of the series role are never migrated;
  * they are kept and counted as conflicts.
  *
  * Volume numbers are compared strictly: "1" and "01" are different numbers and
  * do not conflict with each other.
  *
  * @return array an array with the keys 'numOfConflicts' (number of conflicts
  * found) and 'numOfDocsMigrated' (number of successful series assignments; a
  * document that is migrated into several series is counted once per series)
  */
 private function migrateDocuments()
 {
     $numOfConflicts = 0;
     $numOfDocsMigrated = 0;
     // the id of the series role does not change inside the loops, so look it up once
     $seriesRoleId = $this->seriesRole->getId();
     $finder = new Opus_DocumentFinder();
     $finder->setCollectionRoleId($seriesRoleId);
     // maps a series collection id to the list of volume numbers assigned during this run
     $serialIdsInUse = array();
     foreach ($finder->ids() as $docId) {
         $doc = new Opus_Document($docId);
         $serialIds = $doc->getIdentifierSerial();
         $numOfSerialIds = count($serialIds);
         if ($numOfSerialIds == 0) {
             $this->logger->warn("doc #{$docId} : does not have a field IdentifierSerial -- leave it untouched");
             $numOfConflicts++;
             continue;
         }
         if ($numOfSerialIds > 1) {
             $this->logger->warn("doc #{$docId} : has {$numOfSerialIds} values for field IdentifierSerial -- leave it untouched");
             $numOfConflicts++;
             continue;
         }
         $serialId = $serialIds[0]->getValue();
         // collections the document stays assigned to after the migration
         $remainingCollections = array();
         foreach ($doc->getCollection() as $collection) {
             // collections of other roles are not affected by the migration
             if ($collection->getRoleId() != $seriesRoleId) {
                 $remainingCollections[] = $collection;
                 continue;
             }
             $collectionId = $collection->getId();
             if ($collection->isRoot()) {
                 // series root collection assignment will not be migrated
                 $this->logger->warn("doc #{$docId} : is assigned to root collection #{$collectionId} of collection role series: leave assignment untouched");
                 $remainingCollections[] = $collection;
                 $numOfConflicts++;
                 continue;
             }
             // strict comparison: a loose in_array() would treat numeric strings such as
             // "1" and "01" as equal and report a conflict that does not exist
             $isNumberInUse = isset($serialIdsInUse[$collectionId])
                 && in_array($serialId, $serialIdsInUse[$collectionId], true);
             if ($isNumberInUse) {
                 // conflict was found: serialId for series $collectionId already in use
                 $this->logger->warn("doc #{$docId} : could not assign to series #{$collectionId}: value {$serialId} already in use");
                 $this->logger->warn("doc #{$docId} : leave assignment to collection #{$collectionId} untouched");
                 $remainingCollections[] = $collection;
                 $numOfConflicts++;
                 continue;
             }
             // no conflict: the series is addressed by the id of the series collection
             $series = new Opus_Series($collectionId);
             $doc->addSeries($series)->setNumber($serialId);
             $doc->setIdentifierSerial(array());
             // mark usage of serialId for collection $collectionId
             $serialIdsInUse[$collectionId][] = $serialId;
             $this->logger->info("doc #{$docId} : assign document to series #{$collectionId} with value {$serialId}");
             $this->logger->info("doc #{$docId} : removed assignment from collection #{$collectionId}");
             $this->logger->info("doc #{$docId} : removed field IdentifierSerial with value " . $serialId);
             $numOfDocsMigrated++;
         }
         $doc->setCollection($remainingCollections);
         // detach the index plugin before storing the changed document
         $doc->unregisterPlugin('Opus_Document_Plugin_Index');
         $doc->store();
     }
     return array('numOfConflicts' => $numOfConflicts, 'numOfDocsMigrated' => $numOfDocsMigrated);
 }
Пример #2
0
 /**
  * Indexes the documents selected by the given id range in batches of ten.
  *
  * When the flag $this->_deleteAllDocs is set, all documents are removed from
  * the index before indexing starts. Progress is printed after every batch.
  *
  * @param mixed $startId lower bound handed to getDocumentIds()
  * @param mixed $endId upper bound handed to getDocumentIds()
  * @return float elapsed time in seconds, measured from just before the first
  * document is loaded until the last batch was handed to the indexer
  */
 private function index($startId, $endId)
 {
     $this->forceSyncMode();
     $docIds = $this->getDocumentIds($startId, $endId);
     $indexer = Opus_Search_Service::selectIndexingService('indexBuilder');
     // wipe the index only when that was requested; the condition used to be negated
     if ($this->_deleteAllDocs) {
         $indexer->removeAllDocumentsFromIndex();
     }
     echo date('Y-m-d H:i:s') . " Start indexing of " . count($docIds) . " documents.\n";
     $numOfDocs = 0;
     $runtime = microtime(true);
     // documents collected for the next batch
     $docs = array();
     foreach ($docIds as $docId) {
         // NOTE: only loading the document is timed here; the actual indexing
         // happens per batch in addDocumentsToIndex()
         $timeStart = microtime(true);
         $doc = new Opus_Document($docId);
         // dirty hack: disable implicit reindexing of documents in case of cache misses
         $doc->unregisterPlugin('Opus_Document_Plugin_Index');
         $docs[] = $doc;
         $timeDelta = microtime(true) - $timeStart;
         if ($timeDelta > 30) {
             echo date('Y-m-d H:i:s') . " WARNING: Indexing document {$docId} took {$timeDelta} seconds.\n";
         }
         $numOfDocs++;
         // flush a full batch of ten documents and report progress
         if ($numOfDocs % 10 == 0) {
             $this->addDocumentsToIndex($indexer, $docs);
             $docs = array();
             $this->outputProgress($runtime, $numOfDocs);
         }
     }
     // Index leftover documents
     if (count($docs) > 0) {
         $this->addDocumentsToIndex($indexer, $docs);
         $this->outputProgress($runtime, $numOfDocs);
     }
     $runtime = microtime(true) - $runtime;
     echo PHP_EOL . date('Y-m-d H:i:s') . ' Finished indexing.' . PHP_EOL;
     // new search API doesn't track number of indexed files, but issues are kept written to log file
     echo PHP_EOL . PHP_EOL . 'Details were written to opus-console.log';
     $this->resetMode();
     return $runtime;
 }
Пример #3
0
 /**
  * Command line entry point: indexes all published documents of the
  * configured id range one by one.
  *
  * Prints the help message and terminates the script when --help or -h is
  * among the command line arguments. Otherwise the arguments are evaluated,
  * every selected document is added to the index, statistics are printed
  * after every tenth document and the index is committed at the end.
  *
  * @return float elapsed time in seconds spent in the indexing loop
  */
 public function run()
 {
     global $argv, $argc;
     $helpRequested = in_array('--help', $argv) || in_array('-h', $argv);
     if ($helpRequested) {
         $this->printHelpMessage($argv);
         exit;
     }
     $this->evaluateArguments($argc, $argv);
     $this->forceSyncMode();
     $publishedIds = Opus_Document::getAllPublishedIds($this->start, $this->end);
     $indexer = new Opus_SolrSearch_Index_Indexer($this->deleteAllDocs);
     echo date('Y-m-d H:i:s') . " Start indexing of " . count($publishedIds) . " documents.\n";
     $processed = 0;
     $startedAt = microtime(true);
     foreach ($publishedIds as $docId) {
         $docStartedAt = microtime(true);
         $doc = new Opus_Document($docId);
         // dirty hack: disable implicit reindexing of documents in case of cache misses
         $doc->unregisterPlugin('Opus_Document_Plugin_Index');
         $indexer->addDocumentToEntryIndex($doc);
         $docSeconds = microtime(true) - $docStartedAt;
         if ($docSeconds > 30) {
             echo date('Y-m-d H:i:s') . " WARNING: Indexing document {$docId} took {$docSeconds} seconds.\n";
         }
         $processed++;
         // statistics are only printed after every tenth document
         if ($processed % 10 != 0) {
             continue;
         }
         $memoryNow = round(memory_get_usage() / 1024 / 1024);
         $memoryPeak = round(memory_get_peak_usage() / 1024 / 1024);
         $elapsed = microtime(true) - $startedAt;
         // avoid a meaningless rate while the elapsed time still rounds to zero
         if (round($elapsed) == 0) {
             $docsPerSecond = 'inf';
         } else {
             $docsPerSecond = round($processed / $elapsed, 2);
         }
         $secondsPerDoc = round($elapsed / $processed, 2);
         echo date('Y-m-d H:i:s') . " Stats after {$processed} documents -- memory {$memoryNow} MB, peak memory {$memoryPeak} (MB), {$docsPerSecond} docs/second, {$secondsPerDoc} seconds/doc\n";
     }
     $totalSeconds = microtime(true) - $startedAt;
     echo "\n" . date('Y-m-d H:i:s') . " Finished indexing.\n";
     $indexer->commit();
     $failedFiles = $indexer->getErrorFileCount();
     $totalFiles = $indexer->getTotalFileCount();
     echo "\n\nErrors appeared in " . $failedFiles . " of " . $totalFiles . " files." . " Details were written to opus-console.log";
     $this->resetMode();
     return $totalSeconds;
 }
Пример #4
0
 /**
  * Adds the given document to the index explicitly and commits the change.
  *
  * @param Opus_Document $doc document to be reindexed
  * @return bool true if the document was added and the commit went through
  * without an Opus_SolrSearch_Exception; false if such an exception was
  * caught (it is logged as an error)
  */
 private function forceReindexing($doc)
 {
     try {
         // detach the index plugin first to prevent the document from being indexed twice
         $doc->unregisterPlugin('Opus_Document_Plugin_Index');
         $this->indexer->addDocumentToEntryIndex($doc);
         $this->indexer->commit();
         return true;
     } catch (Opus_SolrSearch_Exception $solrError) {
         $message = 'Could not force reindexing of document ' . $doc->getId() . ' : ' . $solrError->getMessage();
         $this->logger->err($message);
     }
     return false;
 }