/**
 * Walk through every dedup record and verify that its links agree with the
 * actual records, logging each message returned by the dedup handler.
 *
 * Progress (including processing speed) is logged after every 1000 records
 * and a summary is logged when the whole collection has been processed.
 *
 * @return void
 */
public function checkDedupRecords()
{
    $tag = 'checkDedupRecords';
    $this->log->log($tag, "Checking dedup record consistency");

    $cursor = $this->db->dedup->find();
    $cursor->immortal(true)->timeout($this->cursorTimeout);

    $checked = 0;
    $repaired = 0;
    $speedometer = new PerformanceCounter();

    foreach ($cursor as $dedupRecord) {
        // The handler result is treated as a list of messages, one per fix
        $messages = $this->dedupHandler->checkDedupRecord($dedupRecord);
        if ($messages) {
            $repaired += count($messages);
            foreach ($messages as $message) {
                $this->log->log($tag, $message);
            }
        }

        ++$checked;
        if ($checked % 1000 != 0) {
            continue;
        }

        // Periodic progress report
        $speedometer->add($checked);
        $rate = $speedometer->getSpeed();
        $this->log->log(
            $tag,
            "{$checked} records checked with {$repaired} links fixed, "
            . "{$rate} records/sec"
        );
    }

    $this->log->log(
        $tag,
        "Completed with {$checked} records checked with {$repaired} links fixed"
    );
}
/**
 * Update Solr index (merged records and individual records)
 *
 * Merged records are handled by processMerged(), optionally in a forked
 * child process when threaded merged record update is enabled. The child
 * reports back with its exit code: 0 = nothing to commit, 1 = commit needed,
 * anything else = failure. Individual records (those without a dedup_id) are
 * processed in this process.
 *
 * @param string|null $fromDate   Starting date for updates. If null, the last
 *                                update date stored in the database is used
 *                                (all records are processed if none is
 *                                stored). If an empty string, all records are
 *                                processed. Otherwise the value is parsed
 *                                with strtotime().
 * @param string      $sourceId   Comma-separated list of source IDs to update,
 *                                or empty or * for all sources
 * @param string      $singleId   Export only a record with the given ID
 * @param bool        $noCommit   If true, changes are not explicitly committed
 * @param bool        $delete     If true, records in the given $sourceId are
 *                                all deleted
 * @param string|bool $compare    If set, just compare the records with the
 *                                ones already in the Solr index and write any
 *                                differences in a file given in this parameter
 * @param string      $dumpPrefix If specified, the Solr records are dumped
 *                                into files and not sent to Solr.
 *
 * @return void
 */
public function updateRecords($fromDate = null, $sourceId = '', $singleId = '', $noCommit = false, $delete = false, $compare = false, $dumpPrefix = '')
{
    if ($compare && $compare != '-') {
        file_put_contents($compare, '');
    }
    $this->dumpPrefix = $dumpPrefix;
    $verb = $compare
        ? 'compared'
        : ($this->dumpPrefix ? 'dumped' : 'indexed');
    $initVerb = $compare
        ? 'Comparing'
        : ($this->dumpPrefix ? 'Dumping' : 'Indexing');

    // Poll until the given child has exited. Returns the wait status, or
    // null if the child cannot be waited for (e.g. it has already been
    // reaped) so that callers never spin forever on a missing process.
    $waitForChild = static function ($pid) {
        while (true) {
            $result = pcntl_waitpid($pid, $status, WNOHANG);
            if ($result > 0) {
                return $status;
            }
            if ($result < 0) {
                return null;
            }
            sleep(10);
        }
    };

    // null = no child process, 0 = we are the child, > 0 = PID of the child
    $childPid = null;
    try {
        if ($this->backgroundUpdates && !$compare) {
            $this->log->log(
                'updateRecords',
                "Using {$this->backgroundUpdates} thread(s) for updates"
            );
        }

        $needCommit = false;

        if (isset($fromDate) && $fromDate) {
            $mongoFromDate = new MongoDate(strtotime($fromDate));
        }

        if (!isset($fromDate)) {
            $state = $this->db->state->find(['_id' => 'Last Index Update'])->limit(-1)->timeout($this->cursorTimeout)->getNext();
            if (isset($state)) {
                $mongoFromDate = $state['value'];
            } else {
                unset($mongoFromDate);
            }
        }
        $from = isset($mongoFromDate)
            ? date('Y-m-d H:i:s', $mongoFromDate->sec)
            : 'the beginning';

        // Take the last indexing date now and store it when done
        $lastIndexingDate = new MongoDate();

        if (!$delete && $this->threadedMergedRecordUpdate && !$compare) {
            $forkResult = pcntl_fork();
            if ($forkResult == -1) {
                // Leave $childPid as null: storing -1 would make the
                // exception handler below signal and wait for PID -1, i.e.
                // every process we are allowed to signal.
                throw new Exception("Could not fork merged record background update child");
            }
            $childPid = $forkResult;
        }

        if (!$childPid) {
            // Either no fork took place or this is the forked child
            $needCommit = $this->processMerged(
                isset($mongoFromDate) ? $mongoFromDate : null,
                $sourceId,
                $singleId,
                $noCommit,
                $delete,
                $compare
            );

            if ($childPid !== null) {
                // Child: report the commit requirement via the exit code
                exit($needCommit ? 1 : 0);
            }
        }

        if ($delete) {
            return;
        }

        $this->log->log(
            'updateRecords',
            "Creating individual record list (from {$from})"
        );
        $params = [];
        if ($singleId) {
            $params['_id'] = $singleId;
            $params['dedup_id'] = ['$exists' => false];
            // A single record update must not move the last update date
            $lastIndexingDate = null;
        } else {
            if (isset($mongoFromDate)) {
                $params['updated'] = ['$gte' => $mongoFromDate];
            }
            if ($sourceId) {
                $sources = explode(',', $sourceId);
                if (count($sources) == 1) {
                    $params['source_id'] = $sourceId;
                } else {
                    $sourceParams = [];
                    foreach ($sources as $source) {
                        $sourceParams[] = ['source_id' => $source];
                    }
                    $params['$or'] = $sourceParams;
                }
            }
            $params['dedup_id'] = ['$exists' => false];
            $params['update_needed'] = false;
        }
        $records = $this->db->record->find($params)->timeout($this->cursorTimeout);
        $records->immortal(true);

        $total = $this->counts ? $records->count() : 'the';
        $count = 0;
        $mergedComponents = 0;
        $deleted = 0;
        if ($noCommit) {
            $this->log->log(
                'updateRecords',
                "{$initVerb} {$total} individual records (with no forced commits)"
            );
        } else {
            $this->log->log(
                'updateRecords',
                "{$initVerb} {$total} individual records (max commit interval "
                . "{$this->commitInterval} records)"
            );
        }
        $pc = new PerformanceCounter();
        $this->initBufferedUpdate();
        foreach ($records as $record) {
            if (isset($this->terminate)) {
                if ($childPid) {
                    $this->log->log(
                        'updateRecords',
                        'Waiting for child process to terminate...'
                    );
                    $waitForChild($childPid);
                }
                $this->log->log(
                    'updateRecords',
                    'Termination upon request (individual record handler)'
                );
                exit(1);
            }
            if (isset($record['update_needed']) && $record['update_needed']) {
                $this->log->log(
                    'updateRecords',
                    "Record {$record['_id']} needs deduplication and would not"
                    . " be processed in a normal update",
                    Logger::WARNING
                );
            }

            if ($record['deleted']) {
                if (!$compare) {
                    $this->bufferedDelete((string) $record['_id']);
                }
                ++$count;
                ++$deleted;
            } else {
                $data = $this->createSolrArray($record, $mergedComponents);
                if ($data === false) {
                    continue;
                }

                if ($this->verbose) {
                    echo "Metadata for record {$record['_id']}: \n";
                    $this->prettyPrint($data);
                }

                ++$count;

                if (!$compare) {
                    $res = $this->bufferedUpdate(
                        $data, $count, $childPid || $noCommit
                    );
                } else {
                    $res = $count % 1000 == 0;
                    $this->compareWithSolrRecord($data, $compare);
                }
                if ($res) {
                    $pc->add($count);
                    $avg = $pc->getSpeed();
                    $this->log->log(
                        'updateRecords',
                        "{$count} individual records (of which {$deleted} deleted) "
                        . "with {$mergedComponents} merged parts {$verb}, "
                        . "{$avg} records/sec"
                    );

                    // Check child status
                    if ($childPid) {
                        $pid = pcntl_waitpid($childPid, $status, WNOHANG);
                        if ($pid > 0) {
                            $childPid = null;
                            $exitCode = pcntl_wexitstatus($status);
                            if ($exitCode == 1) {
                                $needCommit = true;
                            } elseif ($exitCode) {
                                $this->log->log(
                                    'updateRecords',
                                    "Merged record update thread failed, "
                                    . "aborting",
                                    Logger::ERROR
                                );
                                throw new Exception('Merged record update thread failed');
                            }
                        }
                    }
                }
            }
        }
        $this->flushUpdateBuffer();

        if (isset($lastIndexingDate) && !$compare) {
            $state = [
                '_id' => "Last Index Update",
                'value' => $lastIndexingDate
            ];
            $this->db->state->save($state);
        }
        if ($count > 0) {
            $needCommit = true;
        }
        $this->log->log(
            'updateRecords',
            "Total {$count} individual records (of which {$deleted} deleted) with "
            . "{$mergedComponents} merged parts {$verb}"
        );

        if ($childPid) {
            // Wait for child to finish
            $status = $waitForChild($childPid);
            // The child is gone now; forget the PID so that a later failure
            // (e.g. during the final commit) does not make the exception
            // handler signal and wait for a process that no longer exists.
            $childPid = null;
            if ($status !== null) {
                $exitCode = pcntl_wexitstatus($status);
                if ($exitCode == 1) {
                    $needCommit = true;
                } elseif ($exitCode) {
                    $this->log->log(
                        'updateRecords',
                        'Merged record update thread failed',
                        Logger::ERROR
                    );
                }
            }
        }

        if (!$noCommit && $needCommit && !$compare && !$this->dumpPrefix) {
            $this->waitForHttpChildren();
            $this->log->log('updateRecords', "Final commit...");
            $this->solrRequest('{ "commit": {} }', 3600);
            $this->waitForHttpChildren();
            $this->log->log('updateRecords', "Commit complete");
        }
    } catch (Exception $e) {
        $this->log->log(
            'updateRecords',
            'Exception: ' . $e->getMessage() . ' at ' . $e->getFile() . ':'
            . $e->getLine(),
            Logger::FATAL
        );
        if ($childPid > 0) {
            // Kill the child process too
            posix_kill($childPid, SIGINT);
            // Wait for child to finish
            $waitForChild($childPid);
        }
        if ($this->threadedMergedRecordUpdate && !$childPid) {
            // Reached by the forked child (PID 0), where a non-0/1 exit code
            // signals failure to the parent, and also by this process when no
            // child is alive.
            exit(2);
        }
    }
}