/**
 * Process a complete record set harvested e.g. from MetaLib
 *
 * The harvested records are treated as the full, authoritative content of
 * the data source:
 *  - non-deleted database records of the source that are missing from the
 *    harvested set are marked deleted,
 *  - database records whose stored original data differs from the serialized
 *    form of the harvested record are updated,
 *  - harvested records not yet in the database are added.
 *
 * @param string   $source           Source ID
 * @param string[] $harvestedRecords Array of records
 *
 * @return void
 */
protected function processFullRecordSet($source, $harvestedRecords)
{
    $this->log->log('processFullRecordSet', "[{$source}] Processing complete record set");

    // Create keyed array (key: "<source>.<record id>", the form used for
    // '_id' in the database). A later record with the same ID replaces an
    // earlier one.
    $records = [];
    foreach ($harvestedRecords as $record) {
        $marc = RecordFactory::createRecord('marc', $record, '', $source);
        $id = $marc->getID();
        $records["{$source}.{$id}"] = $record;
    }

    $this->log->log('processFullRecordSet', "[{$source}] Merging results with the records in database");
    $deleted = 0;
    $unchanged = 0;
    $changed = 0;
    $added = 0;
    $dbRecords = $this->db->record
        ->find(['deleted' => false, 'source_id' => $source])
        ->timeout($this->cursorTimeout);
    foreach ($dbRecords as $dbRecord) {
        $id = $dbRecord['_id'];
        if (!isset($records[$id])) {
            // Record not in harvested records, mark deleted
            $this->storeRecord($id, true, '');
            ++$deleted;
            continue;
        }

        // Check if the record has changed. Strict comparison is required:
        // a loose comparison of two strings falls back to numeric comparison
        // when both happen to look like numbers.
        $marc = RecordFactory::createRecord('marc', $records[$id], '', $source);
        if ($marc->serialize() !== MetadataUtils::getRecordData($dbRecord, false)) {
            // Record changed, update...
            $this->storeRecord($id, false, $records[$id]);
            ++$changed;
        } else {
            ++$unchanged;
        }
        // Whatever remains in $records after this loop is new
        unset($records[$id]);
    }

    $this->log->log('processFullRecordSet', "[{$source}] Adding new records");
    foreach ($records as $id => $record) {
        $this->storeRecord($id, false, $record);
        ++$added;
    }

    $this->log->log(
        'processFullRecordSet',
        "[{$source}] {$added} new, {$changed} changed, {$unchanged} unchanged and "
        . "{$deleted} deleted records processed"
    );
}
/**
 * Create Solr array for the given record
 *
 * Steps, in order: create the metadata record, resolve the data source
 * settings, decide whether the record is a hidden (merged) component part,
 * merge component parts into a host record, build the field array (XSLT or
 * driver), add host/component links, apply extra fields and mapping files,
 * prefix buildings with the institution code, expand hierarchical facets,
 * build 'allfields', add timestamps and finally normalize the values and
 * drop empty ones.
 *
 * @param array   $record           Mongo record
 * @param integer $mergedComponents Number of component parts merged to the
 * record (passed by reference; incremented by the number of parts merged here)
 *
 * @return array|false Solr field array, or false when the record is a hidden
 * component part and the 'indexMergedParts' setting is not enabled
 * @throws Exception If no settings exist for the record's data source
 */
protected function createSolrArray($record, &$mergedComponents)
{
    global $configArray;

    $metadataRecord = RecordFactory::createRecord(
        $record['format'],
        MetadataUtils::getRecordData($record, true),
        $record['oai_id'],
        $record['source_id']
    );

    $source = $record['source_id'];
    if (!isset($this->settings[$source])) {
        // Try to reload data source settings as they might have been updated
        // during a long run
        $this->loadDatasources();
        if (!isset($this->settings[$source])) {
            $this->log->log(
                'createSolrArray',
                "No settings found for data source '{$source}', record "
                . "{$record['_id']}: " . $this->prettyPrint($record, true),
                Logger::FATAL
            );
            throw new Exception("No settings found for data source '{$source}'");
        }
    }
    $settings = $this->settings[$source];

    // A component part (a record with a host) may be hidden when it is merged
    // into its host record.
    // NOTE(review): unlike the merge decision further below, this test treats
    // 'merge_non_articles' and 'merge_non_earticles' identically - confirm
    // that this is intended.
    $hiddenComponent = false;
    if (isset($record['host_record_id'])) {
        if ($settings['componentParts'] == 'merge_all') {
            $hiddenComponent = true;
        } elseif ($settings['componentParts'] == 'merge_non_articles'
            || $settings['componentParts'] == 'merge_non_earticles'
        ) {
            $format = $metadataRecord->getFormat();
            if (!in_array($format, $this->allArticleFormats)) {
                $hiddenComponent = true;
            } elseif (in_array($format, $this->articleFormats)) {
                $hiddenComponent = true;
            }
        }
    }

    if ($hiddenComponent && !$settings['indexMergedParts']) {
        return false;
    }

    $hasComponentParts = false;
    $components = null;
    if (!isset($record['host_record_id'])) {
        // Fetch info whether component parts exist and need to be merged.
        // empty() also covers a record without a linking_id key.
        if (empty($record['linking_id'])) {
            $this->log->log(
                'createSolrArray',
                "linking_id missing for record '{$record['_id']}'",
                Logger::ERROR
            );
        } else {
            $components = $this->db->record->find(
                [
                    'source_id' => $record['source_id'],
                    'host_record_id' => $record['linking_id'],
                    'deleted' => false,
                ]
            )->timeout($this->cursorTimeout);
            $hasComponentParts = $components->hasNext();
            $format = $metadataRecord->getFormat();
            $merge = false;
            if ($settings['componentParts'] == 'merge_all') {
                $merge = true;
            } elseif (!in_array($format, $this->allJournalFormats)) {
                $merge = true;
            } elseif (in_array($format, $this->journalFormats)
                && $settings['componentParts'] == 'merge_non_earticles'
            ) {
                $merge = true;
            }
            if (!$merge) {
                unset($components);
            }
        }
    }

    if (isset($components)) {
        $mergedComponents += $metadataRecord->mergeComponentParts($components);
    }

    if (isset($settings['solrTransformationXSLT'])) {
        $params = [
            'source_id' => $source,
            'institution' => $settings['institution'],
            'format' => $settings['format'],
            'id_prefix' => $settings['idPrefix'],
        ];
        $data = $settings['solrTransformationXSLT']
            ->transformToSolrArray($metadataRecord->toXML(), $params);
    } else {
        $prependTitleWithSubtitle = isset($settings['prepend_title_with_subtitle'])
            ? $settings['prepend_title_with_subtitle']
            : true;
        $data = $metadataRecord->toSolrArray($prependTitleWithSubtitle);
        $this->enrich($source, $settings, $metadataRecord, $data);
    }

    $data['id'] = $record['_id'];

    // Record links between host records and component parts
    if ($metadataRecord->getIsComponentPart()) {
        $hostRecord = null;
        if (isset($record['host_record_id']) && $this->db) {
            $hostRecord = $this->db->record->find(
                [
                    'source_id' => $record['source_id'],
                    'linking_id' => $record['host_record_id'],
                ]
            )->limit(-1)->timeout($this->cursorTimeout)->getNext();
        }
        if (!$hostRecord) {
            if (isset($record['host_record_id'])) {
                $this->log->log(
                    'createSolrArray',
                    "Host record '" . $record['host_record_id']
                    . "' not found for record '" . $record['_id'] . "'",
                    Logger::WARNING
                );
            }
            $data['container_title'] = $metadataRecord->getContainerTitle();
        } else {
            $data['hierarchy_parent_id'] = $hostRecord['_id'];
            $hostMetadataRecord = RecordFactory::createRecord(
                $hostRecord['format'],
                MetadataUtils::getRecordData($hostRecord, true),
                $hostRecord['oai_id'],
                $hostRecord['source_id']
            );
            $data['container_title'] = $data['hierarchy_parent_title']
                = $hostMetadataRecord->getTitle();
        }
        $data['container_volume'] = $metadataRecord->getVolume();
        $data['container_issue'] = $metadataRecord->getIssue();
        $data['container_start_page'] = $metadataRecord->getStartPage();
        $data['container_reference'] = $metadataRecord->getContainerReference();
    } else {
        // Add prefixes to hierarchy linking fields
        foreach (['hierarchy_top_id', 'hierarchy_parent_id', 'is_hierarchy_id'] as $field) {
            if (isset($data[$field]) && $data[$field]) {
                $data[$field] = $record['source_id'] . '.' . $data[$field];
            }
        }
    }
    if ($hasComponentParts) {
        $data['is_hierarchy_id'] = $record['_id'];
        $data['is_hierarchy_title'] = $metadataRecord->getTitle();
    }

    if (!isset($data['institution'])) {
        $data['institution'] = $settings['institution'];
    }

    // Extra fields: each entry is a single-element array (name => value).
    // An existing field is turned into a multi-valued one.
    foreach ($settings['extraFields'] as $extraField) {
        $fieldName = key($extraField);
        $fieldValue = current($extraField);
        if (isset($data[$fieldName])) {
            if (!is_array($data[$fieldName])) {
                $data[$fieldName] = [$data[$fieldName]];
            }
            $data[$fieldName][] = $fieldValue;
        } else {
            $data[$fieldName] = $fieldValue;
        }
    }

    // Map field values according to any mapping files
    foreach ($settings['mappingFiles'] as $field => $map) {
        if (isset($data[$field]) && !empty($data[$field])) {
            if (is_array($data[$field])) {
                // Map every value on its own, exactly like the scalar case
                // below: mapped value, else '##default', else the value
                // unchanged. Collecting the results keeps the field
                // multi-valued instead of letting the last mapped value
                // overwrite all the others.
                $newValues = [];
                foreach ($data[$field] as $value) {
                    if (isset($map[$value])) {
                        $replacement = $map[$value];
                    } elseif (isset($map['##default'])) {
                        $replacement = $map['##default'];
                    } else {
                        $replacement = $value;
                    }
                    // A mapping may yield several values
                    foreach ((array)$replacement as $mapped) {
                        $newValues[] = $mapped;
                    }
                }
                $data[$field] = array_values(array_unique($newValues));
            } else {
                if (isset($map[$data[$field]])) {
                    $data[$field] = $map[$data[$field]];
                } elseif (isset($map['##default'])) {
                    $data[$field] = $map['##default'];
                }
            }
        } elseif (isset($map['##empty'])) {
            $data[$field] = $map['##empty'];
        } elseif (isset($map['##emptyarray'])) {
            $data[$field] = [$map['##emptyarray']];
        }
    }

    // Special case: Special values for building (institution/location).
    // Used by default if building is set as a hierarchical facet.
    if ($this->buildingHierarchy || isset($settings['institutionInBuilding'])) {
        $useInstitution = isset($settings['institutionInBuilding'])
            ? $settings['institutionInBuilding']
            : 'institution';
        switch ($useInstitution) {
        case 'driver':
            $institutionCode = $data['institution'];
            break;
        case 'none':
            $institutionCode = '';
            break;
        case 'source':
            $institutionCode = $source;
            break;
        case 'institution/source':
            $institutionCode = $settings['institution'] . '/' . $source;
            break;
        default:
            $institutionCode = $settings['institution'];
            break;
        }
        if ($institutionCode) {
            if (isset($data['building']) && $data['building']) {
                if (is_array($data['building'])) {
                    foreach ($data['building'] as &$building) {
                        // Allow also empty values that might result from
                        // mapping tables
                        if ($building !== '') {
                            $building = "{$institutionCode}/{$building}";
                        }
                    }
                    // Break the reference so later code cannot write through it
                    unset($building);
                } else {
                    $data['building'] = $institutionCode . '/' . $data['building'];
                }
            } else {
                $data['building'] = [$institutionCode];
            }
        }
    }

    // Hierarchical facets: 'a/b' becomes ['0/a/', '1/a/b/']
    if (isset($configArray['Solr']['hierarchical_facets'])) {
        foreach ($configArray['Solr']['hierarchical_facets'] as $facet) {
            if (!isset($data[$facet])) {
                continue;
            }
            $array = [];
            if (!is_array($data[$facet])) {
                $data[$facet] = [$data[$facet]];
            }
            foreach ($data[$facet] as $datavalue) {
                if ($datavalue === '') {
                    continue;
                }
                $hierarchyString = '';
                foreach (explode('/', $datavalue) as $level => $part) {
                    $hierarchyString .= '/' . $part;
                    $array[] = $level . $hierarchyString . '/';
                }
            }
            $data[$facet] = $array;
        }
    }

    if (!isset($data['allfields'])) {
        $all = [];
        foreach ($data as $key => $field) {
            if (in_array($key, ['fullrecord', 'thumbnail', 'id', 'recordtype', 'ctrlnum'], true)) {
                continue;
            }
            if (is_array($field)) {
                $all = array_merge($all, $field);
            } else {
                $all[] = $field;
            }
        }
        $data['allfields'] = MetadataUtils::array_iunique($all);
    }

    $data['first_indexed'] = MetadataUtils::formatTimestamp($record['created']->sec);
    $data['last_indexed'] = MetadataUtils::formatTimestamp($record['date']->sec);
    $data['recordtype'] = $record['format'];
    if (!isset($data['fullrecord'])) {
        $data['fullrecord'] = $metadataRecord->toXML();
    }
    if (!is_array($data['format'])) {
        $data['format'] = [$data['format']];
    }

    if (isset($configArray['Solr']['format_in_allfields'])
        && $configArray['Solr']['format_in_allfields']
    ) {
        foreach ($data['format'] as $format) {
            // Replace numbers since they may be be considered word boundaries
            $data['allfields'][] = str_replace(
                ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
                ['ax', 'bx', 'cx', 'dx', 'ex', 'fx', 'gx', 'hx', 'ix', 'jx'],
                MetadataUtils::normalize($format)
            );
        }
    }

    if ($hiddenComponent) {
        $data['hidden_component_boolean'] = true;
    }

    // Normalize all values and drop the empty ones. empty() is true for
    // '', '0', 0, 0.0, null, false and [], i.e. every value that is removed.
    foreach ($data as $key => &$values) {
        if (is_array($values)) {
            // The inner loop needs its own key variable: reusing $key would
            // make the unset() below target the last inner index instead of
            // the field, leaving emptied fields in the result.
            foreach ($values as $valueKey => &$value) {
                $value = MetadataUtils::normalizeUnicode($value);
                if (empty($value)) {
                    unset($values[$valueKey]);
                }
            }
            unset($value);
            $values = array_values(array_unique($values));
        } elseif ($key !== 'fullrecord') {
            $values = MetadataUtils::normalizeUnicode($values);
        }
        if (empty($values)) {
            unset($data[$key]);
        }
    }
    unset($values);

    return $data;
}
/**
 * Deduplicate component parts of a record
 *
 * Only records sharing the host record's dedup id (and coming from another
 * source) are considered. For each of them the sorted component parts are
 * compared pairwise, by position, with the host record's sorted component
 * parts. The first record whose parts all match has every pair marked as
 * duplicates, after which processing stops.
 *
 * @param array $hostRecord Mongo record for the host record
 *
 * @return integer Number of component parts deduplicated
 */
protected function dedupComponentParts($hostRecord)
{
    if ($this->verbose) {
        echo "Deduplicating component parts\n";
    }
    if (!$hostRecord['linking_id']) {
        $this->log->log(
            'dedupComponentParts',
            'Linking ID missing from record ' . $hostRecord['_id'],
            Logger::ERROR
        );
        return 0;
    }

    $hostParts = $this->getComponentPartsSorted(
        $hostRecord['source_id'],
        $hostRecord['linking_id']
    );
    $hostPartCount = count($hostParts);

    $marked = 0;
    $candidates = $this->db->record
        ->find(['dedup_id' => $hostRecord['dedup_id'], 'deleted' => false])
        ->timeout($this->cursorTimeout);
    foreach ($candidates as $candidate) {
        // Records of the host's own source are never compared
        if ($candidate['source_id'] == $hostRecord['source_id']) {
            continue;
        }

        $candidateParts = $this->getComponentPartsSorted(
            $candidate['source_id'],
            $candidate['linking_id']
        );

        // A differing number of parts can never be a full match
        $allMatch = $hostPartCount == count($candidateParts);
        if ($allMatch) {
            $position = 0;
            foreach ($hostParts as $hostPart) {
                $candidatePart = $candidateParts[$position++];
                if ($this->verbose) {
                    echo "Comparing {$hostPart['_id']} with "
                        . "{$candidatePart['_id']}\n";
                    echo 'Original ' . $hostPart['_id'] . ":\n"
                        . MetadataUtils::getRecordData($hostPart, true) . "\n";
                }
                $hostPartMetadata = RecordFactory::createRecord(
                    $hostPart['format'],
                    MetadataUtils::getRecordData($hostPart, true),
                    $hostPart['oai_id'],
                    $hostPart['source_id']
                );
                if (!$this->matchRecords($hostPart, $hostPartMetadata, $candidatePart)) {
                    $allMatch = false;
                    break;
                }
            }
        }

        if (!$allMatch) {
            if ($this->verbose) {
                echo microtime(true) . " Not all component parts match between "
                    . "{$hostRecord['_id']} and {$candidate['_id']}\n";
            }
            continue;
        }

        if ($this->verbose) {
            echo microtime(true) . " All component parts match between "
                . "{$hostRecord['_id']} and {$candidate['_id']}\n";
        }
        $position = 0;
        foreach ($hostParts as $hostPart) {
            $this->markDuplicates($hostPart, $candidateParts[$position++]);
            ++$marked;
        }
        // One fully matching set is enough
        break;
    }

    return $marked;
}
/**
 * Merge component parts to this record
 *
 * Every component part is summarized into one 979 field with the subfields
 *   a = component part id, b = title (245 a/b/n/p), c = first author
 *   (100/110), d = further and additional authors (700/710),
 *   e = uniform title (240, else 130), f = first duration (306 a),
 *   g = additional titles (740 a), h = languages (008/35-37, 041 a/d),
 *   i = original languages (041 h), j = subtitle languages (041 j).
 * The fields are ordered by the sort key created from the part id, and any
 * existing 979 fields of this record are replaced.
 *
 * @param MongoCollection $componentParts Component parts to be merged
 * (iterated with foreach; every item must provide '_id' and the data read by
 * MetadataUtils::getRecordData())
 *
 * @return int Count of records merged
 */
public function mergeComponentParts($componentParts)
{
    // Accept only three-character codes other than the 'zxx' and 'und'
    // placeholders
    $isUsableLanguage = function ($code) {
        return preg_match('/^\\w{3}$/', $code) && $code != 'zxx' && $code != 'und';
    };

    $count = 0;
    $parts = [];
    foreach ($componentParts as $componentPart) {
        $marc = new MARCRecord(
            MetadataUtils::getRecordData($componentPart, true),
            '',
            $this->source,
            $this->idPrefix
        );

        $title = $marc->getFieldSubfields(
            '245',
            ['a' => 1, 'b' => 1, 'n' => 1, 'p' => 1]
        );
        $uniTitle = $marc->getFieldSubfields('240', ['a' => 1, 'n' => 1, 'p' => 1]);
        if (!$uniTitle) {
            $uniTitle = $marc->getFieldSubfields(
                '130',
                ['a' => 1, 'n' => 1, 'p' => 1]
            );
        }
        $additionalTitles = $marc->getFieldsSubfields(
            [[MarcRecord::GET_NORMAL, '740', ['a' => 1]]]
        );
        $authors = $marc->getFieldsSubfields(
            [
                [MarcRecord::GET_NORMAL, '100', ['a' => 1, 'e' => 1]],
                [MarcRecord::GET_NORMAL, '110', ['a' => 1, 'e' => 1]],
            ]
        );
        $additionalAuthors = $marc->getFieldsSubfields(
            [
                [MarcRecord::GET_NORMAL, '700', ['a' => 1, 'e' => 1]],
                [MarcRecord::GET_NORMAL, '710', ['a' => 1, 'e' => 1]],
            ]
        );
        $duration = $marc->getFieldsSubfields(
            [[MarcRecord::GET_NORMAL, '306', ['a' => 1]]]
        );
        $fixedFieldLanguage = substr($marc->getField('008'), 35, 3);
        $languages = array_unique(
            array_merge(
                [$fixedFieldLanguage],
                $marc->getFieldsSubfields(
                    [
                        [MarcRecord::GET_NORMAL, '041', ['a' => 1]],
                        [MarcRecord::GET_NORMAL, '041', ['d' => 1]],
                    ],
                    false,
                    true,
                    true
                )
            )
        );
        $originalLanguages = $marc->getFieldsSubfields(
            [[MarcRecord::GET_NORMAL, '041', ['h' => 1]]],
            false,
            true,
            true
        );
        $subtitleLanguages = $marc->getFieldsSubfields(
            [[MarcRecord::GET_NORMAL, '041', ['j' => 1]]],
            false,
            true,
            true
        );

        // Collect the subfields in their final order
        $id = $componentPart['_id'];
        $subfields = [['a' => $id]];
        if ($title) {
            $subfields[] = ['b' => $title];
        }
        if ($authors) {
            // The first author goes to $c, all the others to $d
            $authorCode = 'c';
            foreach ($authors as $author) {
                $subfields[] = [$authorCode => $author];
                $authorCode = 'd';
            }
        }
        foreach ($additionalAuthors as $addAuthor) {
            $subfields[] = ['d' => $addAuthor];
        }
        if ($uniTitle) {
            $subfields[] = ['e' => $uniTitle];
        }
        if ($duration) {
            $subfields[] = ['f' => reset($duration)];
        }
        foreach ($additionalTitles as $addTitle) {
            $subfields[] = ['g' => $addTitle];
        }
        $languageGroups = [
            'h' => $languages,
            'i' => $originalLanguages,
            'j' => $subtitleLanguages,
        ];
        foreach ($languageGroups as $subfieldCode => $candidates) {
            foreach ($candidates as $candidate) {
                if ($isUsableLanguage($candidate)) {
                    $subfields[] = [$subfieldCode => $candidate];
                }
            }
        }

        // The running count keeps the key unique for identical sort keys
        $sortKey = MetadataUtils::createIdSortKey($id);
        $parts["{$sortKey} {$count}"] = ['i1' => ' ', 'i2' => ' ', 's' => $subfields];
        ++$count;
    }

    ksort($parts);
    $this->fields['979'] = array_values($parts);

    return $count;
}