/**
 * Read a sample record file and convert it to a Solr array
 *
 * The file is looked up in the "samples" directory that sits one level above
 * the directory of this source file.
 *
 * @param string $sample Sample record file
 *
 * @return array SOLR record array
 */
protected function processSample($sample)
{
    $samplePath = __DIR__ . "/../samples/" . $sample;
    $contents = file_get_contents($samplePath);

    // Fixed placeholder id and source are used for the created record
    return RecordFactory::createRecord(
        $this->driver,
        $contents,
        "__unit_test_no_id__",
        "__unit_test_no_source__"
    )->toSolrArray();
}
/**
 * Creates a preview of the given metadata and returns it
 *
 * The metadata is optionally run through the data source's preTransformation
 * and normalization stylesheets, turned into a metadata record, normalized and
 * finally passed to createSolrArray().
 *
 * @param string $metadata The metadata to process
 * @param string $format   Metadata format
 * @param string $source   Source identifier ("_preview" is used when empty)
 *
 * @return array Solr record fields (the result of createSolrArray())
 */
public function preview($metadata, $format, $source)
{
    if (!$source) {
        $source = "_preview";
    }

    // Fall back to an empty settings array instead of raising an undefined
    // index notice; createSolrArray() reports a missing data source itself.
    $settings = isset($this->settings[$source]) ? $this->settings[$source] : [];

    /* Process data source preTransformation XSL if present
       TODO: duplicates code from RecordManager, refactor? */
    if (!empty($settings['preTransformation'])) {
        $style = new DOMDocument();
        $style->load(
            $this->basePath . '/transformations/' . $settings['preTransformation']
        );
        $xslt = new XSLTProcessor();
        $xslt->importStylesheet($style);
        $xslt->setParameter('', 'source_id', $source);
        $xslt->setParameter('', 'institution', $settings['institution']);
        $xslt->setParameter('', 'format', $format);
        $xslt->setParameter(
            '',
            'id_prefix',
            !empty($settings['idPrefix']) ? $settings['idPrefix'] : $source
        );
        $doc = new DOMDocument();
        $doc->loadXML($metadata);
        $metadata = $xslt->transformToXml($doc);
    }

    $record = [
        'format' => $format,
        'original_data' => $metadata,
        'normalized_data' => $metadata,
        'source_id' => $source,
        'linking_id' => '',
        'oai_id' => '_preview',
        '_id' => '_preview',
        'created' => new MongoDate(),
        'date' => new MongoDate()
    ];

    // Normalize the record
    $hasNormalization = !empty($settings['normalization']);
    // NOTE(review): because && binds tighter than ?:, the original expression
    // stored true/null here rather than the stylesheet name. That value is kept
    // unchanged; confirm what readers of $this->normalizationXSLT expect.
    $this->normalizationXSLT = $hasNormalization ?: null;
    // Only build the transformation when a stylesheet is actually configured
    // (previously an empty 'normalization' setting still entered this branch).
    if ($hasNormalization) {
        // Directory one level above the directory containing this file
        $basePath = dirname(__DIR__);
        $params = [
            'source_id' => $source,
            'institution' => 'Preview',
            'format' => $format,
            'id_prefix' => ''
        ];
        $normalizationXSLT = new XslTransformation(
            $basePath . '/transformations',
            $settings['normalization'],
            $params
        );
        $record['normalized_data'] = $normalizationXSLT->transform(
            $metadata,
            ['oai_id' => $record['oai_id']]
        );
    }

    $metadataRecord = RecordFactory::createRecord(
        $record['format'],
        $record['normalized_data'],
        $record['oai_id'],
        $record['source_id']
    );
    $metadataRecord->normalize();
    $record['normalized_data'] = $metadataRecord->serialize();
    $record['_id'] = $source . '.' . $metadataRecord->getID();

    // createSolrArray() adds to this counter by reference, so it must start
    // from a defined numeric value.
    $componentParts = 0;
    return $this->createSolrArray($record, $componentParts);
}
/**
 * Process a complete record set harvested e.g. from MetaLib
 *
 * The harvested records are compared with the non-deleted records stored for
 * the source: stored records missing from the harvest are marked deleted,
 * records whose serialized form differs are updated, and harvested records
 * not yet stored are added.
 *
 * @param string   $source           Source ID
 * @param string[] $harvestedRecords Array of records
 *
 * @return void
 */
protected function processFullRecordSet($source, $harvestedRecords)
{
    $this->log->log('processFullRecordSet', "[{$source}] Processing complete record set");

    // Index the harvested records by "<source>.<record id>"
    $pending = [];
    foreach ($harvestedRecords as $harvested) {
        $parsed = RecordFactory::createRecord('marc', $harvested, '', $source);
        $pending[$source . '.' . $parsed->getID()] = $harvested;
    }

    $this->log->log('processFullRecordSet', "[{$source}] Merging results with the records in database");

    $added = 0;
    $changed = 0;
    $unchanged = 0;
    $deleted = 0;

    $stored = $this->db->record
        ->find(['deleted' => false, 'source_id' => $source])
        ->timeout($this->cursorTimeout);
    foreach ($stored as $dbRecord) {
        $id = $dbRecord['_id'];
        if (isset($pending[$id])) {
            // Check if the record has changed
            $incoming = $pending[$id];
            $marc = RecordFactory::createRecord('marc', $incoming, '', $source);
            if ($marc->serialize() != MetadataUtils::getRecordData($dbRecord, false)) {
                // Record changed, update...
                $this->storeRecord($id, false, $incoming);
                ++$changed;
            } else {
                ++$unchanged;
            }
        } else {
            // Record not in harvested records, mark deleted
            $this->storeRecord($id, true, '');
            ++$deleted;
        }
        // Whatever remains in $pending after this loop is new
        unset($pending[$id]);
    }

    $this->log->log('processFullRecordSet', "[{$source}] Adding new records");
    foreach ($pending as $id => $newRecord) {
        $this->storeRecord($id, false, $newRecord);
        ++$added;
    }

    $this->log->log(
        'processFullRecordSet',
        "[{$source}] {$added} new, {$changed} changed, {$unchanged} unchanged and "
        . "{$deleted} deleted records processed"
    );
}
/**
 * Create record XML
 *
 * Builds a <record> element with a header (identifier, datestamp, set specs
 * and an optional deleted status) and, when requested, the record metadata.
 * The metadata is run through the data source's "transformation_to_<format>"
 * stylesheet when one is configured.
 *
 * @param array   $record          Mongo record
 * @param string  $format          Metadata format
 * @param boolean $includeMetadata Whether to include record data
 * (or only header)
 *
 * @return boolean|string Record XML, or false if the record cannot be
 * disseminated in the requested format (an error is registered first)
 */
protected function createRecord($record, $format, $includeMetadata)
{
    global $basePath;

    $sourceFormat = $record['format'];
    // Translate the requested format to its configured target format
    if (isset($this->formats[$format])) {
        $format = $this->formats[$format]['format'];
    }

    $metadata = '';
    if ($includeMetadata) {
        // Prefer normalized data, fall back to the original data
        $mongodata = $record['normalized_data'] ?: $record['original_data'];
        $metadataRecord = RecordFactory::createRecord(
            $record['format'],
            gzinflate($mongodata->bin),
            $record['oai_id'],
            $record['source_id']
        );
        $metadata = $metadataRecord->toXML();

        $source = $record['source_id'];
        $key = "transformation_to_{$format}";
        $datasource = $this->dataSourceSettings[$source];

        if (isset($datasource[$key])) {
            // Transformation objects are cached per stylesheet key and source
            $transformationKey = "{$key}_{$source}";
            if (!isset($this->transformations[$transformationKey])) {
                $this->transformations[$transformationKey] = new XslTransformation(
                    $basePath . '/transformations',
                    $datasource[$key]
                );
            }
            $params = [
                'source_id' => $source,
                'institution' => $datasource['institution'],
                'format' => $record['format']
            ];
            $metadata = $this->transformations[$transformationKey]
                ->transform($metadata, $params);
        } elseif ($sourceFormat != $format) {
            // Formats differ and no transformation is configured
            $this->error('cannotDisseminateFormat', '');
            return false;
        }

        // Drop a leading XML declaration so the data can be embedded
        if (strncmp($metadata, '<?xml', 5) == 0) {
            $metadata = substr($metadata, strpos($metadata, '>') + 1);
        }
        $metadata = <<<EOF
      <metadata>
        {$metadata}
      </metadata>

EOF;
    }

    $setSpecs = '';
    foreach ($this->getRecordSets($record) as $setId) {
        $id = $this->escape($setId);
        $setSpecs .= <<<EOF
        <setSpec>{$id}</setSpec>

EOF;
    }

    $status = $record['deleted'] ? ' status="deleted"' : '';
    $date = $this->toOaiDate($record['updated']->sec);
    $id = $this->escape($record['oai_id']);

    return <<<EOF
    <record>
      <header{$status}>
        <identifier>{$id}</identifier>
        <datestamp>{$date}</datestamp>
{$setSpecs}      </header>
{$metadata}    </record>

EOF;
}
/**
 * Deduplicate component parts of a record
 *
 * Component part deduplication is special. It only looks at the component
 * parts of other records that share the host record's dedup id, and it stops
 * at the first record whose component parts all match.
 *
 * @param array $hostRecord Mongo record for the host record
 *
 * @return integer Number of component parts deduplicated
 */
protected function dedupComponentParts($hostRecord)
{
    if ($this->verbose) {
        echo "Deduplicating component parts\n";
    }

    if (!$hostRecord['linking_id']) {
        $this->log->log(
            'dedupComponentParts',
            'Linking ID missing from record ' . $hostRecord['_id'],
            Logger::ERROR
        );
        return 0;
    }

    $hostParts = $this->getComponentPartsSorted(
        $hostRecord['source_id'],
        $hostRecord['linking_id']
    );
    $hostPartCount = count($hostParts);

    // Go through all other records with same dedup id and see if their
    // component parts match
    $marked = 0;
    $candidates = $this->db->record
        ->find(['dedup_id' => $hostRecord['dedup_id'], 'deleted' => false])
        ->timeout($this->cursorTimeout);
    foreach ($candidates as $candidate) {
        // Records from the host record's own source are skipped
        if ($candidate['source_id'] == $hostRecord['source_id']) {
            continue;
        }

        $candidateParts = $this->getComponentPartsSorted(
            $candidate['source_id'],
            $candidate['linking_id']
        );

        // A different number of parts can never match; otherwise compare the
        // parts pairwise by position
        $allMatch = $hostPartCount == count($candidateParts);
        if ($allMatch) {
            $position = 0;
            foreach ($hostParts as $hostPart) {
                $candidatePart = $candidateParts[$position++];
                if ($this->verbose) {
                    echo "Comparing {$hostPart['_id']} with " . "{$candidatePart['_id']}\n";
                    echo 'Original ' . $hostPart['_id'] . ":\n"
                        . MetadataUtils::getRecordData($hostPart, true) . "\n";
                }
                $hostPartMetadata = RecordFactory::createRecord(
                    $hostPart['format'],
                    MetadataUtils::getRecordData($hostPart, true),
                    $hostPart['oai_id'],
                    $hostPart['source_id']
                );
                if (!$this->matchRecords($hostPart, $hostPartMetadata, $candidatePart)) {
                    $allMatch = false;
                    break;
                }
            }
        }

        if (!$allMatch) {
            if ($this->verbose) {
                echo microtime(true) . " Not all component parts match between "
                    . "{$hostRecord['_id']} and {$candidate['_id']}\n";
            }
            continue;
        }

        if ($this->verbose) {
            echo microtime(true) . " All component parts match between "
                . "{$hostRecord['_id']} and {$candidate['_id']}\n";
        }

        // Mark each pair as duplicates and stop at this first matching record
        $position = 0;
        foreach ($hostParts as $hostPart) {
            $this->markDuplicates($hostPart, $candidateParts[$position++]);
            ++$marked;
        }
        break;
    }

    return $marked;
}
/**
 * Create Solr array for the given record
 *
 * Builds the metadata record, merges component parts into host records when
 * the data source settings ask for it, converts the record to Solr fields and
 * then post-processes the fields: host/component links, extra fields, mapping
 * files, building prefixes, hierarchical facets, allfields, timestamps and a
 * final cleanup pass that normalizes values and drops empty ones.
 *
 * @param array   $record           Mongo record
 * @param integer $mergedComponents Number of component parts merged to the
 * record (passed by reference and incremented)
 *
 * @return array|false Solr fields, or false when the record is a hidden
 * component part and the source's indexMergedParts setting is falsy
 * @throws Exception If no settings exist for the record's data source
 */
protected function createSolrArray($record, &$mergedComponents)
{
    global $configArray;

    $metadataRecord = RecordFactory::createRecord(
        $record['format'],
        MetadataUtils::getRecordData($record, true),
        $record['oai_id'],
        $record['source_id']
    );

    $source = $record['source_id'];
    if (!isset($this->settings[$source])) {
        // Try to reload data source settings as they might have been updated
        // during a long run
        $this->loadDatasources();
        if (!isset($this->settings[$source])) {
            $this->log->log(
                'createSolrArray',
                "No settings found for data source '{$source}', record "
                . "{$record['_id']}: " . $this->prettyPrint($record, true),
                Logger::FATAL
            );
            throw new Exception("No settings found for data source '{$source}'");
        }
    }
    $settings = $this->settings[$source];

    // Decide whether this record is a component part that gets merged into
    // its host and is therefore hidden.
    // NOTE(review): formats in articleFormats are hidden for both
    // 'merge_non_articles' and 'merge_non_earticles', while the host-side
    // merge decision below distinguishes the two settings - confirm that this
    // asymmetry is intended.
    $hiddenComponent = false;
    if (isset($record['host_record_id'])) {
        if ($settings['componentParts'] == 'merge_all') {
            $hiddenComponent = true;
        } elseif ($settings['componentParts'] == 'merge_non_articles'
            || $settings['componentParts'] == 'merge_non_earticles'
        ) {
            $format = $metadataRecord->getFormat();
            if (!in_array($format, $this->allArticleFormats)) {
                $hiddenComponent = true;
            } elseif (in_array($format, $this->articleFormats)) {
                $hiddenComponent = true;
            }
        }
    }

    if ($hiddenComponent && !$settings['indexMergedParts']) {
        return false;
    }

    $hasComponentParts = false;
    $components = null;
    if (!isset($record['host_record_id'])) {
        // Fetch info whether component parts exist and need to be merged
        if (!$record['linking_id']) {
            $this->log->log(
                'createSolrArray',
                "linking_id missing for record '{$record['_id']}'",
                Logger::ERROR
            );
        } else {
            $components = $this->db->record->find(
                [
                    'source_id' => $record['source_id'],
                    'host_record_id' => $record['linking_id'],
                    'deleted' => false
                ]
            )->timeout($this->cursorTimeout);
            $hasComponentParts = $components->hasNext();
            $format = $metadataRecord->getFormat();
            $merge = false;
            if ($settings['componentParts'] == 'merge_all') {
                $merge = true;
            } elseif (!in_array($format, $this->allJournalFormats)) {
                $merge = true;
            } elseif (in_array($format, $this->journalFormats)
                && $settings['componentParts'] == 'merge_non_earticles'
            ) {
                $merge = true;
            }
            if (!$merge) {
                unset($components);
            }
        }
    }

    if (isset($components)) {
        $mergedComponents += $metadataRecord->mergeComponentParts($components);
    }

    if (isset($settings['solrTransformationXSLT'])) {
        $params = [
            'source_id' => $source,
            'institution' => $settings['institution'],
            'format' => $settings['format'],
            'id_prefix' => $settings['idPrefix']
        ];
        $data = $settings['solrTransformationXSLT']
            ->transformToSolrArray($metadataRecord->toXML(), $params);
    } else {
        $prependTitleWithSubtitle = isset($settings['prepend_title_with_subtitle'])
            ? $settings['prepend_title_with_subtitle']
            : true;
        $data = $metadataRecord->toSolrArray($prependTitleWithSubtitle);
        $this->enrich($source, $settings, $metadataRecord, $data);
    }

    $data['id'] = $record['_id'];

    // Record links between host records and component parts
    if ($metadataRecord->getIsComponentPart()) {
        $hostRecord = null;
        if (isset($record['host_record_id']) && $this->db) {
            $hostRecord = $this->db->record->find(
                [
                    'source_id' => $record['source_id'],
                    'linking_id' => $record['host_record_id']
                ]
            )->limit(-1)->timeout($this->cursorTimeout)->getNext();
        }
        if (!$hostRecord) {
            if (isset($record['host_record_id'])) {
                $this->log->log(
                    'createSolrArray',
                    "Host record '" . $record['host_record_id']
                    . "' not found for record '" . $record['_id'] . "'",
                    Logger::WARNING
                );
            }
            $data['container_title'] = $metadataRecord->getContainerTitle();
        } else {
            $data['hierarchy_parent_id'] = $hostRecord['_id'];
            $hostMetadataRecord = RecordFactory::createRecord(
                $hostRecord['format'],
                MetadataUtils::getRecordData($hostRecord, true),
                $hostRecord['oai_id'],
                $hostRecord['source_id']
            );
            $data['container_title'] = $data['hierarchy_parent_title']
                = $hostMetadataRecord->getTitle();
        }
        $data['container_volume'] = $metadataRecord->getVolume();
        $data['container_issue'] = $metadataRecord->getIssue();
        $data['container_start_page'] = $metadataRecord->getStartPage();
        $data['container_reference'] = $metadataRecord->getContainerReference();
    } else {
        // Add prefixes to hierarchy linking fields
        $linkFields = ['hierarchy_top_id', 'hierarchy_parent_id', 'is_hierarchy_id'];
        foreach ($linkFields as $field) {
            if (isset($data[$field]) && $data[$field]) {
                $data[$field] = $record['source_id'] . '.' . $data[$field];
            }
        }
    }

    if ($hasComponentParts) {
        $data['is_hierarchy_id'] = $record['_id'];
        $data['is_hierarchy_title'] = $metadataRecord->getTitle();
    }

    if (!isset($data['institution'])) {
        $data['institution'] = $settings['institution'];
    }

    // Append configured extra fields; an existing field becomes multi-valued
    foreach ($settings['extraFields'] as $extraField) {
        $fieldName = key($extraField);
        $fieldValue = current($extraField);
        if (isset($data[$fieldName])) {
            if (!is_array($data[$fieldName])) {
                $data[$fieldName] = [$data[$fieldName]];
            }
            $data[$fieldName][] = $fieldValue;
        } else {
            $data[$fieldName] = $fieldValue;
        }
    }

    // Map field values according to any mapping files
    foreach ($settings['mappingFiles'] as $field => $map) {
        if (!empty($data[$field])) {
            if (is_array($data[$field])) {
                // Map every value on its own. The previous code overwrote a
                // single variable on each iteration, so a multi-valued field
                // collapsed to the mapping of its last value.
                $mappedValues = [];
                foreach ($data[$field] as $value) {
                    if (isset($map[$value])) {
                        $replacement = $map[$value];
                    } elseif (isset($map['##default'])) {
                        $replacement = $map['##default'];
                    } else {
                        // Same as the scalar branch: unmapped values are kept
                        $replacement = $value;
                    }
                    if (is_array($replacement)) {
                        $mappedValues = array_merge($mappedValues, $replacement);
                    } else {
                        $mappedValues[] = $replacement;
                    }
                }
                $data[$field] = array_values(array_unique($mappedValues));
            } else {
                if (isset($map[$data[$field]])) {
                    $data[$field] = $map[$data[$field]];
                } elseif (isset($map['##default'])) {
                    $data[$field] = $map['##default'];
                }
            }
        } elseif (isset($map['##empty'])) {
            $data[$field] = $map['##empty'];
        } elseif (isset($map['##emptyarray'])) {
            $data[$field] = [$map['##emptyarray']];
        }
    }

    // Special case: Special values for building (institution/location).
    // Used by default if building is set as a hierarchical facet.
    if ($this->buildingHierarchy || isset($settings['institutionInBuilding'])) {
        $useInstitution = isset($settings['institutionInBuilding'])
            ? $settings['institutionInBuilding']
            : 'institution';
        switch ($useInstitution) {
        case 'driver':
            $institutionCode = $data['institution'];
            break;
        case 'none':
            $institutionCode = '';
            break;
        case 'source':
            $institutionCode = $source;
            break;
        case 'institution/source':
            $institutionCode = $settings['institution'] . '/' . $source;
            break;
        default:
            $institutionCode = $settings['institution'];
            break;
        }
        if ($institutionCode) {
            if (isset($data['building']) && $data['building']) {
                if (is_array($data['building'])) {
                    foreach ($data['building'] as &$building) {
                        // Allow also empty values that might result from
                        // mapping tables
                        if ($building !== '') {
                            $building = "{$institutionCode}/{$building}";
                        }
                    }
                    // Break the reference so later code cannot write through it
                    unset($building);
                } else {
                    $data['building'] = $institutionCode . '/' . $data['building'];
                }
            } else {
                $data['building'] = [$institutionCode];
            }
        }
    }

    // Hierarchical facets: "a/b" becomes "0/a/" and "1/a/b/"
    if (isset($configArray['Solr']['hierarchical_facets'])) {
        foreach ($configArray['Solr']['hierarchical_facets'] as $facet) {
            if (!isset($data[$facet])) {
                continue;
            }
            $array = [];
            if (!is_array($data[$facet])) {
                $data[$facet] = [$data[$facet]];
            }
            foreach ($data[$facet] as $datavalue) {
                if ($datavalue === '') {
                    continue;
                }
                $hierarchyString = '';
                foreach (explode('/', $datavalue) as $level => $part) {
                    $hierarchyString .= '/' . $part;
                    $array[] = $level . $hierarchyString . '/';
                }
            }
            $data[$facet] = $array;
        }
    }

    if (!isset($data['allfields'])) {
        $all = [];
        $excluded = ['fullrecord', 'thumbnail', 'id', 'recordtype', 'ctrlnum'];
        foreach ($data as $key => $field) {
            if (in_array($key, $excluded, true)) {
                continue;
            }
            if (is_array($field)) {
                $all = array_merge($all, $field);
            } else {
                $all[] = $field;
            }
        }
        $data['allfields'] = MetadataUtils::array_iunique($all);
    }

    $data['first_indexed'] = MetadataUtils::formatTimestamp($record['created']->sec);
    $data['last_indexed'] = MetadataUtils::formatTimestamp($record['date']->sec);
    $data['recordtype'] = $record['format'];
    if (!isset($data['fullrecord'])) {
        $data['fullrecord'] = $metadataRecord->toXML();
    }
    if (!is_array($data['format'])) {
        $data['format'] = [$data['format']];
    }

    if (isset($configArray['Solr']['format_in_allfields'])
        && $configArray['Solr']['format_in_allfields']
    ) {
        foreach ($data['format'] as $format) {
            // Replace numbers since they may be considered word boundaries
            $data['allfields'][] = str_replace(
                ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
                ['ax', 'bx', 'cx', 'dx', 'ex', 'fx', 'gx', 'hx', 'ix', 'jx'],
                MetadataUtils::normalize($format)
            );
        }
    }

    if ($hiddenComponent) {
        $data['hidden_component_boolean'] = true;
    }

    // Final cleanup: normalize values and drop empty ones. empty() also
    // covers 0, 0.0 and '0', which the original checks listed explicitly.
    foreach ($data as $key => &$values) {
        if (is_array($values)) {
            // The inner loop needs its own key variable: reusing $key made the
            // unset() below target the wrong entry, so emptied array fields
            // were never removed.
            foreach ($values as $valueKey => &$value) {
                $value = MetadataUtils::normalizeUnicode($value);
                if (empty($value)) {
                    unset($values[$valueKey]);
                }
            }
            unset($value);
            $values = array_values(array_unique($values));
        } elseif ($key !== 'fullrecord') {
            $values = MetadataUtils::normalizeUnicode($values);
        }
        if (empty($values)) {
            unset($data[$key]);
        }
    }
    // Break the reference before returning the array
    unset($values);

    return $data;
}