Ejemplo n.º 1
0
 /**
  * Synchronise the database with a complete harvested record set
  * (e.g. one harvested from MetaLib)
  *
  * The harvested records are first keyed by "<source>.<record id>". The
  * non-deleted database records of the source are then walked through:
  * records missing from the harvested set are stored as deleted, records
  * whose serialized form differs from the stored data are stored again and
  * the rest are only counted. Anything still left in the harvested set
  * after that is stored as a new record.
  *
  * @param string   $source           Source ID
  * @param string[] $harvestedRecords Array of records
  *
  * @return void
  */
 protected function processFullRecordSet($source, $harvestedRecords)
 {
     $this->log->log('processFullRecordSet', "[{$source}] Processing complete record set");

     // Key the harvested records by the ID they have in the database
     $pending = [];
     foreach ($harvestedRecords as $rawRecord) {
         $harvested = RecordFactory::createRecord('marc', $rawRecord, '', $source);
         $pending[$source . '.' . $harvested->getID()] = $rawRecord;
     }

     $this->log->log('processFullRecordSet', "[{$source}] Merging results with the records in database");
     $counts = ['added' => 0, 'changed' => 0, 'unchanged' => 0, 'deleted' => 0];
     $query = ['deleted' => false, 'source_id' => $source];
     $existing = $this->db->record->find($query)->timeout($this->cursorTimeout);
     foreach ($existing as $dbRecord) {
         $dbId = $dbRecord['_id'];
         if (isset($pending[$dbId])) {
             // Present in both places: store again only if the content differs
             $incoming = RecordFactory::createRecord('marc', $pending[$dbId], '', $source);
             $serialized = $incoming->serialize();
             $stored = MetadataUtils::getRecordData($dbRecord, false);
             if ($serialized == $stored) {
                 ++$counts['unchanged'];
             } else {
                 $this->storeRecord($dbId, false, $pending[$dbId]);
                 ++$counts['changed'];
             }
         } else {
             // Record not in harvested records, mark deleted
             $this->storeRecord($dbId, true, '');
             ++$counts['deleted'];
         }
         // Handled either way, so it must not be added as a new record below
         unset($pending[$dbId]);
     }

     $this->log->log('processFullRecordSet', "[{$source}] Adding new records");
     foreach ($pending as $newId => $newRecord) {
         $this->storeRecord($newId, false, $newRecord);
         ++$counts['added'];
     }

     $this->log->log(
         'processFullRecordSet',
         "[{$source}] {$counts['added']} new, {$counts['changed']} changed, "
         . "{$counts['unchanged']} unchanged and {$counts['deleted']} deleted records processed"
     );
 }
Ejemplo n.º 2
0
 /**
  * Create Solr array for the given record
  *
  * Builds the metadata record, optionally merges its component parts,
  * converts it to a Solr field array (via XSLT or toSolrArray() + enrich()),
  * adds host/component links, extra fields, mapped values, building and
  * hierarchical facet values, allfields and indexing timestamps, and finally
  * normalizes all values and drops empty fields.
  *
  * @param array   $record           Mongo record
  * @param integer $mergedComponents Running count of component parts merged to
  * records; passed by reference and incremented by the number of parts merged
  * to this record
  *
  * @return array|false Solr field array, or false if the record is a hidden
  * component part and the data source's 'indexMergedParts' setting is off
  * @throws Exception If no settings exist for the record's data source even
  * after reloading the data source settings
  */
 protected function createSolrArray($record, &$mergedComponents)
 {
     global $configArray;

     $metadataRecord = RecordFactory::createRecord(
         $record['format'],
         MetadataUtils::getRecordData($record, true),
         $record['oai_id'],
         $record['source_id']
     );

     $source = $record['source_id'];
     if (!isset($this->settings[$source])) {
         // Try to reload data source settings as they might have been updated
         // during a long run
         $this->loadDatasources();
         if (!isset($this->settings[$source])) {
             $this->log->log(
                 'createSolrArray',
                 "No settings found for data source '{$source}', record "
                 . "{$record['_id']}: " . $this->prettyPrint($record, true),
                 Logger::FATAL
             );
             throw new Exception("No settings found for data source '{$source}'");
         }
     }
     $settings = $this->settings[$source];

     // Decide whether this component part is hidden (merged to its host)
     $hiddenComponent = false;
     if (isset($record['host_record_id'])) {
         if ($settings['componentParts'] == 'merge_all') {
             $hiddenComponent = true;
         } elseif ($settings['componentParts'] == 'merge_non_articles'
             || $settings['componentParts'] == 'merge_non_earticles'
         ) {
             $format = $metadataRecord->getFormat();
             if (!in_array($format, $this->allArticleFormats)) {
                 $hiddenComponent = true;
             } elseif (in_array($format, $this->articleFormats)) {
                 $hiddenComponent = true;
             }
         }
     }

     if ($hiddenComponent && !$settings['indexMergedParts']) {
         return false;
     }

     $hasComponentParts = false;
     $components = null;
     if (!isset($record['host_record_id'])) {
         // Fetch info whether component parts exist and need to be merged
         if (!$record['linking_id']) {
             $this->log->log(
                 'createSolrArray',
                 "linking_id missing for record '{$record['_id']}'",
                 Logger::ERROR
             );
         } else {
             $components = $this->db->record->find(
                 [
                     'source_id' => $record['source_id'],
                     'host_record_id' => $record['linking_id'],
                     'deleted' => false
                 ]
             )->timeout($this->cursorTimeout);
             $hasComponentParts = $components->hasNext();
             $format = $metadataRecord->getFormat();
             $merge = false;
             if ($settings['componentParts'] == 'merge_all') {
                 $merge = true;
             } elseif (!in_array($format, $this->allJournalFormats)) {
                 $merge = true;
             } elseif (in_array($format, $this->journalFormats)
                 && $settings['componentParts'] == 'merge_non_earticles'
             ) {
                 $merge = true;
             }
             if (!$merge) {
                 // Parts exist but are not merged; isset() below must fail
                 unset($components);
             }
         }
     }

     if (isset($components)) {
         $mergedComponents += $metadataRecord->mergeComponentParts($components);
     }

     if (isset($settings['solrTransformationXSLT'])) {
         $params = [
             'source_id' => $source,
             'institution' => $settings['institution'],
             'format' => $settings['format'],
             'id_prefix' => $settings['idPrefix']
         ];
         $data = $settings['solrTransformationXSLT']
             ->transformToSolrArray($metadataRecord->toXML(), $params);
     } else {
         $prependTitleWithSubtitle = isset($settings['prepend_title_with_subtitle'])
             ? $settings['prepend_title_with_subtitle']
             : true;
         $data = $metadataRecord->toSolrArray($prependTitleWithSubtitle);
         $this->enrich($source, $settings, $metadataRecord, $data);
     }

     $data['id'] = $record['_id'];

     // Record links between host records and component parts
     if ($metadataRecord->getIsComponentPart()) {
         $hostRecord = null;
         if (isset($record['host_record_id']) && $this->db) {
             $hostRecord = $this->db->record->find(
                 [
                     'source_id' => $record['source_id'],
                     'linking_id' => $record['host_record_id']
                 ]
             )->limit(-1)->timeout($this->cursorTimeout)->getNext();
         }
         if (!$hostRecord) {
             if (isset($record['host_record_id'])) {
                 $this->log->log(
                     'createSolrArray',
                     "Host record '" . $record['host_record_id']
                     . "' not found for record '" . $record['_id'] . "'",
                     Logger::WARNING
                 );
             }
             $data['container_title'] = $metadataRecord->getContainerTitle();
         } else {
             $data['hierarchy_parent_id'] = $hostRecord['_id'];
             $hostMetadataRecord = RecordFactory::createRecord(
                 $hostRecord['format'],
                 MetadataUtils::getRecordData($hostRecord, true),
                 $hostRecord['oai_id'],
                 $hostRecord['source_id']
             );
             $data['container_title'] = $data['hierarchy_parent_title']
                 = $hostMetadataRecord->getTitle();
         }
         $data['container_volume'] = $metadataRecord->getVolume();
         $data['container_issue'] = $metadataRecord->getIssue();
         $data['container_start_page'] = $metadataRecord->getStartPage();
         $data['container_reference'] = $metadataRecord->getContainerReference();
     } else {
         // Add prefixes to hierarchy linking fields
         foreach (['hierarchy_top_id', 'hierarchy_parent_id', 'is_hierarchy_id'] as $field) {
             if (isset($data[$field]) && $data[$field]) {
                 $data[$field] = $record['source_id'] . '.' . $data[$field];
             }
         }
     }

     if ($hasComponentParts) {
         $data['is_hierarchy_id'] = $record['_id'];
         $data['is_hierarchy_title'] = $metadataRecord->getTitle();
     }

     if (!isset($data['institution'])) {
         $data['institution'] = $settings['institution'];
     }

     // Each extra field is a single-element array of field name => value
     foreach ($settings['extraFields'] as $extraField) {
         $fieldName = key($extraField);
         $fieldValue = current($extraField);
         if (isset($data[$fieldName])) {
             if (!is_array($data[$fieldName])) {
                 $data[$fieldName] = [$data[$fieldName]];
             }
             $data[$fieldName][] = $fieldValue;
         } else {
             $data[$fieldName] = $fieldValue;
         }
     }

     // Map field values according to any mapping files
     foreach ($settings['mappingFiles'] as $field => $map) {
         if (isset($data[$field]) && !empty($data[$field])) {
             if (is_array($data[$field])) {
                 // The mapping of the last value that has one (or falls back
                 // to '##default') wins
                 $newValues = null;
                 foreach ($data[$field] as $value) {
                     if (isset($map[$value])) {
                         $newValues = $map[$value];
                     } elseif (isset($map['##default'])) {
                         $newValues = $map['##default'];
                     }
                 }
                 if (null !== $newValues) {
                     if (is_array($newValues)) {
                         $data[$field] = array_values(array_unique($newValues));
                     } else {
                         $data[$field] = $newValues;
                     }
                 }
             } else {
                 if (isset($map[$data[$field]])) {
                     $data[$field] = $map[$data[$field]];
                 } elseif (isset($map['##default'])) {
                     $data[$field] = $map['##default'];
                 }
             }
         } elseif (isset($map['##empty'])) {
             $data[$field] = $map['##empty'];
         } elseif (isset($map['##emptyarray'])) {
             $data[$field] = [$map['##emptyarray']];
         }
     }

     // Special case: Special values for building (institution/location).
     // Used by default if building is set as a hierarchical facet.
     if ($this->buildingHierarchy || isset($settings['institutionInBuilding'])) {
         $useInstitution = isset($settings['institutionInBuilding'])
             ? $settings['institutionInBuilding']
             : 'institution';
         switch ($useInstitution) {
             case 'driver':
                 $institutionCode = $data['institution'];
                 break;
             case 'none':
                 $institutionCode = '';
                 break;
             case 'source':
                 $institutionCode = $source;
                 break;
             case 'institution/source':
                 $institutionCode = $settings['institution'] . '/' . $source;
                 break;
             default:
                 $institutionCode = $settings['institution'];
                 break;
         }
         if ($institutionCode) {
             if (isset($data['building']) && $data['building']) {
                 if (is_array($data['building'])) {
                     foreach ($data['building'] as &$building) {
                         // Allow also empty values that might result from
                         // mapping tables
                         if ($building !== '') {
                             $building = "{$institutionCode}/{$building}";
                         }
                     }
                     // Break the reference so later code cannot write through it
                     unset($building);
                 } else {
                     $data['building'] = $institutionCode . '/' . $data['building'];
                 }
             } else {
                 $data['building'] = [$institutionCode];
             }
         }
     }

     // Hierarchical facets: "a/b" becomes "0/a/" and "1/a/b/"
     if (isset($configArray['Solr']['hierarchical_facets'])) {
         foreach ($configArray['Solr']['hierarchical_facets'] as $facet) {
             if (!isset($data[$facet])) {
                 continue;
             }
             $array = [];
             if (!is_array($data[$facet])) {
                 $data[$facet] = [$data[$facet]];
             }
             foreach ($data[$facet] as $datavalue) {
                 if ($datavalue === '') {
                     continue;
                 }
                 $hierarchyString = '';
                 foreach (explode('/', $datavalue) as $level => $levelValue) {
                     $hierarchyString .= '/' . $levelValue;
                     $array[] = $level . $hierarchyString . '/';
                 }
             }
             $data[$facet] = $array;
         }
     }

     if (!isset($data['allfields'])) {
         $all = [];
         $excluded = ['fullrecord', 'thumbnail', 'id', 'recordtype', 'ctrlnum'];
         foreach ($data as $key => $field) {
             if (in_array($key, $excluded, true)) {
                 continue;
             }
             if (is_array($field)) {
                 $all = array_merge($all, $field);
             } else {
                 $all[] = $field;
             }
         }
         $data['allfields'] = MetadataUtils::array_iunique($all);
     }

     $data['first_indexed'] = MetadataUtils::formatTimestamp($record['created']->sec);
     $data['last_indexed'] = MetadataUtils::formatTimestamp($record['date']->sec);
     $data['recordtype'] = $record['format'];
     if (!isset($data['fullrecord'])) {
         $data['fullrecord'] = $metadataRecord->toXML();
     }
     if (!is_array($data['format'])) {
         $data['format'] = [$data['format']];
     }

     if (isset($configArray['Solr']['format_in_allfields'])
         && $configArray['Solr']['format_in_allfields']
     ) {
         // A pre-existing allfields value is not guaranteed to be an array,
         // and [] cannot be applied to a string
         if (!is_array($data['allfields'])) {
             $data['allfields'] = [$data['allfields']];
         }
         foreach ($data['format'] as $format) {
             // Replace numbers since they may be be considered word boundaries
             $data['allfields'][] = str_replace(
                 ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
                 ['ax', 'bx', 'cx', 'dx', 'ex', 'fx', 'gx', 'hx', 'ix', 'jx'],
                 MetadataUtils::normalize($format)
             );
         }
     }

     if ($hiddenComponent) {
         $data['hidden_component_boolean'] = true;
     }

     // Normalize all values and drop empty ones. empty() already covers
     // null, '', '0', 0, 0.0, false and [].
     foreach ($data as $key => &$values) {
         if (is_array($values)) {
             // The inner key needs its own variable: reusing $key here would
             // make the unset($data[$key]) below target the wrong field
             foreach ($values as $valueKey => &$value) {
                 $value = MetadataUtils::normalizeUnicode($value);
                 if (empty($value)) {
                     unset($values[$valueKey]);
                 }
             }
             unset($value);
             $values = array_values(array_unique($values));
         } elseif ($key !== 'fullrecord') {
             $values = MetadataUtils::normalizeUnicode($values);
         }
         if (empty($values)) {
             unset($data[$key]);
         }
     }
     unset($values);

     return $data;
 }
Ejemplo n.º 3
0
 /**
  * Deduplicate component parts of a record
  *
  * Only the component parts of other records that share the host record's
  * dedup id (and come from a different source) are considered. The sorted
  * component part lists are compared pairwise by position; the first other
  * record whose parts all match has its parts marked as duplicates of the
  * host's parts, after which the search stops.
  *
  * @param array $hostRecord Mongo record for the host record
  *
  * @return integer Number of component parts deduplicated
  */
 protected function dedupComponentParts($hostRecord)
 {
     if ($this->verbose) {
         echo "Deduplicating component parts\n";
     }
     if (!$hostRecord['linking_id']) {
         $this->log->log(
             'dedupComponentParts',
             'Linking ID missing from record ' . $hostRecord['_id'],
             Logger::ERROR
         );
         return 0;
     }

     $hostParts = $this->getComponentPartsSorted($hostRecord['source_id'], $hostRecord['linking_id']);
     $hostPartCount = count($hostParts);

     // Go through all other records with same dedup id and see if their
     // component parts match
     $marked = 0;
     $candidates = $this->db->record
         ->find(['dedup_id' => $hostRecord['dedup_id'], 'deleted' => false])
         ->timeout($this->cursorTimeout);
     foreach ($candidates as $candidate) {
         if ($candidate['source_id'] == $hostRecord['source_id']) {
             continue;
         }
         $candidateParts = $this->getComponentPartsSorted($candidate['source_id'], $candidate['linking_id']);

         // Differing part counts can never match; otherwise compare in order
         $allMatch = $hostPartCount == count($candidateParts);
         if ($allMatch) {
             $position = 0;
             foreach ($hostParts as $hostPart) {
                 $candidatePart = $candidateParts[$position++];
                 if ($this->verbose) {
                     echo "Comparing {$hostPart['_id']} with {$candidatePart['_id']}\n";
                     echo 'Original ' . $hostPart['_id'] . ":\n"
                         . MetadataUtils::getRecordData($hostPart, true) . "\n";
                 }
                 $hostPartMetadata = RecordFactory::createRecord(
                     $hostPart['format'],
                     MetadataUtils::getRecordData($hostPart, true),
                     $hostPart['oai_id'],
                     $hostPart['source_id']
                 );
                 if (!$this->matchRecords($hostPart, $hostPartMetadata, $candidatePart)) {
                     $allMatch = false;
                     break;
                 }
             }
         }

         if (!$allMatch) {
             if ($this->verbose) {
                 echo microtime(true) . " Not all component parts match between "
                     . "{$hostRecord['_id']} and {$candidate['_id']}\n";
             }
             continue;
         }

         if ($this->verbose) {
             echo microtime(true) . " All component parts match between "
                 . "{$hostRecord['_id']} and {$candidate['_id']}\n";
         }
         $position = 0;
         foreach ($hostParts as $hostPart) {
             $this->markDuplicates($hostPart, $candidateParts[$position++]);
             ++$marked;
         }
         // A complete matching set was found, no need to look further
         break;
     }
     return $marked;
 }
Ejemplo n.º 4
0
 /**
  * Merge component parts to this record
  *
  * For every component part a 979 field is built with these subfields:
  * a = part ID, b = title (245), c = first main author (100/110),
  * d = remaining main authors and additional authors (700/710),
  * e = uniform title (240, falling back to 130), f = first duration (306),
  * g = additional titles (740), h = languages (008/35-37, 041 a and d),
  * i = original languages (041 h), j = subtitle languages (041 j).
  * The fields are sorted by the ID sort key of the part and replace any
  * existing 979 fields of this record.
  *
  * @param iterable $componentParts Component part records to be merged; each
  * one is read with MetadataUtils::getRecordData() and must have an '_id'
  *
  * @return int Count of records merged
  */
 public function mergeComponentParts($componentParts)
 {
     // Language codes must be three word characters and not "no linguistic
     // content" (zxx) or "undetermined" (und)
     $isUsableLanguage = function ($code) {
         return preg_match('/^\\w{3}$/', $code) && $code != 'zxx' && $code != 'und';
     };

     $merged = 0;
     $fieldsBySortKey = [];
     foreach ($componentParts as $componentPart) {
         $marc = new MARCRecord(
             MetadataUtils::getRecordData($componentPart, true),
             '',
             $this->source,
             $this->idPrefix
         );

         $title = $marc->getFieldSubfields('245', ['a' => 1, 'b' => 1, 'n' => 1, 'p' => 1]);
         $uniTitle = $marc->getFieldSubfields('240', ['a' => 1, 'n' => 1, 'p' => 1]);
         if (!$uniTitle) {
             $uniTitle = $marc->getFieldSubfields('130', ['a' => 1, 'n' => 1, 'p' => 1]);
         }
         $additionalTitles = $marc->getFieldsSubfields(
             [[MarcRecord::GET_NORMAL, '740', ['a' => 1]]]
         );
         $authors = $marc->getFieldsSubfields(
             [
                 [MarcRecord::GET_NORMAL, '100', ['a' => 1, 'e' => 1]],
                 [MarcRecord::GET_NORMAL, '110', ['a' => 1, 'e' => 1]]
             ]
         );
         $additionalAuthors = $marc->getFieldsSubfields(
             [
                 [MarcRecord::GET_NORMAL, '700', ['a' => 1, 'e' => 1]],
                 [MarcRecord::GET_NORMAL, '710', ['a' => 1, 'e' => 1]]
             ]
         );
         $duration = $marc->getFieldsSubfields(
             [[MarcRecord::GET_NORMAL, '306', ['a' => 1]]]
         );
         $languages = array_unique(
             array_merge(
                 [substr($marc->getField('008'), 35, 3)],
                 $marc->getFieldsSubfields(
                     [
                         [MarcRecord::GET_NORMAL, '041', ['a' => 1]],
                         [MarcRecord::GET_NORMAL, '041', ['d' => 1]]
                     ],
                     false,
                     true,
                     true
                 )
             )
         );
         $originalLanguages = $marc->getFieldsSubfields(
             [[MarcRecord::GET_NORMAL, '041', ['h' => 1]]],
             false,
             true,
             true
         );
         $subtitleLanguages = $marc->getFieldsSubfields(
             [[MarcRecord::GET_NORMAL, '041', ['j' => 1]]],
             false,
             true,
             true
         );

         // Collect the subfields in their final order
         $partId = $componentPart['_id'];
         $subfields = [['a' => $partId]];
         if ($title) {
             $subfields[] = ['b' => $title];
         }
         if ($authors) {
             $subfields[] = ['c' => array_shift($authors)];
             foreach ($authors as $otherAuthor) {
                 $subfields[] = ['d' => $otherAuthor];
             }
         }
         foreach ($additionalAuthors as $otherAuthor) {
             $subfields[] = ['d' => $otherAuthor];
         }
         if ($uniTitle) {
             $subfields[] = ['e' => $uniTitle];
         }
         if ($duration) {
             $subfields[] = ['f' => reset($duration)];
         }
         foreach ($additionalTitles as $otherTitle) {
             $subfields[] = ['g' => $otherTitle];
         }
         $languageGroups = [
             'h' => $languages,
             'i' => $originalLanguages,
             'j' => $subtitleLanguages
         ];
         foreach ($languageGroups as $subfieldCode => $codes) {
             foreach ($codes as $code) {
                 if ($isUsableLanguage($code)) {
                     $subfields[] = [$subfieldCode => $code];
                 }
             }
         }

         // The running count keeps the array keys unique
         $sortKey = MetadataUtils::createIdSortKey($partId);
         $fieldsBySortKey["{$sortKey} {$merged}"] = [
             'i1' => ' ',
             'i2' => ' ',
             's' => $subfields
         ];
         ++$merged;
     }

     ksort($fieldsBySortKey);
     $this->fields['979'] = array_values($fieldsBySortKey);
     return $merged;
 }