This is a factory class to build records for accessing metadata.
Author: Ere Maijala (ere.maijala@helsinki.fi)
 /**
  * Process a sample record
  *
  * @param string $sample Sample record file
  *
  * @return array SOLR record array
  */
 protected function processSample($sample)
 {
     // Loads the named sample file, builds a record for it with the test's
     // driver and returns the record's Solr array.
     //
     // Sample files are read from the "samples" directory that sits next to
     // this file's directory.
     $path = __DIR__ . "/../samples/" . $sample;
     $contents = file_get_contents($path);
     if ($contents === false) {
         // Fail with a clear message instead of handing `false` to the
         // record factory and getting an obscure error further down.
         throw new \Exception("Could not read sample record file '{$path}'");
     }
     // Fixed placeholder values are used for the record ID and source.
     $record = RecordFactory::createRecord(
         $this->driver,
         $contents,
         "__unit_test_no_id__",
         "__unit_test_no_source__"
     );
     return $record->toSolrArray();
 }
Beispiel #2
0
 /**
  * Prepare the prc file.
  *
  * @return Prc The file, ready to be saved or downloaded
  * @throws Exception If no source data has been set
  */
 private function preparePRC()
 {
     // Without source data there is nothing to build from.
     if (false === $this->source) {
         throw new Exception('No data set');
     }
     // Reuse the file if it has already been built.
     if (false !== $this->prc) {
         return $this->prc;
     }

     $content = $this->source;
     $contentLength = strlen($content);
     $settings = new Settings($this->optional);
     $factory = new RecordFactory($settings);

     // Build the data records, then put the header record (which is given
     // the record count, content length and image count) in front of them.
     $records = $factory->createRecords($content);
     $header = new PalmRecord(
         $settings,
         $records,
         count($records),
         $contentLength,
         count($this->images)
     );
     array_unshift($records, $header);

     // Images follow the data records; FLIS, FCIS and EOF records close the
     // list, in that order.
     $records = array_merge($records, $this->images);
     array_push(
         $records,
         $factory->createFLISRecord(),
         $factory->createFCISRecord($contentLength),
         $factory->createEOFRecord()
     );

     $this->prc = new Prc($settings, $records);
     return $this->prc;
 }
 /**
  * Creates a preview of the given metadata and returns it
  *
  * @param string $metadata The metadata to process
  * @param string $format   Metadata format
  * @param string $source   Source identifier
  *
  * @return array Solr record fields
  */
 public function preview($metadata, $format, $source)
 {
     // Builds a temporary record from the given metadata, runs it through the
     // data source's optional pre-transformation and normalization
     // stylesheets and returns the result of createSolrArray() for it.
     if (!$source) {
         $source = "_preview";
     }
     // Guard the lookup: the fallback "_preview" source (or an unknown
     // source) may have no settings entry at all.
     $settings = isset($this->settings[$source]) ? $this->settings[$source] : [];

     /* Process data source preTransformation XSL if present
        TODO: duplicates code from RecordManager, refactor? */
     if (!empty($settings['preTransformation'])) {
         $style = new DOMDocument();
         $style->load($this->basePath . '/transformations/' . $settings['preTransformation']);
         $xslt = new XSLTProcessor();
         $xslt->importStylesheet($style);
         $xslt->setParameter('', 'source_id', $source);
         $xslt->setParameter('', 'institution', $settings['institution']);
         $xslt->setParameter('', 'format', $format);
         // Fall back to the source ID when no ID prefix is configured
         $idPrefix = !empty($settings['idPrefix']) ? $settings['idPrefix'] : $source;
         $xslt->setParameter('', 'id_prefix', $idPrefix);
         $doc = new DOMDocument();
         $doc->loadXML($metadata);
         $metadata = $xslt->transformToXml($doc);
     }

     $record = [
         'format' => $format,
         'original_data' => $metadata,
         'normalized_data' => $metadata,
         'source_id' => $source,
         'linking_id' => '',
         'oai_id' => '_preview',
         '_id' => '_preview',
         'created' => new MongoDate(),
         'date' => new MongoDate(),
     ];

     // Normalize the record
     $hasNormalization = !empty($settings['normalization']);
     // NOTE(review): the previous expression
     // `isset(...) && $settings['normalization'] ?: null` binds as
     // `(isset(...) && ...) ?: null`, so this property has always received
     // `true` or `null`, never a transformation object. The value is kept
     // as it was; confirm what readers of the property expect.
     $this->normalizationXSLT = $hasNormalization ? true : null;
     if ($hasNormalization) {
         // Only transform when a stylesheet name is actually configured; a
         // set-but-empty value used to trigger a transformation with an
         // empty file name.
         $basePath = dirname(__DIR__);
         $params = ['source_id' => $source, 'institution' => 'Preview', 'format' => $format, 'id_prefix' => ''];
         $normalizationXSLT = new XslTransformation($basePath . '/transformations', $settings['normalization'], $params);
         $record['normalized_data'] = $normalizationXSLT->transform($metadata, ['oai_id' => $record['oai_id']]);
     }

     $metadataRecord = RecordFactory::createRecord($record['format'], $record['normalized_data'], $record['oai_id'], $record['source_id']);
     $metadataRecord->normalize();
     $record['normalized_data'] = $metadataRecord->serialize();
     $record['_id'] = $source . '.' . $metadataRecord->getID();

     // createSolrArray() takes the merged component count by reference and
     // adds to it, so give it a defined starting value.
     $componentParts = 0;
     return $this->createSolrArray($record, $componentParts);
 }
 /**
  * Process a complete record set harvested e.g. from MetaLib
  *
  * @param string   $source           Source ID
  * @param string[] $harvestedRecords Array of records
  *
  * @return void
  */
 protected function processFullRecordSet($source, $harvestedRecords)
 {
     $this->log->log('processFullRecordSet', "[{$source}] Processing complete record set");

     // Index the harvested records by "<source>.<record id>", which is the
     // form the stored records' _id values are compared against below.
     $harvestedById = [];
     foreach ($harvestedRecords as $harvested) {
         $recordId = RecordFactory::createRecord('marc', $harvested, '', $source)->getID();
         $harvestedById["{$source}.{$recordId}"] = $harvested;
     }

     $this->log->log('processFullRecordSet', "[{$source}] Merging results with the records in database");
     $deleted = 0;
     $unchanged = 0;
     $changed = 0;
     $added = 0;
     $storedRecords = $this->db->record
         ->find(['deleted' => false, 'source_id' => $source])
         ->timeout($this->cursorTimeout);
     foreach ($storedRecords as $stored) {
         $id = $stored['_id'];
         $wasHarvested = isset($harvestedById[$id]);
         $incoming = $wasHarvested ? $harvestedById[$id] : null;
         // Whatever is still left in the map after this loop is new
         unset($harvestedById[$id]);

         if (!$wasHarvested) {
             // Stored record is missing from the harvested set: mark deleted
             $this->storeRecord($id, true, '');
             ++$deleted;
             continue;
         }

         // Compare the serialized harvested record with the stored data
         $marc = RecordFactory::createRecord('marc', $incoming, '', $source);
         if ($marc->serialize() != MetadataUtils::getRecordData($stored, false)) {
             $this->storeRecord($id, false, $incoming);
             ++$changed;
         } else {
             ++$unchanged;
         }
     }

     $this->log->log('processFullRecordSet', "[{$source}] Adding new records");
     foreach ($harvestedById as $id => $newRecord) {
         $this->storeRecord($id, false, $newRecord);
         ++$added;
     }

     $this->log->log('processFullRecordSet', "[{$source}] {$added} new, {$changed} changed, {$unchanged} unchanged and " . "{$deleted} deleted records processed");
 }
    /**
     * Create record XML
     *
     * @param array   $record          Mongo record
     * @param string  $format          Metadata format
     * @param boolean $includeMetadata Whether to include record data
     * (or only header)
     *
     * @return boolean|string
     */
    protected function createRecord($record, $format, $includeMetadata)
    {
        global $basePath;
        $sourceFormat = $record['format'];
        if (isset($this->formats[$format])) {
            $format = $this->formats[$format]['format'];
        }
        $metadata = '';
        if ($includeMetadata) {
            $mongodata = $record['normalized_data'] ? $record['normalized_data'] : $record['original_data'];
            $metadataRecord = RecordFactory::createRecord($record['format'], gzinflate($mongodata->bin), $record['oai_id'], $record['source_id']);
            $metadata = $metadataRecord->toXML();
            $key = "transformation_to_{$format}";
            $source = $record['source_id'];
            $datasource = $this->dataSourceSettings[$source];
            if ($sourceFormat != $format || isset($datasource[$key])) {
                if (!isset($datasource[$key])) {
                    $this->error('cannotDisseminateFormat', '');
                    return false;
                }
                $transformationKey = "{$key}_{$source}";
                if (!isset($this->transformations[$transformationKey])) {
                    $this->transformations[$transformationKey] = new XslTransformation($basePath . '/transformations', $datasource[$key]);
                }
                $params = ['source_id' => $source, 'institution' => $datasource['institution'], 'format' => $record['format']];
                $metadata = $this->transformations[$transformationKey]->transform($metadata, $params);
            }
            if (strncmp($metadata, '<?xml', 5) == 0) {
                $end = strpos($metadata, '>');
                $metadata = substr($metadata, $end + 1);
            }
            $metadata = <<<EOF
      <metadata>
        {$metadata}
      </metadata>

EOF;
        }
        $setSpecs = '';
        foreach ($this->getRecordSets($record) as $id) {
            $id = $this->escape($id);
            $setSpecs .= <<<EOF
        <setSpec>{$id}</setSpec>

EOF;
        }
        $id = $this->escape($record['oai_id']);
        $date = $this->toOaiDate($record['updated']->sec);
        $status = $record['deleted'] ? ' status="deleted"' : '';
        return <<<EOF
    <record>
      <header{$status}>
        <identifier>{$id}</identifier>
        <datestamp>{$date}</datestamp>
{$setSpecs}      </header>
{$metadata}    </record>

EOF;
    }
Beispiel #6
0
 /**
  * Deduplicate component parts of a record
  *
  * Component part deduplication is special. It will only go through
  * component parts of other records deduplicated with the host record
  * and stops when it finds a set of component parts that match.
  *
  * @param array $hostRecord Mongo record for the host record
  *
  * @return integer Number of component parts deduplicated
  */
 protected function dedupComponentParts($hostRecord)
 {
     if ($this->verbose) {
         echo "Deduplicating component parts\n";
     }
     if (!$hostRecord['linking_id']) {
         $this->log->log('dedupComponentParts', 'Linking ID missing from record ' . $hostRecord['_id'], Logger::ERROR);
         return 0;
     }

     $hostParts = $this->getComponentPartsSorted($hostRecord['source_id'], $hostRecord['linking_id']);
     $hostPartCount = count($hostParts);

     // Walk through the other records sharing the host's dedup id and stop at
     // the first one whose component parts all match the host's.
     $marked = 0;
     $candidates = $this->db->record
         ->find(['dedup_id' => $hostRecord['dedup_id'], 'deleted' => false])
         ->timeout($this->cursorTimeout);
     foreach ($candidates as $candidate) {
         // Records from the host's own source are not considered
         if ($candidate['source_id'] == $hostRecord['source_id']) {
             continue;
         }
         $candidateParts = $this->getComponentPartsSorted($candidate['source_id'], $candidate['linking_id']);

         // A different number of parts can never be a full match; otherwise
         // compare the parts pairwise by position.
         $allMatch = $hostPartCount == count($candidateParts);
         if ($allMatch) {
             $position = 0;
             foreach ($hostParts as $hostPart) {
                 $candidatePart = $candidateParts[$position++];
                 if ($this->verbose) {
                     echo "Comparing {$hostPart['_id']} with " . "{$candidatePart['_id']}\n";
                     echo 'Original ' . $hostPart['_id'] . ":\n" . MetadataUtils::getRecordData($hostPart, true) . "\n";
                 }
                 $hostPartMetadata = RecordFactory::createRecord(
                     $hostPart['format'],
                     MetadataUtils::getRecordData($hostPart, true),
                     $hostPart['oai_id'],
                     $hostPart['source_id']
                 );
                 if (!$this->matchRecords($hostPart, $hostPartMetadata, $candidatePart)) {
                     $allMatch = false;
                     break;
                 }
             }
         }

         if (!$allMatch) {
             if ($this->verbose) {
                 echo microtime(true) . " Not all component parts match between " . "{$hostRecord['_id']} and {$candidate['_id']}\n";
             }
             continue;
         }

         if ($this->verbose) {
             echo microtime(true) . " All component parts match between " . "{$hostRecord['_id']} and {$candidate['_id']}\n";
         }
         // Mark each pair of parts as duplicates and stop searching
         $position = 0;
         foreach ($hostParts as $hostPart) {
             $this->markDuplicates($hostPart, $candidateParts[$position++]);
             ++$marked;
         }
         break;
     }
     return $marked;
 }
Beispiel #7
0
 /**
  * Create Solr array for the given record
  *
  * @param array   $record           Mongo record
  * @param integer $mergedComponents Number of component parts merged to the
  * record
  *
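  * @return array|false Solr fields keyed by field name, or false when the
  * record is a hidden component part and merged parts are not indexed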
  * @throws Exception
  */
 protected function createSolrArray($record, &$mergedComponents)
 {
     // Builds the Solr field array for a stored record: converts the record,
     // merges component parts where configured, adds host/component links,
     // extra fields, mapped values, building and hierarchical facet values,
     // and finally normalizes the values and drops empty fields.
     //
     // Returns false when the record is a hidden component part and the data
     // source does not index merged parts. $mergedComponents is increased by
     // the number of component parts merged into this record.
     global $configArray;
     $metadataRecord = RecordFactory::createRecord($record['format'], MetadataUtils::getRecordData($record, true), $record['oai_id'], $record['source_id']);
     $source = $record['source_id'];
     if (!isset($this->settings[$source])) {
         // Try to reload data source settings as they might have been updated
         // during a long run
         $this->loadDatasources();
         if (!isset($this->settings[$source])) {
             $this->log->log('createSolrArray', "No settings found for data source '{$source}', record " . "{$record['_id']}: " . $this->prettyPrint($record, true), Logger::FATAL);
             throw new Exception("No settings found for data source '{$source}'");
         }
     }
     $settings = $this->settings[$source];

     // A record with a host_record_id is a component part; depending on the
     // componentParts setting and its format it is flagged as hidden.
     // NOTE(review): the second format check below is not limited to
     // 'merge_non_earticles', unlike the corresponding host-side merge rule
     // further down - confirm this is intended.
     $hiddenComponent = false;
     if (isset($record['host_record_id'])) {
         if ($settings['componentParts'] == 'merge_all') {
             $hiddenComponent = true;
         } elseif ($settings['componentParts'] == 'merge_non_articles' || $settings['componentParts'] == 'merge_non_earticles') {
             $format = $metadataRecord->getFormat();
             if (!in_array($format, $this->allArticleFormats)) {
                 $hiddenComponent = true;
             } elseif (in_array($format, $this->articleFormats)) {
                 $hiddenComponent = true;
             }
         }
     }
     if ($hiddenComponent && !$settings['indexMergedParts']) {
         return false;
     }

     $hasComponentParts = false;
     $components = null;
     if (!isset($record['host_record_id'])) {
         // Fetch info whether component parts exist and need to be merged
         if (!$record['linking_id']) {
             $this->log->log('createSolrArray', "linking_id missing for record '{$record['_id']}'", Logger::ERROR);
         } else {
             $components = $this->db->record->find(['source_id' => $record['source_id'], 'host_record_id' => $record['linking_id'], 'deleted' => false])->timeout($this->cursorTimeout);
             $hasComponentParts = $components->hasNext();
             $format = $metadataRecord->getFormat();
             $merge = false;
             if ($settings['componentParts'] == 'merge_all') {
                 $merge = true;
             } elseif (!in_array($format, $this->allJournalFormats)) {
                 $merge = true;
             } elseif (in_array($format, $this->journalFormats) && $settings['componentParts'] == 'merge_non_earticles') {
                 $merge = true;
             }
             if (!$merge) {
                 // Unsetting makes the isset() check below skip the merge
                 unset($components);
             }
         }
     }
     if (isset($components)) {
         $mergedComponents += $metadataRecord->mergeComponentParts($components);
     }

     // Convert to Solr fields, either with a configured XSLT or with the
     // record driver followed by enrichment
     if (isset($settings['solrTransformationXSLT'])) {
         $params = ['source_id' => $source, 'institution' => $settings['institution'], 'format' => $settings['format'], 'id_prefix' => $settings['idPrefix']];
         $data = $settings['solrTransformationXSLT']->transformToSolrArray($metadataRecord->toXML(), $params);
     } else {
         $prependTitleWithSubtitle = isset($settings['prepend_title_with_subtitle']) ? $settings['prepend_title_with_subtitle'] : true;
         $data = $metadataRecord->toSolrArray($prependTitleWithSubtitle);
         $this->enrich($source, $settings, $metadataRecord, $data);
     }
     $data['id'] = $record['_id'];

     // Record links between host records and component parts
     if ($metadataRecord->getIsComponentPart()) {
         $hostRecord = null;
         if (isset($record['host_record_id']) && $this->db) {
             $hostRecord = $this->db->record->find(['source_id' => $record['source_id'], 'linking_id' => $record['host_record_id']])->limit(-1)->timeout($this->cursorTimeout)->getNext();
         }
         if (!$hostRecord) {
             if (isset($record['host_record_id'])) {
                 $this->log->log('createSolrArray', "Host record '" . $record['host_record_id'] . "' not found for record '" . $record['_id'] . "'", Logger::WARNING);
             }
             $data['container_title'] = $metadataRecord->getContainerTitle();
         } else {
             $data['hierarchy_parent_id'] = $hostRecord['_id'];
             $hostMetadataRecord = RecordFactory::createRecord($hostRecord['format'], MetadataUtils::getRecordData($hostRecord, true), $hostRecord['oai_id'], $hostRecord['source_id']);
             $data['container_title'] = $data['hierarchy_parent_title'] = $hostMetadataRecord->getTitle();
         }
         $data['container_volume'] = $metadataRecord->getVolume();
         $data['container_issue'] = $metadataRecord->getIssue();
         $data['container_start_page'] = $metadataRecord->getStartPage();
         $data['container_reference'] = $metadataRecord->getContainerReference();
     } else {
         // Add prefixes to hierarchy linking fields
         foreach (['hierarchy_top_id', 'hierarchy_parent_id', 'is_hierarchy_id'] as $field) {
             if (isset($data[$field]) && $data[$field]) {
                 $data[$field] = $record['source_id'] . '.' . $data[$field];
             }
         }
     }
     if ($hasComponentParts) {
         $data['is_hierarchy_id'] = $record['_id'];
         $data['is_hierarchy_title'] = $metadataRecord->getTitle();
     }
     if (!isset($data['institution'])) {
         $data['institution'] = $settings['institution'];
     }

     // Extra fields: each entry is a single-element array of name => value;
     // the value is appended when the field already exists
     foreach ($settings['extraFields'] as $extraField) {
         $fieldName = key($extraField);
         $fieldValue = current($extraField);
         if (isset($data[$fieldName])) {
             if (!is_array($data[$fieldName])) {
                 $data[$fieldName] = [$data[$fieldName]];
             }
             $data[$fieldName][] = $fieldValue;
         } else {
             $data[$fieldName] = $fieldValue;
         }
     }

     // Map field values according to any mapping files
     foreach ($settings['mappingFiles'] as $field => $map) {
         if (isset($data[$field]) && !empty($data[$field])) {
             if (is_array($data[$field])) {
                 // NOTE(review): $newValues is overwritten on every
                 // iteration, so a multi-valued field ends up with the
                 // mapping of its last mappable value only. This looks like
                 // it was meant to collect all mapped values; behavior is
                 // left unchanged here - confirm the intent before changing.
                 $newValues = null;
                 foreach ($data[$field] as $value) {
                     if (isset($map[$value])) {
                         $newValues = $map[$value];
                     } elseif (isset($map['##default'])) {
                         $newValues = $map['##default'];
                     }
                 }
                 if (null !== $newValues) {
                     if (is_array($newValues)) {
                         $data[$field] = array_values(array_unique($newValues));
                     } else {
                         $data[$field] = $newValues;
                     }
                 }
             } else {
                 if (isset($map[$data[$field]])) {
                     $data[$field] = $map[$data[$field]];
                 } elseif (isset($map['##default'])) {
                     $data[$field] = $map['##default'];
                 }
             }
         } elseif (isset($map['##empty'])) {
             $data[$field] = $map['##empty'];
         } elseif (isset($map['##emptyarray'])) {
             $data[$field] = [$map['##emptyarray']];
         }
     }

     // Special case: Special values for building (institution/location).
     // Used by default if building is set as a hierarchical facet.
     if ($this->buildingHierarchy || isset($settings['institutionInBuilding'])) {
         $useInstitution = isset($settings['institutionInBuilding']) ? $settings['institutionInBuilding'] : 'institution';
         switch ($useInstitution) {
             case 'driver':
                 $institutionCode = $data['institution'];
                 break;
             case 'none':
                 $institutionCode = '';
                 break;
             case 'source':
                 $institutionCode = $source;
                 break;
             case 'institution/source':
                 $institutionCode = $settings['institution'] . '/' . $source;
                 break;
             default:
                 $institutionCode = $settings['institution'];
                 break;
         }
         if ($institutionCode) {
             if (isset($data['building']) && $data['building']) {
                 if (is_array($data['building'])) {
                     foreach ($data['building'] as &$building) {
                         // Allow also empty values that might result from
                         // mapping tables
                         if ($building !== '') {
                             $building = "{$institutionCode}/{$building}";
                         }
                     }
                     // Break the reference so later code cannot write into
                     // the last building value by accident
                     unset($building);
                 } else {
                     $data['building'] = $institutionCode . '/' . $data['building'];
                 }
             } else {
                 $data['building'] = [$institutionCode];
             }
         }
     }

     // Hierarchical facets: 'a/b/c' becomes '0/a/', '1/a/b/', '2/a/b/c/'
     if (isset($configArray['Solr']['hierarchical_facets'])) {
         foreach ($configArray['Solr']['hierarchical_facets'] as $facet) {
             if (!isset($data[$facet])) {
                 continue;
             }
             $array = [];
             if (!is_array($data[$facet])) {
                 $data[$facet] = [$data[$facet]];
             }
             foreach ($data[$facet] as $datavalue) {
                 if ($datavalue === '') {
                     continue;
                 }
                 $hierarchyString = '';
                 foreach (explode('/', $datavalue) as $level => $part) {
                     $hierarchyString .= '/' . $part;
                     $array[] = $level . $hierarchyString . '/';
                 }
             }
             $data[$facet] = $array;
         }
     }

     // Build allfields from all other fields unless it was already provided
     if (!isset($data['allfields'])) {
         $all = [];
         foreach ($data as $key => $field) {
             if (in_array($key, ['fullrecord', 'thumbnail', 'id', 'recordtype', 'ctrlnum'])) {
                 continue;
             }
             if (is_array($field)) {
                 $all = array_merge($all, $field);
             } else {
                 $all[] = $field;
             }
         }
         $data['allfields'] = MetadataUtils::array_iunique($all);
     }
     $data['first_indexed'] = MetadataUtils::formatTimestamp($record['created']->sec);
     $data['last_indexed'] = MetadataUtils::formatTimestamp($record['date']->sec);
     $data['recordtype'] = $record['format'];
     if (!isset($data['fullrecord'])) {
         $data['fullrecord'] = $metadataRecord->toXML();
     }
     if (!is_array($data['format'])) {
         $data['format'] = [$data['format']];
     }
     if (isset($configArray['Solr']['format_in_allfields']) && $configArray['Solr']['format_in_allfields']) {
         foreach ($data['format'] as $format) {
             // Replace numbers since they may be be considered word boundaries
             $data['allfields'][] = str_replace(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], ['ax', 'bx', 'cx', 'dx', 'ex', 'fx', 'gx', 'hx', 'ix', 'jx'], MetadataUtils::normalize($format));
         }
     }
     if ($hiddenComponent) {
         $data['hidden_component_boolean'] = true;
     }

     // Normalize all values and drop empty ones (empty() also treats 0, 0.0
     // and '0' as empty, matching the previous explicit checks)
     foreach ($data as $key => &$values) {
         if (is_array($values)) {
             // The inner loop needs its own key variable: reusing $key
             // clobbered the field name, so a field whose values were all
             // removed was never unset from $data below.
             foreach ($values as $valueKey => &$value) {
                 $value = MetadataUtils::normalizeUnicode($value);
                 if (empty($value)) {
                     unset($values[$valueKey]);
                 }
             }
             unset($value);
             $values = array_values(array_unique($values));
         } elseif ($key != 'fullrecord') {
             $values = MetadataUtils::normalizeUnicode($values);
         }
         if (empty($values)) {
             unset($data[$key]);
         }
     }
     unset($values);
     return $data;
 }