예제 #1
0
 /**
  * Dedup: Return unique IDs (control numbers)
  *
  * Identifiers are read from fields 015, 016 and 024. Each one is returned in
  * the form "(source)number", where the number is the first space-delimited
  * token of subfield a after normalization. Entries lacking either a source
  * or a number are left out.
  *
  * @return string[]
  */
 public function getUniqueIDs()
 {
     $ids = [];

     // 015 and 016 are handled alike: number in subfield a, source in
     // subfield 2
     foreach (['015', '016'] as $tag) {
         $field = $this->getField($tag);
         if (!$field) {
             continue;
         }
         $number = MetadataUtils::normalize(
             strtok($this->getSubfield($field, 'a'), ' ')
         );
         $source = $this->getSubfield($field, '2');
         if ($source && $number) {
             $ids[] = '(' . $source . ')' . $number;
         }
     }

     $field = $this->getField('024');
     if (!$field) {
         return $ids;
     }

     $number = MetadataUtils::normalize(
         strtok($this->getSubfield($field, 'a'), ' ')
     );
     // For 024 the source is derived from the first indicator; only
     // indicator 7 takes it from subfield 2.
     // NOTE(review): MARC 21 defines first indicator 0 of field 024 as ISRC;
     // confirm that the 'istc' label is intended before changing it, since
     // the value is part of the generated ID.
     switch ($this->getIndicator($field, 1)) {
         case '0':
             $source = 'istc';
             break;
         case '1':
             $source = 'upc';
             break;
         case '2':
             $source = 'ismn';
             break;
         case '3':
             $source = 'ian';
             break;
         case '4':
             $source = 'sici';
             break;
         case '7':
             $source = $this->getSubfield($field, '2');
             break;
         default:
             $source = '';
     }
     if ($source && $number) {
         $ids[] = '(' . $source . ')' . $number;
     }

     return $ids;
 }
예제 #2
0
 /**
  * Check if records are duplicate matches
  *
  * The candidate is rejected if it has access restrictions or if both its
  * raw and its mapped format differ from the original's. A shared ISBN or a
  * shared unique ID is accepted immediately. Otherwise the records must not
  * conflict on ISSNs, publication year, page count (tolerance of 10), series
  * ISSN or series numbering, and their normalized titles and main authors
  * must be close enough by Levenshtein distance.
  *
  * When $this->verbose is set, the reason for each decision is echoed.
  *
  * @param array  $record     Mongo record
  * @param object $origRecord Metadata record (from $record)
  * @param array  $candidate  Candidate Mongo record
  *
  * @return boolean
  */
 protected function matchRecords($record, $origRecord, $candidate)
 {
     // Fetch the candidate's record data only once; it is needed both for
     // creating the metadata record and for the verbose output
     $candidateData = MetadataUtils::getRecordData($candidate, true);
     $cRecord = RecordFactory::createRecord(
         $candidate['format'],
         $candidateData,
         $candidate['oai_id'],
         $candidate['source_id']
     );
     if ($this->verbose) {
         echo "\nCandidate " . $candidate['_id'] . ":\n" . $candidateData . "\n";
     }
     // Check that the record does not have access restrictions
     if ($cRecord->getAccessRestrictions()) {
         if ($this->verbose) {
             echo "--Candidate has access restrictions\n";
         }
         return false;
     }
     // Check format: a mismatch requires both the raw and the mapped formats
     // to differ
     $origFormat = $origRecord->getFormat();
     $cFormat = $cRecord->getFormat();
     $origMapped = $this->solrUpdater->mapFormat($record['source_id'], $origFormat);
     $cMapped = $this->solrUpdater->mapFormat($candidate['source_id'], $cFormat);
     if ($origFormat != $cFormat && $origMapped != $cMapped) {
         if ($this->verbose) {
             echo "--Format mismatch: {$origFormat} != {$cFormat} " . "and {$origMapped} != {$cMapped}\n";
         }
         return false;
     }
     // Check for common ISBN
     $origISBNs = $origRecord->getISBNs();
     $cISBNs = $cRecord->getISBNs();
     $isect = array_intersect($origISBNs, $cISBNs);
     if (!empty($isect)) {
         // Shared ISBN -> match
         if ($this->verbose) {
             echo "++ISBN match:\n";
             print_r($origISBNs);
             print_r($cISBNs);
             echo $origRecord->getFullTitle() . "\n";
             echo $cRecord->getFullTitle() . "\n";
         }
         return true;
     }
     // Check for other common ID (e.g. NBN)
     $origIDs = $origRecord->getUniqueIDs();
     $cIDs = $cRecord->getUniqueIDs();
     $isect = array_intersect($origIDs, $cIDs);
     if (!empty($isect)) {
         // Shared ID -> match
         if ($this->verbose) {
             echo "++ID match:\n";
             print_r($origIDs);
             print_r($cIDs);
             echo $origRecord->getFullTitle() . "\n";
             echo $cRecord->getFullTitle() . "\n";
         }
         return true;
     }
     $origISSNs = $origRecord->getISSNs();
     $cISSNs = $cRecord->getISSNs();
     $commonISSNs = array_intersect($origISSNs, $cISSNs);
     if (!empty($origISSNs) && !empty($cISSNs) && empty($commonISSNs)) {
         // Both have ISSNs but none match. This is a discard, so it is
         // reported with the "--" marker like the other discards.
         if ($this->verbose) {
             echo "--ISSN mismatch:\n";
             print_r($origISSNs);
             print_r($cISSNs);
             echo $origRecord->getFullTitle() . "\n";
             echo $cRecord->getFullTitle() . "\n";
         }
         return false;
     }
     $origYear = $origRecord->getPublicationYear();
     $cYear = $cRecord->getPublicationYear();
     if ($origYear && $cYear && $origYear != $cYear) {
         if ($this->verbose) {
             echo "--Year mismatch: {$origYear} != {$cYear}\n";
         }
         return false;
     }
     $pages = $origRecord->getPageCount();
     $cPages = $cRecord->getPageCount();
     if ($pages && $cPages && abs($pages - $cPages) > 10) {
         if ($this->verbose) {
             echo "--Pages mismatch ({$pages} != {$cPages})\n";
         }
         return false;
     }
     if ($origRecord->getSeriesISSN() != $cRecord->getSeriesISSN()) {
         if ($this->verbose) {
             echo "--Series ISSN mismatch\n";
         }
         return false;
     }
     if ($origRecord->getSeriesNumbering() != $cRecord->getSeriesNumbering()) {
         if ($this->verbose) {
             echo "--Series numbering mismatch\n";
         }
         return false;
     }
     $origTitle = MetadataUtils::normalize($origRecord->getTitle(true));
     $cTitle = MetadataUtils::normalize($cRecord->getTitle(true));
     if (!$origTitle || !$cTitle) {
         // No title match without title...
         if ($this->verbose) {
             echo "No title - no further matching\n";
         }
         return false;
     }
     // Only the first 255 bytes are compared, so the distance has to be
     // related to the length of the compared part. Dividing by the full title
     // length would understate the difference for titles longer than that.
     $origTitleCmp = substr($origTitle, 0, 255);
     $lev = levenshtein($origTitleCmp, substr($cTitle, 0, 255));
     $lev = $lev / strlen($origTitleCmp) * 100;
     if ($lev >= 10) {
         if ($this->verbose) {
             echo "--Title lev discard: {$lev}\nOriginal:  {$origTitle}\n" . "Candidate: {$cTitle}\n";
         }
         return false;
     }
     $origAuthor = MetadataUtils::normalize($origRecord->getMainAuthor());
     $cAuthor = MetadataUtils::normalize($cRecord->getMainAuthor());
     $authorLev = 0;
     if ($origAuthor || $cAuthor) {
         // An author on only one side is a mismatch
         if (!$origAuthor || !$cAuthor) {
             if ($this->verbose) {
                 echo "\nAuthor discard:\nOriginal:  {$origAuthor}\n" . "Candidate: {$cAuthor}\n";
             }
             return false;
         }
         if (!MetadataUtils::authorMatch($origAuthor, $cAuthor)) {
             // Same as for titles: relate the distance to the compared part
             $origAuthorCmp = substr($origAuthor, 0, 255);
             $authorLev = levenshtein($origAuthorCmp, substr($cAuthor, 0, 255));
             $authorLev = $authorLev / mb_strlen($origAuthorCmp) * 100;
             if ($authorLev > 20) {
                 if ($this->verbose) {
                     echo "\nAuthor lev discard (lev: {$lev}, authorLev: " . "{$authorLev}):\nOriginal:  {$origAuthor}\n" . "Candidate: {$cAuthor}\n";
                 }
                 return false;
             }
         }
     }
     if ($this->verbose) {
         echo "\nTitle match (lev: {$lev}, authorLev: {$authorLev}):\n";
         echo $origRecord->getFullTitle() . "\n";
         echo "   {$origAuthor} - {$origTitle}.\n";
         echo $cRecord->getFullTitle() . "\n";
         echo "   {$cAuthor} - {$cTitle}.\n";
     }
     // We have a match!
     return true;
 }
예제 #3
0
 /**
  * Create a normalized title key for dedup
  *
  * The words of the title (split on single spaces) are concatenated without
  * separators until enough characters have been gathered, and the result is
  * normalized. A title whose normalized form starts with one of
  * MetadataUtils::$fullTitlePrefixes gets a longer key (stop after more than
  * 100 characters); any other title stops after more than three words longer
  * than three characters or more than 35 characters.
  *
  * @param string $title Title
  *
  * @return string
  */
 public static function createTitleKey($title)
 {
     // Decide whether this title needs the long key variant
     $useLongKey = false;
     if (isset(MetadataUtils::$fullTitlePrefixes)) {
         $normalized = MetadataUtils::normalize($title);
         foreach (MetadataUtils::$fullTitlePrefixes as $prefix) {
             if (strncmp($normalized, $prefix, strlen($prefix)) === 0) {
                 $useLongKey = true;
                 break;
             }
         }
     }

     $rawKey = '';
     $charCount = 0;
     $longWordCount = 0;
     foreach (explode(' ', $title) as $word) {
         $rawKey .= $word;
         $length = mb_strlen($word);
         $charCount += $length;
         if ($length > 3) {
             $longWordCount++;
         }
         // Stop once enough significant characters have been collected
         $enough = $useLongKey
             ? $charCount > 100
             : ($longWordCount > 3 || $charCount > 35);
         if ($enough) {
             break;
         }
     }

     return MetadataUtils::normalize($rawKey);
 }
예제 #4
0
 /**
  * Create Solr array for the given record
  *
  * Builds the metadata record, merges component parts into it when the data
  * source settings call for it, converts it to a Solr field array (via XSLT
  * or the record driver), adds host/component links, extra fields, value
  * mappings, building and hierarchical facet values, and finally cleans up
  * empty values.
  *
  * @param array   $record           Mongo record
  * @param integer $mergedComponents Number of component parts merged to the
  * record (passed by reference; incremented by the number reported by
  * mergeComponentParts())
  *
  * @return string[]|false Solr field array, or false if the record is a
  * hidden component part and the data source does not index merged parts
  * @throws Exception If no settings exist for the record's data source
  */
 protected function createSolrArray($record, &$mergedComponents)
 {
     global $configArray;
     $metadataRecord = RecordFactory::createRecord(
         $record['format'],
         MetadataUtils::getRecordData($record, true),
         $record['oai_id'],
         $record['source_id']
     );
     $source = $record['source_id'];
     if (!isset($this->settings[$source])) {
         // Try to reload data source settings as they might have been updated
         // during a long run
         $this->loadDatasources();
         if (!isset($this->settings[$source])) {
             $this->log->log('createSolrArray', "No settings found for data source '{$source}', record " . "{$record['_id']}: " . $this->prettyPrint($record, true), Logger::FATAL);
             throw new Exception("No settings found for data source '{$source}'");
         }
     }
     $settings = $this->settings[$source];

     // Determine whether this record is a component part that gets merged
     // into its host and is therefore hidden
     $hiddenComponent = false;
     if (isset($record['host_record_id'])) {
         if ($settings['componentParts'] == 'merge_all') {
             $hiddenComponent = true;
         } elseif ($settings['componentParts'] == 'merge_non_articles' || $settings['componentParts'] == 'merge_non_earticles') {
             // NOTE(review): both settings are treated identically here,
             // whereas the host record branch below distinguishes
             // 'merge_non_earticles' - confirm this is intended
             $format = $metadataRecord->getFormat();
             if (!in_array($format, $this->allArticleFormats)) {
                 $hiddenComponent = true;
             } elseif (in_array($format, $this->articleFormats)) {
                 $hiddenComponent = true;
             }
         }
     }
     if ($hiddenComponent && !$settings['indexMergedParts']) {
         return false;
     }

     $hasComponentParts = false;
     $components = null;
     if (!isset($record['host_record_id'])) {
         // Fetch info whether component parts exist and need to be merged.
         // empty() also covers a record without any linking_id key.
         if (empty($record['linking_id'])) {
             $this->log->log('createSolrArray', "linking_id missing for record '{$record['_id']}'", Logger::ERROR);
         } else {
             $components = $this->db->record->find(['source_id' => $record['source_id'], 'host_record_id' => $record['linking_id'], 'deleted' => false])->timeout($this->cursorTimeout);
             $hasComponentParts = $components->hasNext();
             $format = $metadataRecord->getFormat();
             $merge = false;
             if ($settings['componentParts'] == 'merge_all') {
                 $merge = true;
             } elseif (!in_array($format, $this->allJournalFormats)) {
                 $merge = true;
             } elseif (in_array($format, $this->journalFormats) && $settings['componentParts'] == 'merge_non_earticles') {
                 $merge = true;
             }
             if (!$merge) {
                 unset($components);
             }
         }
     }
     if (isset($components)) {
         $mergedComponents += $metadataRecord->mergeComponentParts($components);
     }

     if (isset($settings['solrTransformationXSLT'])) {
         $params = ['source_id' => $source, 'institution' => $settings['institution'], 'format' => $settings['format'], 'id_prefix' => $settings['idPrefix']];
         $data = $settings['solrTransformationXSLT']->transformToSolrArray($metadataRecord->toXML(), $params);
     } else {
         $prependTitleWithSubtitle = isset($settings['prepend_title_with_subtitle']) ? $settings['prepend_title_with_subtitle'] : true;
         $data = $metadataRecord->toSolrArray($prependTitleWithSubtitle);
         $this->enrich($source, $settings, $metadataRecord, $data);
     }
     $data['id'] = $record['_id'];

     // Record links between host records and component parts
     if ($metadataRecord->getIsComponentPart()) {
         $hostRecord = null;
         if (isset($record['host_record_id']) && $this->db) {
             $hostRecord = $this->db->record->find(['source_id' => $record['source_id'], 'linking_id' => $record['host_record_id']])->limit(-1)->timeout($this->cursorTimeout)->getNext();
         }
         if (!$hostRecord) {
             if (isset($record['host_record_id'])) {
                 $this->log->log('createSolrArray', "Host record '" . $record['host_record_id'] . "' not found for record '" . $record['_id'] . "'", Logger::WARNING);
             }
             $data['container_title'] = $metadataRecord->getContainerTitle();
         } else {
             $data['hierarchy_parent_id'] = $hostRecord['_id'];
             $hostMetadataRecord = RecordFactory::createRecord($hostRecord['format'], MetadataUtils::getRecordData($hostRecord, true), $hostRecord['oai_id'], $hostRecord['source_id']);
             $data['container_title'] = $data['hierarchy_parent_title'] = $hostMetadataRecord->getTitle();
         }
         $data['container_volume'] = $metadataRecord->getVolume();
         $data['container_issue'] = $metadataRecord->getIssue();
         $data['container_start_page'] = $metadataRecord->getStartPage();
         $data['container_reference'] = $metadataRecord->getContainerReference();
     } else {
         // Add prefixes to hierarchy linking fields
         foreach (['hierarchy_top_id', 'hierarchy_parent_id', 'is_hierarchy_id'] as $field) {
             if (isset($data[$field]) && $data[$field]) {
                 $data[$field] = $record['source_id'] . '.' . $data[$field];
             }
         }
     }
     if ($hasComponentParts) {
         $data['is_hierarchy_id'] = $record['_id'];
         $data['is_hierarchy_title'] = $metadataRecord->getTitle();
     }
     if (!isset($data['institution'])) {
         $data['institution'] = $settings['institution'];
     }

     // Each extra field is a single-element array of field name => value
     foreach ($settings['extraFields'] as $extraField) {
         $fieldName = key($extraField);
         $fieldValue = current($extraField);
         if (isset($data[$fieldName])) {
             if (!is_array($data[$fieldName])) {
                 $data[$fieldName] = [$data[$fieldName]];
             }
             $data[$fieldName][] = $fieldValue;
         } else {
             $data[$fieldName] = $fieldValue;
         }
     }

     // Map field values according to any mapping files
     foreach ($settings['mappingFiles'] as $field => $map) {
         if (isset($data[$field]) && !empty($data[$field])) {
             if (is_array($data[$field])) {
                 // Map every value of a multi-valued field and collect all
                 // results, so that one value's mapping does not replace the
                 // others
                 $newValues = [];
                 foreach ($data[$field] as $value) {
                     if (isset($map[$value])) {
                         $replacement = $map[$value];
                     } elseif (isset($map['##default'])) {
                         $replacement = $map['##default'];
                     } else {
                         // No mapping and no default: keep the value as is,
                         // like the single-valued case below
                         $replacement = $value;
                     }
                     if (is_array($replacement)) {
                         $newValues = array_merge($newValues, array_values($replacement));
                     } else {
                         $newValues[] = $replacement;
                     }
                 }
                 $data[$field] = array_values(array_unique($newValues));
             } else {
                 if (isset($map[$data[$field]])) {
                     $data[$field] = $map[$data[$field]];
                 } elseif (isset($map['##default'])) {
                     $data[$field] = $map['##default'];
                 }
             }
         } elseif (isset($map['##empty'])) {
             $data[$field] = $map['##empty'];
         } elseif (isset($map['##emptyarray'])) {
             $data[$field] = [$map['##emptyarray']];
         }
     }

     // Special case: Special values for building (institution/location).
     // Used by default if building is set as a hierarchical facet.
     if ($this->buildingHierarchy || isset($settings['institutionInBuilding'])) {
         $useInstitution = isset($settings['institutionInBuilding']) ? $settings['institutionInBuilding'] : 'institution';
         switch ($useInstitution) {
             case 'driver':
                 $institutionCode = $data['institution'];
                 break;
             case 'none':
                 $institutionCode = '';
                 break;
             case 'source':
                 $institutionCode = $source;
                 break;
             case 'institution/source':
                 $institutionCode = $settings['institution'] . '/' . $source;
                 break;
             default:
                 $institutionCode = $settings['institution'];
                 break;
         }
         if ($institutionCode) {
             if (isset($data['building']) && $data['building']) {
                 if (is_array($data['building'])) {
                     foreach ($data['building'] as &$building) {
                         // Allow also empty values that might result from
                         // mapping tables
                         if ($building !== '') {
                             $building = "{$institutionCode}/{$building}";
                         }
                     }
                     // Break the reference so later code cannot modify the
                     // last element by accident
                     unset($building);
                 } else {
                     $data['building'] = $institutionCode . '/' . $data['building'];
                 }
             } else {
                 $data['building'] = [$institutionCode];
             }
         }
     }

     // Hierarchical facets: expand "a/b" into "0/a/" and "1/a/b/"
     if (isset($configArray['Solr']['hierarchical_facets'])) {
         foreach ($configArray['Solr']['hierarchical_facets'] as $facet) {
             if (!isset($data[$facet])) {
                 continue;
             }
             $array = [];
             if (!is_array($data[$facet])) {
                 $data[$facet] = [$data[$facet]];
             }
             foreach ($data[$facet] as $datavalue) {
                 if ($datavalue === '') {
                     continue;
                 }
                 $hierarchyString = '';
                 foreach (explode('/', $datavalue) as $level => $part) {
                     $hierarchyString .= '/' . $part;
                     $array[] = $level . $hierarchyString . '/';
                 }
             }
             $data[$facet] = $array;
         }
     }

     // Build allfields from all other fields unless it was provided
     if (!isset($data['allfields'])) {
         $all = [];
         foreach ($data as $key => $field) {
             if (in_array($key, ['fullrecord', 'thumbnail', 'id', 'recordtype', 'ctrlnum'])) {
                 continue;
             }
             if (is_array($field)) {
                 $all = array_merge($all, $field);
             } else {
                 $all[] = $field;
             }
         }
         $data['allfields'] = MetadataUtils::array_iunique($all);
     }
     $data['first_indexed'] = MetadataUtils::formatTimestamp($record['created']->sec);
     $data['last_indexed'] = MetadataUtils::formatTimestamp($record['date']->sec);
     $data['recordtype'] = $record['format'];
     if (!isset($data['fullrecord'])) {
         $data['fullrecord'] = $metadataRecord->toXML();
     }
     if (!is_array($data['format'])) {
         $data['format'] = [$data['format']];
     }
     if (isset($configArray['Solr']['format_in_allfields']) && $configArray['Solr']['format_in_allfields']) {
         foreach ($data['format'] as $format) {
             // Replace numbers since they may be considered word boundaries
             $data['allfields'][] = str_replace(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], ['ax', 'bx', 'cx', 'dx', 'ex', 'fx', 'gx', 'hx', 'ix', 'jx'], MetadataUtils::normalize($format));
         }
     }
     if ($hiddenComponent) {
         $data['hidden_component_boolean'] = true;
     }

     // Normalize all values and drop empty ones. The inner loop uses its own
     // index variable so that the outer field name in $key stays intact for
     // the final unset().
     foreach ($data as $key => &$values) {
         if (is_array($values)) {
             foreach ($values as $idx => &$value) {
                 $value = MetadataUtils::normalizeUnicode($value);
                 // empty() also covers 0, 0.0 and '0'
                 if (empty($value)) {
                     unset($values[$idx]);
                 }
             }
             unset($value);
             $values = array_values(array_unique($values));
         } elseif ($key !== 'fullrecord') {
             $values = MetadataUtils::normalizeUnicode($values);
         }
         if (empty($values)) {
             unset($data[$key]);
         }
     }
     // Break the reference so the returned array holds plain values
     unset($values);

     return $data;
 }