/**
 * Dedup: Collect the unique IDs (control numbers) of this record
 *
 * Every ID is reported as "(source)number". Fields 015, 016 and 024 are
 * inspected; a field contributes an ID only when both its source and its
 * normalized number are non-empty.
 *
 * @return string[]
 */
public function getUniqueIDs()
{
    $ids = [];

    // 015 and 016 are handled identically: the number is the first
    // space-delimited token of subfield a, the source comes from subfield 2.
    foreach (['015', '016'] as $tag) {
        $field = $this->getField($tag);
        if (!$field) {
            continue;
        }
        $number = MetadataUtils::normalize(
            strtok($this->getSubfield($field, 'a'), ' ')
        );
        $source = $this->getSubfield($field, '2');
        if ($source && $number) {
            $ids[] = "({$source}){$number}";
        }
    }

    $field = $this->getField('024');
    if (!$field) {
        return $ids;
    }

    $number = MetadataUtils::normalize(
        strtok($this->getSubfield($field, 'a'), ' ')
    );
    // For 024 the source is derived from the first indicator; only
    // indicator 7 reads it from subfield 2.
    switch ($this->getIndicator($field, 1)) {
    case '0':
        $source = 'istc';
        break;
    case '1':
        $source = 'upc';
        break;
    case '2':
        $source = 'ismn';
        break;
    case '3':
        $source = 'ian';
        break;
    case '4':
        $source = 'sici';
        break;
    case '7':
        $source = $this->getSubfield($field, '2');
        break;
    default:
        $source = '';
    }
    if ($source && $number) {
        $ids[] = "({$source}){$number}";
    }

    return $ids;
}
/**
 * Check if records are duplicate matches
 *
 * The candidate is rejected outright when it has access restrictions or when
 * both its raw format and its mapped format differ from the original's. A
 * shared ISBN or a shared unique ID is then accepted as a match immediately.
 * Otherwise the pair must pass the ISSN, publication year, page count and
 * series checks, and finally have sufficiently similar normalized titles and
 * main authors.
 *
 * When $this->verbose is set, every decision is echoed; lines starting with
 * "--" report a rejection and lines starting with "++" report an ID match.
 *
 * @param array  $record     Mongo record
 * @param object $origRecord Metadata record (from $record)
 * @param array  $candidate  Candidate Mongo record
 *
 * @return boolean
 */
protected function matchRecords($record, $origRecord, $candidate)
{
    // Decode the candidate data once; it is needed both for the metadata
    // record and for the verbose dump.
    $candidateData = MetadataUtils::getRecordData($candidate, true);
    $cRecord = RecordFactory::createRecord(
        $candidate['format'],
        $candidateData,
        $candidate['oai_id'],
        $candidate['source_id']
    );
    if ($this->verbose) {
        echo "\nCandidate " . $candidate['_id'] . ":\n" . $candidateData . "\n";
    }

    // Check that the record does not have access restrictions
    if ($cRecord->getAccessRestrictions()) {
        if ($this->verbose) {
            echo "--Candidate has access restrictions\n";
        }
        return false;
    }

    // Check format: reject only if neither the raw nor the mapped formats
    // agree
    $origFormat = $origRecord->getFormat();
    $cFormat = $cRecord->getFormat();
    $origMapped = $this->solrUpdater->mapFormat(
        $record['source_id'], $origFormat
    );
    $cMapped = $this->solrUpdater->mapFormat(
        $candidate['source_id'], $cFormat
    );
    if ($origFormat != $cFormat && $origMapped != $cMapped) {
        if ($this->verbose) {
            echo "--Format mismatch: {$origFormat} != {$cFormat} "
                . "and {$origMapped} != {$cMapped}\n";
        }
        return false;
    }

    // Check for common ISBN
    $origISBNs = $origRecord->getISBNs();
    $cISBNs = $cRecord->getISBNs();
    $isect = array_intersect($origISBNs, $cISBNs);
    if (!empty($isect)) {
        // Shared ISBN -> match
        if ($this->verbose) {
            echo "++ISBN match:\n";
            print_r($origISBNs);
            print_r($cISBNs);
            echo $origRecord->getFullTitle() . "\n";
            echo $cRecord->getFullTitle() . "\n";
        }
        return true;
    }

    // Check for other common ID (e.g. NBN)
    $origIDs = $origRecord->getUniqueIDs();
    $cIDs = $cRecord->getUniqueIDs();
    $isect = array_intersect($origIDs, $cIDs);
    if (!empty($isect)) {
        // Shared ID -> match
        if ($this->verbose) {
            echo "++ID match:\n";
            print_r($origIDs);
            print_r($cIDs);
            echo $origRecord->getFullTitle() . "\n";
            echo $cRecord->getFullTitle() . "\n";
        }
        return true;
    }

    $origISSNs = $origRecord->getISSNs();
    $cISSNs = $cRecord->getISSNs();
    $commonISSNs = array_intersect($origISSNs, $cISSNs);
    if (!empty($origISSNs) && !empty($cISSNs) && empty($commonISSNs)) {
        // Both have ISSNs but none match. This is a rejection, so it is
        // reported with the "--" prefix like the other rejections.
        if ($this->verbose) {
            echo "--ISSN mismatch:\n";
            print_r($origISSNs);
            print_r($cISSNs);
            echo $origRecord->getFullTitle() . "\n";
            echo $cRecord->getFullTitle() . "\n";
        }
        return false;
    }

    // Publication years are compared only when both records have one
    $origYear = $origRecord->getPublicationYear();
    $cYear = $cRecord->getPublicationYear();
    if ($origYear && $cYear && $origYear != $cYear) {
        if ($this->verbose) {
            echo "--Year mismatch: {$origYear} != {$cYear}\n";
        }
        return false;
    }

    // Page counts may differ by at most 10 when both are known
    $pages = $origRecord->getPageCount();
    $cPages = $cRecord->getPageCount();
    if ($pages && $cPages && abs($pages - $cPages) > 10) {
        if ($this->verbose) {
            echo "--Pages mismatch ({$pages} != {$cPages})\n";
        }
        return false;
    }

    // Series information must be equal (loose comparison). var_export is
    // used for the diagnostics because the value types are not known here.
    $origSeriesISSN = $origRecord->getSeriesISSN();
    $cSeriesISSN = $cRecord->getSeriesISSN();
    if ($origSeriesISSN != $cSeriesISSN) {
        if ($this->verbose) {
            echo '--Series ISSN mismatch: '
                . var_export($origSeriesISSN, true) . ' != '
                . var_export($cSeriesISSN, true) . "\n";
        }
        return false;
    }
    $origSeriesNumbering = $origRecord->getSeriesNumbering();
    $cSeriesNumbering = $cRecord->getSeriesNumbering();
    if ($origSeriesNumbering != $cSeriesNumbering) {
        if ($this->verbose) {
            echo '--Series numbering mismatch: '
                . var_export($origSeriesNumbering, true) . ' != '
                . var_export($cSeriesNumbering, true) . "\n";
        }
        return false;
    }

    $origTitle = MetadataUtils::normalize($origRecord->getTitle(true));
    $cTitle = MetadataUtils::normalize($cRecord->getTitle(true));
    if (!$origTitle || !$cTitle) {
        // No title match without title...
        if ($this->verbose) {
            echo "No title - no further matching\n";
        }
        return false;
    }

    // Title distance as a percentage of the original title's byte length;
    // 10 % or more is a rejection. Only the first 255 bytes are compared.
    // NOTE(review): the divisor is the length of the full title, not of the
    // compared 255-byte prefix, so very long titles are judged more
    // leniently - confirm whether that is intended before changing it.
    $lev = levenshtein(substr($origTitle, 0, 255), substr($cTitle, 0, 255));
    $lev = $lev / strlen($origTitle) * 100;
    if ($lev >= 10) {
        if ($this->verbose) {
            echo "--Title lev discard: {$lev}\nOriginal: {$origTitle}\n"
                . "Candidate: {$cTitle}\n";
        }
        return false;
    }

    $origAuthor = MetadataUtils::normalize($origRecord->getMainAuthor());
    $cAuthor = MetadataUtils::normalize($cRecord->getMainAuthor());
    $authorLev = 0;
    if ($origAuthor || $cAuthor) {
        // An author on only one side is a rejection
        if (!$origAuthor || !$cAuthor) {
            if ($this->verbose) {
                echo "\nAuthor discard:\nOriginal: {$origAuthor}\n"
                    . "Candidate: {$cAuthor}\n";
            }
            return false;
        }
        if (!MetadataUtils::authorMatch($origAuthor, $cAuthor)) {
            // Fall back to edit distance; more than 20 % is a rejection.
            // NOTE(review): the distance is byte-based while the divisor is
            // a character count (mb_strlen) - confirm this mix is intended.
            $authorLev = levenshtein(
                substr($origAuthor, 0, 255), substr($cAuthor, 0, 255)
            );
            $authorLev = $authorLev / mb_strlen($origAuthor) * 100;
            if ($authorLev > 20) {
                if ($this->verbose) {
                    echo "\nAuthor lev discard (lev: {$lev}, authorLev: "
                        . "{$authorLev}):\nOriginal: {$origAuthor}\n"
                        . "Candidate: {$cAuthor}\n";
                }
                return false;
            }
        }
    }

    if ($this->verbose) {
        echo "\nTitle match (lev: {$lev}, authorLev: {$authorLev}):\n";
        echo $origRecord->getFullTitle() . "\n";
        echo " {$origAuthor} - {$origTitle}.\n";
        echo $cRecord->getFullTitle() . "\n";
        echo " {$cAuthor} - {$cTitle}.\n";
    }
    // We have a match!
    return true;
}
/**
 * Build the normalized title key used for dedup
 *
 * The title is split on single spaces and its words are concatenated until
 * a length limit is reached. Normally collection stops after the word that
 * brings the key past 35 characters or to more than three words longer
 * than three characters. If the normalized title starts with one of
 * MetadataUtils::$fullTitlePrefixes, collection instead continues until the
 * key is longer than 100 characters.
 *
 * @param string $title Title
 *
 * @return string
 */
public static function createTitleKey($title)
{
    // Decide whether the long ("full") key limit applies
    $useFullTitle = false;
    if (isset(MetadataUtils::$fullTitlePrefixes)) {
        $normalizedTitle = MetadataUtils::normalize($title);
        foreach (MetadataUtils::$fullTitlePrefixes as $candidatePrefix) {
            $prefixLength = strlen($candidatePrefix);
            if (0 === strncmp($normalizedTitle, $candidatePrefix, $prefixLength)) {
                $useFullTitle = true;
                break;
            }
        }
    }

    $rawKey = '';
    $significantChars = 0;
    $longWordCount = 0;
    foreach (explode(' ', $title) as $part) {
        $rawKey .= $part;

        $partLength = mb_strlen($part);
        $significantChars += $partLength;
        if ($partLength > 3) {
            $longWordCount++;
        }

        // The word that crosses the limit is still part of the key
        $limitReached = $useFullTitle
            ? $significantChars > 100
            : ($longWordCount > 3 || $significantChars > 35);
        if ($limitReached) {
            break;
        }
    }

    return MetadataUtils::normalize($rawKey);
}
/**
 * Create Solr array for the given record
 *
 * Builds the Solr field array from the metadata record (via the data
 * source's XSLT when configured, otherwise via toSolrArray() and enrich()),
 * then adds host/component links, extra fields, value mappings, building
 * and hierarchical facet values, the allfields catch-all and the indexing
 * timestamps, and finally normalizes all values and drops empty ones.
 *
 * @param array   $record           Mongo record
 * @param integer $mergedComponents Number of component parts merged to the
 * record (passed by reference and incremented here)
 *
 * @return string[]|false Solr fields, or false when the record is a hidden
 * component part and the data source does not index merged parts
 * @throws Exception When no settings exist for the record's data source
 */
protected function createSolrArray($record, &$mergedComponents)
{
    global $configArray;

    $metadataRecord = RecordFactory::createRecord(
        $record['format'],
        MetadataUtils::getRecordData($record, true),
        $record['oai_id'],
        $record['source_id']
    );

    $source = $record['source_id'];
    if (!isset($this->settings[$source])) {
        // Try to reload data source settings as they might have been updated
        // during a long run
        $this->loadDatasources();
        if (!isset($this->settings[$source])) {
            $this->log->log(
                'createSolrArray',
                "No settings found for data source '{$source}', record "
                . "{$record['_id']}: " . $this->prettyPrint($record, true),
                Logger::FATAL
            );
            throw new Exception("No settings found for data source '{$source}'");
        }
    }
    $settings = $this->settings[$source];

    // Decide whether this record is a component part that gets merged into
    // its host and is therefore hidden.
    // NOTE(review): 'merge_non_articles' and 'merge_non_earticles' take the
    // same path here, whereas the host-side merge decision further down
    // treats journalFormats specially only for 'merge_non_earticles' -
    // confirm this asymmetry is intended.
    $hiddenComponent = false;
    if (isset($record['host_record_id'])) {
        if ($settings['componentParts'] == 'merge_all') {
            $hiddenComponent = true;
        } elseif ($settings['componentParts'] == 'merge_non_articles'
            || $settings['componentParts'] == 'merge_non_earticles'
        ) {
            $format = $metadataRecord->getFormat();
            if (!in_array($format, $this->allArticleFormats)) {
                $hiddenComponent = true;
            } elseif (in_array($format, $this->articleFormats)) {
                $hiddenComponent = true;
            }
        }
    }

    if ($hiddenComponent && !$settings['indexMergedParts']) {
        return false;
    }

    $hasComponentParts = false;
    $components = null;
    if (!isset($record['host_record_id'])) {
        // Fetch info whether component parts exist and need to be merged
        if (!$record['linking_id']) {
            $this->log->log(
                'createSolrArray',
                "linking_id missing for record '{$record['_id']}'",
                Logger::ERROR
            );
        } else {
            $components = $this->db->record->find(
                [
                    'source_id' => $record['source_id'],
                    'host_record_id' => $record['linking_id'],
                    'deleted' => false
                ]
            )->timeout($this->cursorTimeout);
            $hasComponentParts = $components->hasNext();
            $format = $metadataRecord->getFormat();
            $merge = false;
            if ($settings['componentParts'] == 'merge_all') {
                $merge = true;
            } elseif (!in_array($format, $this->allJournalFormats)) {
                $merge = true;
            } elseif (in_array($format, $this->journalFormats)
                && $settings['componentParts'] == 'merge_non_earticles'
            ) {
                $merge = true;
            }
            if (!$merge) {
                unset($components);
            }
        }
    }

    if (isset($components)) {
        $mergedComponents += $metadataRecord->mergeComponentParts($components);
    }

    if (isset($settings['solrTransformationXSLT'])) {
        $params = [
            'source_id' => $source,
            'institution' => $settings['institution'],
            'format' => $settings['format'],
            'id_prefix' => $settings['idPrefix']
        ];
        $data = $settings['solrTransformationXSLT']
            ->transformToSolrArray($metadataRecord->toXML(), $params);
    } else {
        $prependTitleWithSubtitle
            = isset($settings['prepend_title_with_subtitle'])
            ? $settings['prepend_title_with_subtitle']
            : true;
        $data = $metadataRecord->toSolrArray($prependTitleWithSubtitle);
        $this->enrich($source, $settings, $metadataRecord, $data);
    }

    $data['id'] = $record['_id'];

    // Record links between host records and component parts
    if ($metadataRecord->getIsComponentPart()) {
        $hostRecord = null;
        if (isset($record['host_record_id']) && $this->db) {
            $hostRecord = $this->db->record->find(
                [
                    'source_id' => $record['source_id'],
                    'linking_id' => $record['host_record_id']
                ]
            )->limit(-1)->timeout($this->cursorTimeout)->getNext();
        }
        if (!$hostRecord) {
            if (isset($record['host_record_id'])) {
                $this->log->log(
                    'createSolrArray',
                    "Host record '" . $record['host_record_id']
                    . "' not found for record '" . $record['_id'] . "'",
                    Logger::WARNING
                );
            }
            $data['container_title'] = $metadataRecord->getContainerTitle();
        } else {
            $data['hierarchy_parent_id'] = $hostRecord['_id'];
            $hostMetadataRecord = RecordFactory::createRecord(
                $hostRecord['format'],
                MetadataUtils::getRecordData($hostRecord, true),
                $hostRecord['oai_id'],
                $hostRecord['source_id']
            );
            $data['container_title'] = $data['hierarchy_parent_title']
                = $hostMetadataRecord->getTitle();
        }
        $data['container_volume'] = $metadataRecord->getVolume();
        $data['container_issue'] = $metadataRecord->getIssue();
        $data['container_start_page'] = $metadataRecord->getStartPage();
        $data['container_reference'] = $metadataRecord->getContainerReference();
    } else {
        // Add prefixes to hierarchy linking fields
        $linkingFields = [
            'hierarchy_top_id', 'hierarchy_parent_id', 'is_hierarchy_id'
        ];
        foreach ($linkingFields as $field) {
            if (isset($data[$field]) && $data[$field]) {
                $data[$field] = $record['source_id'] . '.' . $data[$field];
            }
        }
    }

    if ($hasComponentParts) {
        $data['is_hierarchy_id'] = $record['_id'];
        $data['is_hierarchy_title'] = $metadataRecord->getTitle();
    }

    if (!isset($data['institution'])) {
        $data['institution'] = $settings['institution'];
    }

    // Extra fields are single-entry arrays (field name => value); a value is
    // appended when the field already exists.
    foreach ($settings['extraFields'] as $extraField) {
        $fieldName = key($extraField);
        $fieldValue = current($extraField);
        if (isset($data[$fieldName])) {
            if (!is_array($data[$fieldName])) {
                $data[$fieldName] = [$data[$fieldName]];
            }
            $data[$fieldName][] = $fieldValue;
        } else {
            $data[$fieldName] = $fieldValue;
        }
    }

    // Map field values according to any mapping files
    foreach ($settings['mappingFiles'] as $field => $map) {
        if (isset($data[$field]) && !empty($data[$field])) {
            if (is_array($data[$field])) {
                // Map every value individually and collect the results.
                // Previously each iteration overwrote the result, so only
                // the last value's mapping survived. As in the scalar
                // branch below, a value with neither a mapping nor a
                // '##default' is kept unchanged.
                $newValues = [];
                foreach ($data[$field] as $value) {
                    if (isset($map[$value])) {
                        $replacement = $map[$value];
                    } elseif (isset($map['##default'])) {
                        $replacement = $map['##default'];
                    } else {
                        $replacement = $value;
                    }
                    if (is_array($replacement)) {
                        $newValues = array_merge(
                            $newValues, array_values($replacement)
                        );
                    } else {
                        $newValues[] = $replacement;
                    }
                }
                $data[$field] = array_values(array_unique($newValues));
            } else {
                if (isset($map[$data[$field]])) {
                    $data[$field] = $map[$data[$field]];
                } elseif (isset($map['##default'])) {
                    $data[$field] = $map['##default'];
                }
            }
        } elseif (isset($map['##empty'])) {
            $data[$field] = $map['##empty'];
        } elseif (isset($map['##emptyarray'])) {
            $data[$field] = [$map['##emptyarray']];
        }
    }

    // Special case: Special values for building (institution/location).
    // Used by default if building is set as a hierarchical facet.
    if ($this->buildingHierarchy || isset($settings['institutionInBuilding'])) {
        $useInstitution = isset($settings['institutionInBuilding'])
            ? $settings['institutionInBuilding']
            : 'institution';
        switch ($useInstitution) {
        case 'driver':
            $institutionCode = $data['institution'];
            break;
        case 'none':
            $institutionCode = '';
            break;
        case 'source':
            $institutionCode = $source;
            break;
        case 'institution/source':
            $institutionCode = $settings['institution'] . '/' . $source;
            break;
        default:
            $institutionCode = $settings['institution'];
            break;
        }
        if ($institutionCode) {
            if (isset($data['building']) && $data['building']) {
                if (is_array($data['building'])) {
                    foreach ($data['building'] as &$building) {
                        // Allow also empty values that might result from
                        // mapping tables
                        if ($building !== '') {
                            $building = "{$institutionCode}/{$building}";
                        }
                    }
                    // Break the reference so that a later write to
                    // $building cannot modify the last array element
                    unset($building);
                } else {
                    $data['building']
                        = $institutionCode . '/' . $data['building'];
                }
            } else {
                $data['building'] = [$institutionCode];
            }
        }
    }

    // Hierarchical facets: every "a/b" value is expanded into one entry per
    // level, e.g. "0/a/" and "1/a/b/"
    if (isset($configArray['Solr']['hierarchical_facets'])) {
        foreach ($configArray['Solr']['hierarchical_facets'] as $facet) {
            if (!isset($data[$facet])) {
                continue;
            }
            $array = [];
            if (!is_array($data[$facet])) {
                $data[$facet] = [$data[$facet]];
            }
            foreach ($data[$facet] as $datavalue) {
                if ($datavalue === '') {
                    continue;
                }
                $levels = explode('/', $datavalue);
                $levelCount = count($levels);
                $hierarchyString = '';
                for ($i = 0; $i < $levelCount; $i++) {
                    $hierarchyString .= '/' . $levels[$i];
                    $array[] = $i . $hierarchyString . '/';
                }
            }
            $data[$facet] = $array;
        }
    }

    // Build the catch-all field from everything except technical fields
    if (!isset($data['allfields'])) {
        $excluded = ['fullrecord', 'thumbnail', 'id', 'recordtype', 'ctrlnum'];
        $all = [];
        foreach ($data as $key => $field) {
            if (in_array($key, $excluded)) {
                continue;
            }
            if (is_array($field)) {
                $all = array_merge($all, $field);
            } else {
                $all[] = $field;
            }
        }
        $data['allfields'] = MetadataUtils::array_iunique($all);
    }

    $data['first_indexed']
        = MetadataUtils::formatTimestamp($record['created']->sec);
    $data['last_indexed']
        = MetadataUtils::formatTimestamp($record['date']->sec);
    $data['recordtype'] = $record['format'];
    if (!isset($data['fullrecord'])) {
        $data['fullrecord'] = $metadataRecord->toXML();
    }
    if (!is_array($data['format'])) {
        $data['format'] = [$data['format']];
    }

    if (isset($configArray['Solr']['format_in_allfields'])
        && $configArray['Solr']['format_in_allfields']
    ) {
        foreach ($data['format'] as $format) {
            // Replace numbers since they may be considered word boundaries
            $data['allfields'][] = str_replace(
                ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
                ['ax', 'bx', 'cx', 'dx', 'ex', 'fx', 'gx', 'hx', 'ix', 'jx'],
                MetadataUtils::normalize($format)
            );
        }
    }

    if ($hiddenComponent) {
        $data['hidden_component_boolean'] = true;
    }

    // Normalize all values and drop empty ones. The inner loop uses its own
    // key variable: reusing $key there clobbered the field name, so a field
    // whose values were all removed was left behind as an empty array.
    foreach ($data as $key => &$values) {
        if (is_array($values)) {
            foreach ($values as $index => &$value) {
                $value = MetadataUtils::normalizeUnicode($value);
                if (empty($value)
                    || $value === 0
                    || $value === 0.0
                    || $value === '0'
                ) {
                    unset($values[$index]);
                }
            }
            unset($value);
            $values = array_values(array_unique($values));
        } elseif ($key != 'fullrecord') {
            $values = MetadataUtils::normalizeUnicode($values);
        }
        if (empty($values)
            || $values === 0
            || $values === 0.0
            || $values === '0'
        ) {
            unset($data[$key]);
        }
    }
    unset($values);

    return $data;
}