/**
 * Check if records are duplicate matches
 *
 * The candidate is turned into a metadata record and compared with the
 * original in the following order; the first decisive check wins:
 *
 *  1. Candidate with access restrictions           -> no match
 *  2. Both raw and mapped formats differ           -> no match
 *  3. Shared ISBN                                  -> match
 *  4. Shared unique ID (e.g. NBN)                  -> match
 *  5. Both have ISSNs but none in common           -> no match
 *  6. Both have publication years and they differ  -> no match
 *  7. Both have page counts differing by over 10   -> no match
 *  8. Series ISSN or series numbering differs      -> no match
 *  9. Either normalized title is empty             -> no match
 * 10. Title edit distance >= 10 % of the title     -> no match
 * 11. Only one record has a main author, or the authors neither pass
 *     MetadataUtils::authorMatch() nor stay within an edit distance of
 *     20 % of the author                           -> no match
 *
 * Anything that survives all checks is considered a duplicate.
 *
 * When $this->verbose is set, the reason for each decision is echoed;
 * lines starting with "++" report a match, lines starting with "--" a
 * discard.
 *
 * @param array  $record     Mongo record
 * @param object $origRecord Metadata record (from $record)
 * @param array  $candidate  Candidate Mongo record
 *
 * @return boolean
 */
protected function matchRecords($record, $origRecord, $candidate)
{
    // Decode the candidate's record data once; it is needed both for
    // building the metadata record and for the verbose dump.
    $candidateData = MetadataUtils::getRecordData($candidate, true);
    $cRecord = RecordFactory::createRecord(
        $candidate['format'],
        $candidateData,
        $candidate['oai_id'],
        $candidate['source_id']
    );
    if ($this->verbose) {
        echo "\nCandidate " . $candidate['_id'] . ":\n"
            . $candidateData . "\n";
    }

    // Check that the record does not have access restrictions
    if ($cRecord->getAccessRestrictions()) {
        if ($this->verbose) {
            echo "--Candidate has access restrictions\n";
        }
        return false;
    }

    // Check format: only discard when both the raw and the mapped formats
    // differ
    $origFormat = $origRecord->getFormat();
    $cFormat = $cRecord->getFormat();
    $origMapped = $this->solrUpdater->mapFormat(
        $record['source_id'], $origFormat
    );
    $cMapped = $this->solrUpdater->mapFormat(
        $candidate['source_id'], $cFormat
    );
    if ($origFormat != $cFormat && $origMapped != $cMapped) {
        if ($this->verbose) {
            echo "--Format mismatch: {$origFormat} != {$cFormat} "
                . "and {$origMapped} != {$cMapped}\n";
        }
        return false;
    }

    // Check for common ISBN
    $origISBNs = $origRecord->getISBNs();
    $cISBNs = $cRecord->getISBNs();
    $isect = array_intersect($origISBNs, $cISBNs);
    if (!empty($isect)) {
        // Shared ISBN -> match
        if ($this->verbose) {
            echo "++ISBN match:\n";
            print_r($origISBNs);
            print_r($cISBNs);
            echo $origRecord->getFullTitle() . "\n";
            echo $cRecord->getFullTitle() . "\n";
        }
        return true;
    }

    // Check for other common ID (e.g. NBN)
    $origIDs = $origRecord->getUniqueIDs();
    $cIDs = $cRecord->getUniqueIDs();
    $isect = array_intersect($origIDs, $cIDs);
    if (!empty($isect)) {
        // Shared ID -> match
        if ($this->verbose) {
            echo "++ID match:\n";
            print_r($origIDs);
            print_r($cIDs);
            echo $origRecord->getFullTitle() . "\n";
            echo $cRecord->getFullTitle() . "\n";
        }
        return true;
    }

    // Check ISSNs: a mismatch is only decisive when both records have some
    $origISSNs = $origRecord->getISSNs();
    $cISSNs = $cRecord->getISSNs();
    $commonISSNs = array_intersect($origISSNs, $cISSNs);
    if (!empty($origISSNs) && !empty($cISSNs) && empty($commonISSNs)) {
        // Both have ISSNs but none match
        if ($this->verbose) {
            echo "--ISSN mismatch:\n";
            print_r($origISSNs);
            print_r($cISSNs);
            echo $origRecord->getFullTitle() . "\n";
            echo $cRecord->getFullTitle() . "\n";
        }
        return false;
    }

    // Check publication year (only when both records have one)
    $origYear = $origRecord->getPublicationYear();
    $cYear = $cRecord->getPublicationYear();
    if ($origYear && $cYear && $origYear != $cYear) {
        if ($this->verbose) {
            echo "--Year mismatch: {$origYear} != {$cYear}\n";
        }
        return false;
    }

    // Check page count, tolerating a difference of up to 10 pages
    $pages = $origRecord->getPageCount();
    $cPages = $cRecord->getPageCount();
    if ($pages && $cPages && abs($pages - $cPages) > 10) {
        if ($this->verbose) {
            echo "--Pages mismatch ({$pages} != {$cPages})\n";
        }
        return false;
    }

    // Check series information
    if ($origRecord->getSeriesISSN() != $cRecord->getSeriesISSN()) {
        if ($this->verbose) {
            echo "--Series ISSN mismatch\n";
        }
        return false;
    }
    if ($origRecord->getSeriesNumbering() != $cRecord->getSeriesNumbering()) {
        if ($this->verbose) {
            echo "--Series numbering mismatch\n";
        }
        return false;
    }

    // Check title
    $origTitle = MetadataUtils::normalize($origRecord->getTitle(true));
    $cTitle = MetadataUtils::normalize($cRecord->getTitle(true));
    if (!$origTitle || !$cTitle) {
        // No title match without title...
        if ($this->verbose) {
            echo "No title - no further matching\n";
        }
        return false;
    }
    // levenshtein() works on bytes and historically rejects arguments longer
    // than 255 bytes, so only the first 255 bytes are compared. The distance
    // is expressed as a percentage of the text that was actually compared;
    // dividing by the untruncated length would understate the difference for
    // long titles.
    $origTitleCmp = substr($origTitle, 0, 255);
    $cTitleCmp = substr($cTitle, 0, 255);
    $lev = levenshtein($origTitleCmp, $cTitleCmp)
        / strlen($origTitleCmp) * 100;
    if ($lev >= 10) {
        if ($this->verbose) {
            echo "--Title lev discard: {$lev}\nOriginal: {$origTitle}\n"
                . "Candidate: {$cTitle}\n";
        }
        return false;
    }

    // Check main author
    $origAuthor = MetadataUtils::normalize($origRecord->getMainAuthor());
    $cAuthor = MetadataUtils::normalize($cRecord->getMainAuthor());
    $authorLev = 0;
    if ($origAuthor || $cAuthor) {
        if (!$origAuthor || !$cAuthor) {
            // Only one of the records has an author
            if ($this->verbose) {
                echo "\nAuthor discard:\nOriginal: {$origAuthor}\n"
                    . "Candidate: {$cAuthor}\n";
            }
            return false;
        }
        if (!MetadataUtils::authorMatch($origAuthor, $cAuthor)) {
            // Fall back to a fuzzy comparison. The byte-based distance is
            // divided by the byte length of the compared text so that both
            // sides of the ratio use the same unit.
            $origAuthorCmp = substr($origAuthor, 0, 255);
            $cAuthorCmp = substr($cAuthor, 0, 255);
            $authorLev = levenshtein($origAuthorCmp, $cAuthorCmp)
                / strlen($origAuthorCmp) * 100;
            if ($authorLev > 20) {
                if ($this->verbose) {
                    echo "\nAuthor lev discard (lev: {$lev}, authorLev: "
                        . "{$authorLev}):\nOriginal: {$origAuthor}\n"
                        . "Candidate: {$cAuthor}\n";
                }
                return false;
            }
        }
    }

    if ($this->verbose) {
        echo "\nTitle match (lev: {$lev}, authorLev: {$authorLev}):\n";
        echo $origRecord->getFullTitle() . "\n";
        echo "   {$origAuthor} - {$origTitle}.\n";
        echo $cRecord->getFullTitle() . "\n";
        echo "   {$cAuthor} - {$cTitle}.\n";
    }

    // We have a match!
    return true;
}