Ejemplo n.º 1
0
 /**
  * Check if records are duplicate matches
  *
  * The checks run in this order and the first decisive one wins:
  *  1. Candidate has access restrictions               -> no match
  *  2. Both the raw and the mapped formats differ      -> no match
  *  3. The records share an ISBN                       -> match
  *  4. The records share another unique ID (e.g. NBN)  -> match
  *  5. Both have ISSNs but none in common              -> no match
  *  6. Both have a publication year and they differ    -> no match
  *  7. Both have a page count differing by more than 10 -> no match
  *  8. Series ISSN or series numbering differ          -> no match
  *  9. Either normalized title is empty                -> no match
  * 10. Title edit distance is >= 10 % of the original  -> no match
  * 11. Only one record has a main author, or the authors fail
  *     MetadataUtils::authorMatch() and their edit distance is
  *     > 20 % of the original                          -> no match
  * Anything that survives all of the above is a match.
  *
  * Edit distances are computed with levenshtein() on the first 255 bytes of
  * each string and expressed as a percentage of the byte length of that same
  * (truncated) original string, so numerator and denominator always use the
  * same unit and the same portion of the text.
  *
  * When $this->verbose is set, diagnostics are echoed to standard output.
  *
  * @param array  $record     Mongo record (only 'source_id' is read here)
  * @param object $origRecord Metadata record (from $record)
  * @param array  $candidate  Candidate Mongo record ('_id', 'format',
  *                           'oai_id' and 'source_id' are read here)
  *
  * @return boolean
  */
 protected function matchRecords($record, $origRecord, $candidate)
 {
     // Fetch the candidate's record data once; it is needed both for building
     // the metadata record and for the verbose dump.
     $candidateData = MetadataUtils::getRecordData($candidate, true);
     $cRecord = RecordFactory::createRecord(
         $candidate['format'],
         $candidateData,
         $candidate['oai_id'],
         $candidate['source_id']
     );
     if ($this->verbose) {
         echo "\nCandidate " . $candidate['_id'] . ":\n" . $candidateData . "\n";
     }

     // Check that the record does not have access restrictions
     if ($cRecord->getAccessRestrictions()) {
         if ($this->verbose) {
             echo "--Candidate has access restrictions\n";
         }
         return false;
     }

     // Check format: reject only if both the raw and the mapped formats differ
     $origFormat = $origRecord->getFormat();
     $cFormat = $cRecord->getFormat();
     $origMapped = $this->solrUpdater->mapFormat($record['source_id'], $origFormat);
     $cMapped = $this->solrUpdater->mapFormat($candidate['source_id'], $cFormat);
     if ($origFormat != $cFormat && $origMapped != $cMapped) {
         if ($this->verbose) {
             echo "--Format mismatch: {$origFormat} != {$cFormat} " . "and {$origMapped} != {$cMapped}\n";
         }
         return false;
     }

     // Check for common ISBN
     $origISBNs = $origRecord->getISBNs();
     $cISBNs = $cRecord->getISBNs();
     $isect = array_intersect($origISBNs, $cISBNs);
     if (!empty($isect)) {
         // Shared ISBN -> match
         if ($this->verbose) {
             echo "++ISBN match:\n";
             print_r($origISBNs);
             print_r($cISBNs);
             echo $origRecord->getFullTitle() . "\n";
             echo $cRecord->getFullTitle() . "\n";
         }
         return true;
     }

     // Check for other common ID (e.g. NBN)
     $origIDs = $origRecord->getUniqueIDs();
     $cIDs = $cRecord->getUniqueIDs();
     $isect = array_intersect($origIDs, $cIDs);
     if (!empty($isect)) {
         // Shared ID -> match
         if ($this->verbose) {
             echo "++ID match:\n";
             print_r($origIDs);
             print_r($cIDs);
             echo $origRecord->getFullTitle() . "\n";
             echo $cRecord->getFullTitle() . "\n";
         }
         return true;
     }

     // ISSNs only disqualify: a shared ISSN is not treated as a match by itself
     $origISSNs = $origRecord->getISSNs();
     $cISSNs = $cRecord->getISSNs();
     $commonISSNs = array_intersect($origISSNs, $cISSNs);
     if (!empty($origISSNs) && !empty($cISSNs) && empty($commonISSNs)) {
         // Both have ISSNs but none match
         if ($this->verbose) {
             echo "++ISSN mismatch:\n";
             print_r($origISSNs);
             print_r($cISSNs);
             echo $origRecord->getFullTitle() . "\n";
             echo $cRecord->getFullTitle() . "\n";
         }
         return false;
     }

     // Publication year is compared only when both records provide one
     $origYear = $origRecord->getPublicationYear();
     $cYear = $cRecord->getPublicationYear();
     if ($origYear && $cYear && $origYear != $cYear) {
         if ($this->verbose) {
             echo "--Year mismatch: {$origYear} != {$cYear}\n";
         }
         return false;
     }

     // Page counts may differ by up to 10 pages when both are known
     $pages = $origRecord->getPageCount();
     $cPages = $cRecord->getPageCount();
     if ($pages && $cPages && abs($pages - $cPages) > 10) {
         if ($this->verbose) {
             echo "--Pages mismatch ({$pages} != {$cPages})\n";
         }
         return false;
     }

     // Series information must agree
     if ($origRecord->getSeriesISSN() != $cRecord->getSeriesISSN()) {
         if ($this->verbose) {
             echo "--Series ISSN mismatch\n";
         }
         return false;
     }
     if ($origRecord->getSeriesNumbering() != $cRecord->getSeriesNumbering()) {
         if ($this->verbose) {
             echo "--Series numbering mismatch\n";
         }
         return false;
     }

     $origTitle = MetadataUtils::normalize($origRecord->getTitle(true));
     $cTitle = MetadataUtils::normalize($cRecord->getTitle(true));
     if (!$origTitle || !$cTitle) {
         // No title match without title...
         if ($this->verbose) {
             echo "No title - no further matching\n";
         }
         return false;
     }

     // Compare only the first 255 bytes and normalize by the length of the
     // compared part, so that long titles are not judged more leniently than
     // short ones.
     $origTitleCmp = substr($origTitle, 0, 255);
     $cTitleCmp = substr($cTitle, 0, 255);
     $lev = levenshtein($origTitleCmp, $cTitleCmp);
     $lev = $lev / strlen($origTitleCmp) * 100;
     if ($lev >= 10) {
         if ($this->verbose) {
             echo "--Title lev discard: {$lev}\nOriginal:  {$origTitle}\n" . "Candidate: {$cTitle}\n";
         }
         return false;
     }

     $origAuthor = MetadataUtils::normalize($origRecord->getMainAuthor());
     $cAuthor = MetadataUtils::normalize($cRecord->getMainAuthor());
     $authorLev = 0;
     if ($origAuthor || $cAuthor) {
         // An author on only one side is a mismatch
         if (!$origAuthor || !$cAuthor) {
             if ($this->verbose) {
                 echo "\nAuthor discard:\nOriginal:  {$origAuthor}\n" . "Candidate: {$cAuthor}\n";
             }
             return false;
         }
         if (!MetadataUtils::authorMatch($origAuthor, $cAuthor)) {
             // levenshtein() counts bytes, so the denominator must be a byte
             // length of the same truncated string (not a character count of
             // the full one).
             $origAuthorCmp = substr($origAuthor, 0, 255);
             $cAuthorCmp = substr($cAuthor, 0, 255);
             $authorLev = levenshtein($origAuthorCmp, $cAuthorCmp);
             $authorLev = $authorLev / strlen($origAuthorCmp) * 100;
             if ($authorLev > 20) {
                 if ($this->verbose) {
                     echo "\nAuthor lev discard (lev: {$lev}, authorLev: " . "{$authorLev}):\nOriginal:  {$origAuthor}\n" . "Candidate: {$cAuthor}\n";
                 }
                 return false;
             }
         }
     }

     if ($this->verbose) {
         echo "\nTitle match (lev: {$lev}, authorLev: {$authorLev}):\n";
         echo $origRecord->getFullTitle() . "\n";
         echo "   {$origAuthor} - {$origTitle}.\n";
         echo $cRecord->getFullTitle() . "\n";
         echo "   {$cAuthor} - {$cTitle}.\n";
     }
     // We have a match!
     return true;
 }