コード例 #1
0
ファイル: class.taxamatch.php プロジェクト: nmatasci/TNRS
 /**
  * Function: compare_auth
  * Purpose: Compares two authority strings
  * Author: Tony Rees (Tony.Rees@csiro.au)
  * Date created: March 2008
  * Inputs: authority string 1 as auth1, authority string 2 as auth2
  * Outputs: Numeric similarity value of the 2 strings using weighted n-gram analysis,
  *    on 0-1 scale (1 = identical - typically after normalization; 0 = no similarity),
  *    rounded to 4 decimal places; NULL when either input is missing/empty
  * Remarks:
  *   (1) Invokes function "normalize_auth" on both strings, to compare only normalized
  *         versions of the same
  *   (2) Returns blend of 2/3 bigram, 1/3 trigram similarity (bigrams better correspond
  *         to intuitive similarity, however are insensitive to word order, i.e. "Smith et
  *         Jones" = "Jones et Smith" without some trigram contribution)
  *   (3) Returns blend of 50% similarity with, and 50% without, stripping of diacritical
  *         marks - so that the contribution of the latter is reduced but not eliminated
  *   (4) Is case insensitive (i.e. "de Saedeleer" = "De Saedeleer", etc.)
  *   (5) Threshold between low / possible / good match is in the area of
  *         0-0.3 / 0.3-0.5 / 0.5+.
  *   (6) Side effects: stores both raw inputs in $this->input and appends a trace of
  *         each step to $this->debug['compare_auth'].
  * @param string $auth1 : authority string 1
  * @param string $auth2 : authority string 2
  * @return number|null : between 0 - 1 (1 = identical - typically after normalization;
  *    0 = no similarity), or NULL when either argument is loosely equal to NULL
  */
 public function compare_auth($auth1 = NULL, $auth2 = NULL)
 {
     // Loose comparison kept on purpose: NULL and '' (anything == NULL) short-circuit,
     // exactly as before, so existing callers see the same NULL result.
     if ($auth1 == NULL || $auth2 == NULL) {
         return NULL;
     }

     $this->input = array($auth1, $auth2);
     $this->debug['compare_auth'][] = "Args: (auth1:{$auth1}) (auth2:{$auth2})";

     $n = new Normalize($this->db);
     $new_auth1 = $n->normalize_auth($auth1);
     $new_auth2 = $n->normalize_auth($auth2);
     $this->debug['compare_auth'][] = "1 (new_auth1:{$new_auth1}) (new_auth2:{$new_auth2})";

     if ($new_auth1 == $new_auth2) {
         // Normalized forms agree: perfect match, no n-gram work needed.
         $this_auth_match = 1;
         $this->debug['compare_auth'][] = "2a (this_auth_match:{$this_auth_match})";
     } else {
         // Second versions with diacritical marks stripped.
         $new_auth1b = $n->utf8_to_ascii($new_auth1);
         $new_auth2b = $n->utf8_to_ascii($new_auth2);

         // Weighted n-gram comparison: 67% n=2, 33% n=3.
         $temp_auth_match1 = (2 * $this->ngram($new_auth1, $new_auth2, 2) + $this->ngram($new_auth1, $new_auth2, 3)) / 3;

         if ($new_auth1 == $new_auth1b && $new_auth2 == $new_auth2b) {
             // Stripping changed nothing, so the second score would be identical; reuse it.
             $temp_auth_match2 = $temp_auth_match1;
         } else {
             $temp_auth_match2 = (2 * $this->ngram($new_auth1b, $new_auth2b, 2) + $this->ngram($new_auth1b, $new_auth2b, 3)) / 3;
         }
         // Double quotes so the values (not the variable names) reach the debug log.
         $this->debug['compare_auth'][] = "2b (temp_auth_match1:{$temp_auth_match1}) (temp_auth_match2:{$temp_auth_match2})";

         // Mean of the versions with and without diacritical marks (lessens their effect by 50%).
         $this_auth_match = ($temp_auth_match1 + $temp_auth_match2) / 2;
         $this->debug['compare_auth'][] = "2c (this_auth_match:{$this_auth_match})";
     }

     $this_auth_match = round($this_auth_match, 4);
     $this->debug['compare_auth'][] = "Return: {$this_auth_match}";
     return $this_auth_match;
 }
コード例 #2
0
ファイル: taxamatch.php プロジェクト: kyleLesack/TNRS
     require_once 'classes/class.normalize.php';
     $norm = new Normalize();
     $norm->set('debug_flag', $_REQUEST['debug']);
     $data = $norm->normalize($str);
     if ($output == 'xml') {
         $data = $norm->getXML();
     }
     $debug = $norm->debug;
     break;
 case 'normalize_auth':
     // Normalize the authority string in $str with a Normalize instance built for the selected source.
     require_once 'classes/class.normalize.php';
     // select_source() supplies the handle handed to Normalize
     // (presumably a database connection - confirm against its definition).
     $db = select_source($source, $classification);
     $norm = new Normalize($db);
     // Configure the normalizer: 'post_fix' is "_" followed by the source name, 'source' is the name itself.
     $norm->set('post_fix', '_' . $source);
     $norm->set('source', $source);
     $data = $norm->normalize_auth($str);
     // When XML output is requested, replace the normalize_auth() result with the normalizer's getXML() output.
     if ($output == 'xml') {
         $data = $norm->getXML();
     }
     // Copy the normalizer's debug data into $debug.
     $debug = $norm->debug;
     break;
 case 'treat_word':
     require_once 'classes/class.nearmatch.php';
     $nm = new NearMatch();
     $strip_ending = $strip_ending == '' ? 0 : $strip_ending;
     $normalize = $normalize == '' ? 0 : $normalize;
     $data = $nm->treat_word($str, $strip_ending, $normalize);
     if ($output == 'xml') {
         $data = $nm->getXML();
     }
     $debug = $nm->debug;