/**
 * Compares two authority strings and scores their similarity.
 *
 * Function: compare_auth
 * Author: Tony Rees (Tony.Rees@csiro.au)
 * Date created: March 2008
 *
 * Inputs: authority string 1 as auth1, authority string 2 as auth2
 * Outputs: Numeric similarity value of the 2 strings using weighted n-gram
 *   analysis, on a 0-1 scale (1 = identical - typically after normalization;
 *   0 = no similarity), rounded to 4 decimal places.
 *
 * Remarks:
 * (1) Invokes "normalize_auth" on both strings, so that only the normalized
 *     versions are compared.
 * (2) Returns a blend of 2/3 bigram, 1/3 trigram similarity (bigrams better
 *     correspond to intuitive similarity, however they are insensitive to
 *     word order, i.e. "Smith et Jones" = "Jones et Smith" without some
 *     trigram contribution).
 * (3) Returns a blend of 50% similarity with, and 50% without, stripping of
 *     diacritical marks - so that the contribution of the latter is reduced
 *     but not eliminated.
 * (4) Is case insensitive (i.e. "de Saedeleer" = "De Saedeleer", etc.).
 * (5) Threshold between low / possible / good match is in the area of
 *     0-0.3 / 0.3-0.5 / 0.5+.
 *
 * Side effects: stores the raw arguments in $this->input and appends trace
 * messages to $this->debug['compare_auth'].
 *
 * @param string $auth1 : authority string 1
 * @param string $auth2 : authority string 2
 * @return float|null : between 0 - 1 (1 = identical - typically after
 *   normalization; 0 = no similarity); NULL when either argument is
 *   missing or empty
 */
public function compare_auth($auth1 = NULL, $auth2 = NULL)
{
    // Loose comparison is kept on purpose: it treats NULL and '' (and other
    // values that loosely equal NULL) as "no input", which callers may rely on.
    if ($auth1 == NULL || $auth2 == NULL) {
        return NULL;
    }

    $this->input = array($auth1, $auth2);
    $this->debug['compare_auth'][] = "Args: (auth1:{$auth1}) (auth2:{$auth2})";

    $n = new Normalize($this->db);
    $new_auth1 = $n->normalize_auth($auth1);
    $new_auth2 = $n->normalize_auth($auth2);
    $this->debug['compare_auth'][] = "1 (new_auth1:{$new_auth1}) (new_auth2:{$new_auth2})";

    // Compare as strings with ===. A loose == would compare numeric-looking
    // strings numerically (e.g. "1E3" == "1000") and report a false match.
    if ((string) $new_auth1 === (string) $new_auth2) {
        $this_auth_match = 1;
        $this->debug['compare_auth'][] = "2a (this_auth_match:{$this_auth_match})";
    } else {
        // Second versions of both strings, without diacritical marks.
        $new_auth1b = $n->utf8_to_ascii($new_auth1);
        $new_auth2b = $n->utf8_to_ascii($new_auth2);

        // Weighted n-gram comparison: 67% n=2, 33% n=3.
        $temp_auth_match1 = (2 * $this->ngram($new_auth1, $new_auth2, 2)
            + $this->ngram($new_auth1, $new_auth2, 3)) / 3;

        $unchanged_by_stripping = (string) $new_auth1 === (string) $new_auth1b
            && (string) $new_auth2 === (string) $new_auth2b;

        if ($unchanged_by_stripping) {
            // Neither string contained diacritics: the second score would be
            // identical, so skip recomputing it.
            $temp_auth_match2 = $temp_auth_match1;
        } else {
            $temp_auth_match2 = (2 * $this->ngram($new_auth1b, $new_auth2b, 2)
                + $this->ngram($new_auth1b, $new_auth2b, 3)) / 3;
        }
        // Double quotes are required here so the values are interpolated.
        $this->debug['compare_auth'][] = "2b (temp_auth_match1:{$temp_auth_match1}) (temp_auth_match2:{$temp_auth_match2})";

        // Mean of the versions with and without diacritical marks, to lessen
        // the effect of diacritics by 50%.
        $this_auth_match = ($temp_auth_match1 + $temp_auth_match2) / 2;
        $this->debug['compare_auth'][] = "2c (this_auth_match:{$this_auth_match})";
    }

    $this_auth_match = round($this_auth_match, 4);
    $this->debug['compare_auth'][] = "Return: {$this_auth_match}";

    return $this_auth_match;
}
require_once 'classes/class.normalize.php'; $norm = new Normalize(); $norm->set('debug_flag', $_REQUEST['debug']); $data = $norm->normalize($str); if ($output == 'xml') { $data = $norm->getXML(); } $debug = $norm->debug; break; case 'normalize_auth': require_once 'classes/class.normalize.php'; $db = select_source($source, $classification); $norm = new Normalize($db); $norm->set('post_fix', '_' . $source); $norm->set('source', $source); $data = $norm->normalize_auth($str); if ($output == 'xml') { $data = $norm->getXML(); } $debug = $norm->debug; break; case 'treat_word': require_once 'classes/class.nearmatch.php'; $nm = new NearMatch(); $strip_ending = $strip_ending == '' ? 0 : $strip_ending; $normalize = $normalize == '' ? 0 : $normalize; $data = $nm->treat_word($str, $strip_ending, $normalize); if ($output == 'xml') { $data = $nm->getXML(); } $debug = $nm->debug;