/**
 * Convenience wrapper: runs a string through Normalize::normalize()
 * with the normalizer's debug flag switched off.
 *
 * @param string $string text to normalize
 * @return mixed whatever Normalize::normalize() returns for $string
 */
function normalize($string)
{
    // Loaded lazily so callers only pay for the class when they normalize.
    require_once 'classes/class.normalize.php';

    $normalizer = new Normalize();
    $normalizer->set('debug_flag', false);

    return $normalizer->normalize($string);
}
/**
 * Function: compare_auth
 * Purpose: Compares two authority strings
 * Author: Tony Rees (Tony.Rees@csiro.au)
 * Date created: March 2008
 * Inputs: authority string 1 as auth1, authority string 2 as auth2
 * Outputs: Numeric similarity value of the 2 strings using weighted n-gram analysis,
 *   on 0-1 scale (1 = identical - typically after normalization; 0 = no similarity)
 * Remarks:
 *   (1) Invokes "normalize_auth" on both strings, so only the normalized
 *       versions are compared
 *   (2) Returns blend of 2/3 bigram, 1/3 trigram similarity (bigrams better correspond
 *       to intuitive similarity, however are insensitive to word order, i.e. "Smith et
 *       Jones" = "Jones et Smith" without some trigram contribution)
 *   (3) Returns blend of 50% similarity with, and 50% without, stripping of diacritical
 *       marks - so that the contribution of the latter is reduced but not eliminated
 *   (4) Is case insensitive (i.e. "de Saedeleer" = "De Saedeleer", etc.)
 *   (5) Threshold between low / possible / good match is in the area of
 *       0-0.3 / 0.3-0.5 / 0.5+.
 *
 * Side effects: stores the two arguments in $this->input and appends trace
 * entries to $this->debug['compare_auth'].
 *
 * @param string $auth1 : authority string 1
 * @param string $auth2 : authority string 2
 * @return number|null : between 0 - 1, rounded to 4 decimals
 *   (1 = identical - typically after normalization; 0 = no similarity);
 *   NULL when either argument is missing/empty
 */
public function compare_auth($auth1 = NULL, $auth2 = NULL)
{
    // Loose comparison kept from the original: NULL, '', 0 and false all count as "missing".
    if ($auth1 == NULL || $auth2 == NULL) {
        return NULL;
    }

    $this->input = array($auth1, $auth2);
    $this->debug['compare_auth'][] = "Args: (auth1:{$auth1}) (auth2:{$auth2})";

    $n = new Normalize($this->db);
    $new_auth1 = $n->normalize_auth($auth1);
    $new_auth2 = $n->normalize_auth($auth2);
    $this->debug['compare_auth'][] = "1 (new_auth1:{$new_auth1}) (new_auth2:{$new_auth2})";

    if ($new_auth1 == $new_auth2) {
        // Identical after normalization: no n-gram work needed.
        $this_auth_match = 1;
        $this->debug['compare_auth'][] = "2a (this_auth_match:{$this_auth_match})";
    } else {
        // Second versions of both strings, without diacritical marks.
        $new_auth1b = $n->utf8_to_ascii($new_auth1);
        $new_auth2b = $n->utf8_to_ascii($new_auth2);

        // Weighted n-gram comparison: 67% n=2, 33% n=3.
        $temp_auth_match1 = (2 * $this->ngram($new_auth1, $new_auth2, 2)
            + $this->ngram($new_auth1, $new_auth2, 3)) / 3;

        if ($new_auth1 == $new_auth1b && $new_auth2 == $new_auth2b) {
            // Neither string had diacritics, so the second score would be identical.
            $temp_auth_match2 = $temp_auth_match1;
        } else {
            $temp_auth_match2 = (2 * $this->ngram($new_auth1b, $new_auth2b, 2)
                + $this->ngram($new_auth1b, $new_auth2b, 3)) / 3;
        }
        // Double quotes are required here so the values (not the variable names) are logged.
        $this->debug['compare_auth'][] = "2b (temp_auth_match1:{$temp_auth_match1}) (temp_auth_match2:{$temp_auth_match2})";

        // Mean of the versions with and without diacritics (lessens their effect by 50%).
        $this_auth_match = ($temp_auth_match1 + $temp_auth_match2) / 2;
        $this->debug['compare_auth'][] = "2c (this_auth_match:{$this_auth_match})";
    }

    $this_auth_match = round($this_auth_match, 4);
    $this->debug['compare_auth'][] = "Return: {$this_auth_match}";

    return $this_auth_match;
}
$norm = new Normalize(); $nm = new NearMatch(); $search_genus_name = $norm->normalize($genus_desc); $near_match_genus = $nm->near_match($genus_desc); $query = sprintf("INSERT INTO `genlist%s` (`GENUS_ID`, `GENUS`, `AUTHORITY`, `GEN_LENGTH`, `NEAR_MATCH_GENUS`, `SEARCH_GENUS_NAME`) VALUES ('%s','%s','%s',%s,'%s','%s')", mysql_escape_string($postfix), mysql_escape_string($genus_id_desc), mysql_escape_string($genus_desc), mysql_escape_string($authority_desc), mysql_escape_string($genus_length), mysql_escape_string($near_match_genus), mysql_escape_string($search_genus_name)); $db->query($query); } elseif (trim($data[3]) == '') { // Used to slow down the script for shared hosted sites usleep(20000); // Create Species $species_id_desc = $data[$species_id]; $species_desc = $data[$species]; $species_length = strlen($species_desc); $genus_id_desc = $data[1]; $genus_desc = $master[$data[1]]; $norm = new Normalize(); $nm = new NearMatch(); $search_species_name = $norm->normalize($species_desc); $near_match_species = $nm->near_match($species_desc); $authority_desc = $data[$authority]; $query = sprintf("INSERT INTO `splist%s` (`SPECIES_ID`, `GENUS_ORIG`, `SPECIES`, `GENUS_ID`, `AUTHORITY`, `SP_LENGTH`, `NEAR_MATCH_SPECIES`, `SEARCH_SPECIES_NAME`) VALUES ('%s','%s','%s','%s','%s',%s,'%s','%s') ", mysql_escape_string($postfix), mysql_escape_string($species_id_desc), mysql_escape_string($genus_desc), mysql_escape_string($species_desc), mysql_escape_string($genus_id_desc), mysql_escape_string($authority_desc), mysql_escape_string($species_length), mysql_escape_string($near_match_species), mysql_escape_string($search_species_name)); $db->query($query); } unset($query); } // end while fclose($handle); print round(memory_get_usage() * 0.0009) . "KB - Final Memory Used<br>"; } } } else {
/**
 * Reduces a word to a "sound-alike" key so that spelling variants compare equal.
 *
 * Steps, in order:
 *   1. optionally normalize the word via Normalize::normalize();
 *   2. rewrite selected leading letter groups (first matching rule only);
 *   3. keep the leading letter and apply sound-alike replacements to the rest;
 *   4. collapse runs of repeated characters (AA -> A, BBB -> B);
 *   5. optionally map the endings -IS, -IM, -AS to -A (words longer than 4 only).
 *
 * Side effects: sets $this->input / $this->output and appends trace entries
 * to $this->debug['TW'] (and $this->debug['Normalize'] when normalizing).
 *
 * @param string $str2 word to treat
 * @param integer $strip_ending : non-zero -> unify variant endings, 0 -> leave endings alone
 * @param integer $normalize : 1 -> normalize the word , 0 -> no normalization action
 * @return string treated word; '' when $str2 is empty or only whitespace
 */
public function treat_word($str2, $strip_ending = 0, $normalize = 1)
{
    $this->input = $str2;
    $this->output = '';

    if ($str2 == NULL || trim($str2) == '') {
        return '';
    }

    if ($normalize) {
        $n = new Normalize();
        $temp2 = $n->normalize($str2);
        $this->debug['Normalize'][] = $n->debug;
    } else {
        $temp2 = $str2;
    }
    $this->debug['TW'][] = "1 (temp2:{$temp2})";

    // Selective replacement on the leading letter/s only ('soundalikes').
    // Matching is case-sensitive and only the first matching rule is applied.
    // NOTE(review): 'CT' => 'Z' and 'CZ' => 'V' are kept exactly as found, but they look
    // inconsistent with the published Taxamatch rules (believed to be CT -> T, CZ -> C).
    // Confirm before changing: previously stored near-match keys depend on these values.
    $leading_rules = array(
        'AE' => 'E',
        'CN' => 'N',
        'CT' => 'Z',
        'CZ' => 'V',
        'DJ' => 'J',
        'EA' => 'E',
        'EU' => 'U',
        'GN' => 'N',
        'KN' => 'N',
        'MC' => 'MAC',
        'MN' => 'N',
        'OE' => 'E',
        'QU' => 'Q',
        'PS' => 'S',
        'PT' => 'T',
        'TS' => 'S',
        'WR' => 'R',
        'X'  => 'Z',
        'ph' => 'f',
    );
    foreach ($leading_rules as $prefix => $replacement) {
        $prefix_length = strlen($prefix);
        if (strncmp($temp2, $prefix, $prefix_length) === 0) {
            $temp2 = $replacement . substr($temp2, $prefix_length);
            break;
        }
    }
    $this->debug['TW'][] = "2 (temp2:{$temp2})";

    // Keep the leading character out of the sound-alike replacements below.
    $start_letter = substr($temp2, 0, 1);
    $temp2 = substr($temp2, 1);
    $this->debug['TW'][] = "3 (start_letter:{$start_letter}) (temp2:{$temp2})";

    // Sound-alike replacements, applied left to right, each on the result of the previous
    // one: AE, OE, E, U, Y -> I; IA, OI, O -> A; SC, Z -> S; K -> C; H is dropped.
    $temp2 = str_ireplace(
        array('AE', 'IA', 'OE', 'OI', 'SC', 'E', 'O', 'U', 'Y', 'K', 'Z', 'H'),
        array('I',  'A',  'I',  'A',  'S',  'I', 'A', 'I', 'I', 'C', 'S', ''),
        $temp2
    );
    $this->debug['TW'][] = "4 (temp2:{$temp2})";

    // Add back the leading letter.
    $temp2 = $start_letter . $temp2;
    $this->debug['TW'][] = "5 (temp2:{$temp2})";

    // Drop repeated characters (AA becomes A, BB or BBB becomes B, etc.).
    // Byte-wise and case-sensitive, like the character loop this replaces; unlike that
    // loop it never reads past the end of the string and always yields a string.
    $result2 = (string) preg_replace('/(.)\1+/s', '$1', (string) $temp2);
    $this->debug['TW'][] = "6 (result2:{$result2}) (temp2:{$temp2})";

    if ($strip_ending && strlen($result2) > 4) {
        $this->debug['TW'][] = "7 (result2:{$result2})";
        // Variant endings -is (includes -us, -ys, -es), -im (was -um), -as (-os)
        // at the very end of the string are all translated to -a.
        $result2 = preg_replace('/(?:IS|IM|AS)\z/', 'A', $result2);
        $this->debug['TW'][] = "7a (result2:{$result2})";
    }

    $this->debug['TW'][] = "Return: ({$result2})";
    $this->output = $result2;

    return $this->output;
}
// database connection $db = select_source($table_name); // checks for csv file and creates tables with the postfix $postfix = '_' . $table_name; if (file_exists('../authorities/' . $sourcefile)) { createTables($postfix, $db); // import data from the csv $first = true; $sp_index = 1; $handle = fopen('../authorities/' . $sourcefile, "r"); while (($data = fgetcsv($handle, 1000, ",")) !== FALSE) { if ($first) { $first = false; continue; } $norm = new Normalize(); $nm = new NearMatch(); $genus_desc = $data[$genus]; $gen_length_desc = strlen($genus_desc); # $genus_array = json_decode(file_get_contents(TAXAMATCH_URL . '?cmd=normalize&str=' . $genus_desc),true); # $search_genus_name_desc = $genus_array['data']; $search_genus_name_desc = $norm->normalize($genus_desc); # $near_match_genus_array = json_decode(file_get_contents(TAXAMATCH_URL . '?cmd=near_match&str=' . $genus_desc),true); # $near_match_genus_desc = $near_match_genus_array['data']; $near_match_genus_desc = $nm->near_match($genus_desc); $species_desc = $data[$species]; $sp_length_desc = strlen($species_desc); # $species_array = json_decode(file_get_contents(TAXAMATCH_URL . '?cmd=normalize&str=' . $species_desc),true); # $search_species_name_desc = $species_array['data']; $search_species_name_desc = $norm->normalize($species_desc); # $near_match_species_array = json_decode(file_get_contents(TAXAMATCH_URL . '?cmd=near_match&str=' . $species_desc),true);
/**
 * Removes from two author-word lists every word that has a counterpart in the other list.
 *
 * Each word of list 1 is compared (after Normalize::normalize_author_string()) with each
 * word of list 2. For one pair:
 *   - equal words, or a prefix match where the shorter word has at least 3 characters,
 *     remove both words;
 *   - a shorter prefix match removes only the abbreviated word;
 *   - otherwise self::match_author_words() decides; a positive 'match' removes both.
 *
 * @param array $author_words1 author words of the first authority
 * @param array $author_words2 author words of the second authority
 * @return array two-element array: the unmatched words of list 1 and of list 2,
 *   as originally passed in (not normalized) and with their original keys
 */
public static function remove_duplicate_authors($author_words1, $author_words2)
{
    $unique_authors1 = $author_words1;
    $unique_authors2 = $author_words2;

    // Normalize list 2 once up front instead of once per (author1, author2) pair.
    $normalized_authors2 = array();
    foreach ($author_words2 as $key2 => $author2) {
        $normalized_authors2[$key2] = Normalize::normalize_author_string($author2);
    }

    foreach ($author_words1 as $key1 => $author1) {
        $author1 = Normalize::normalize_author_string($author1);

        foreach ($normalized_authors2 as $key2 => $author2) {
            // Both flags describe THIS pair only and must be reset for every pair;
            // a stale $author1_matches would wrongly remove later, unrelated authors.
            $author1_matches = false;
            $author2_matches = false;

            if ($author1 == $author2) {
                $author1_matches = true;
                $author2_matches = true;
            } elseif (preg_match("/^" . preg_quote($author1, "/") . "/i", $author2)) {
                // author1 is a leading part (abbreviation) of author2
                $author1_matches = true;
            } elseif (preg_match("/^" . preg_quote($author2, "/") . "/i", $author1)) {
                // author2 is a leading part (abbreviation) of author1
                $author2_matches = true;
            }

            if ((strlen($author1) >= 3 && $author1_matches)
                || (strlen($author2) >= 3 && $author2_matches)
                || $author1 == $author2
            ) {
                // equal or one is contained in the other, so consider it a match for both terms
                unset($unique_authors1[$key1]);
                unset($unique_authors2[$key2]);
            } elseif ($author1_matches) {
                // author1 was a (short) abbreviation of author2
                unset($unique_authors1[$key1]);
            } elseif ($author2_matches) {
                // author2 was a (short) abbreviation of author1
                unset($unique_authors2[$key2]);
            } else {
                // no match or abbreviation so try a fuzzy match
                $match = self::match_author_words($author1, $author2);
                if ($match['match']) {
                    unset($unique_authors1[$key1]);
                    unset($unique_authors2[$key2]);
                }
            }
        }
    }

    return array($unique_authors1, $unique_authors2);
}
require_once 'classes/class.misc.php'; switch ($cmd) { case 'normalize': require_once 'classes/class.normalize.php'; $norm = new Normalize(); $norm->set('debug_flag', $_REQUEST['debug']); $data = $norm->normalize($str); if ($output == 'xml') { $data = $norm->getXML(); } $debug = $norm->debug; break; case 'normalize_auth': require_once 'classes/class.normalize.php'; $db = select_source($source, $classification); $norm = new Normalize($db); $norm->set('post_fix', '_' . $source); $norm->set('source', $source); $data = $norm->normalize_auth($str); if ($output == 'xml') { $data = $norm->getXML(); } $debug = $norm->debug; break; case 'treat_word': require_once 'classes/class.nearmatch.php'; $nm = new NearMatch(); $strip_ending = $strip_ending == '' ? 0 : $strip_ending; $normalize = $normalize == '' ? 0 : $normalize; $data = $nm->treat_word($str, $strip_ending, $normalize); if ($output == 'xml') {
/**
 * Defines the margin applied to the watermark (handy for CSS-like positioning).
 *
 * The arguments are handed to Normalize::margin() and its result is stored
 * in $this->margin.
 *
 * @param mixed $x Either the x position alone, or an array holding both values.
 * @param int $y Y position.
 * @return Watermark The same instance, so calls can be chained.
 */
public function setMargin($x, $y = null)
{
    $normalizedMargin = Normalize::margin($x, $y);
    $this->margin = $normalizedMargin;

    return $this;
}