示例#1
0
/**
 * Convenience wrapper around the Normalize class.
 *
 * Loads the class file on first use, creates a fresh Normalize instance with
 * its 'debug_flag' setting switched off, and hands the string to its
 * normalize() method.
 *
 * @param string $string Text to normalize.
 * @return mixed Whatever Normalize::normalize() returns for $string.
 */
function normalize($string)
{
    require_once 'classes/class.normalize.php';

    $normalizer = new Normalize();
    $normalizer->set('debug_flag', false);

    return $normalizer->normalize($string);
}
示例#2
0
 /**
  * Function: compare_auth
  * Purpose: Compares two authority strings
  * Author: Tony Rees (Tony.Rees@csiro.au)
  * Date created: March 2008
  * Inputs: authority string 1 as auth1, authority string 2 as auth2
  * Outputs: Numeric similarity value of the 2 strings using weighted n-gram analysis,
  *    on 0-1 scale (1 = identical - typically after normalization; 0 = no similarity)
  * Remarks:
  *   (1) Invokes "normalize_auth" (on a Normalize instance) on both strings, to compare
  *         only normalized versions of the same
  *   (2) Returns blend of 2/3 bigram, 1/3 trigram similarity (bigrams better correspond
  *         to intuitive similarity, however are insensitive to word order, i.e. "Smith et
  *         Jones" = "Jones et Smith" without some trigram contribution)
  *   (3) Returns blend of 50% similarity with, and 50% without, stripping of diacritical
  *         marks - so that the contribution of the latter is reduced but not eliminated
  *   (4) Is case insensitive (i.e. "de Saedeleer" = "De Saedeleer", etc.)
  *   (5) Threshold between low / possible / good match is in the area of
  *         0-0.3 / 0.3-0.5 / 0.5+.
  *   (6) Side effects: stores both arguments in $this->input and appends trace lines
  *         to $this->debug['compare_auth'].
  * @param string $auth1 : authority string 1
  * @param string $auth2 : authority string 2
  * @return number|null : between 0 - 1, rounded to 4 decimals (1 = identical - typically
  *    after normalization; 0 = no similarity); NULL when either argument is loosely equal to NULL
  */
 public function compare_auth($auth1 = NULL, $auth2 = NULL)
 {
     // Loose comparison is kept on purpose: NULL, '' and anything else that is
     // loosely equal to NULL short-circuits to NULL, exactly as before.
     if ($auth1 == NULL || $auth2 == NULL) {
         return NULL;
     }

     $this->input = array($auth1, $auth2);
     $this->debug['compare_auth'][] = "Args: (auth1:{$auth1}) (auth2:{$auth2})";

     $n = new Normalize($this->db);
     $new_auth1 = $n->normalize_auth($auth1);
     $new_auth2 = $n->normalize_auth($auth2);
     $this->debug['compare_auth'][] = "1 (new_auth1:{$new_auth1}) (new_auth2:{$new_auth2})";

     if ($new_auth1 == $new_auth2) {
         // Identical after normalization: no n-gram work needed.
         $this_auth_match = 1;
         $this->debug['compare_auth'][] = "2a (this_auth_match:{$this_auth_match})";
     } else {
         // create second versions without diacritical marks
         $new_auth1b = $n->utf8_to_ascii($new_auth1);
         $new_auth2b = $n->utf8_to_ascii($new_auth2);

         // weighted ngram comparison, use 67% n=2, 33% n=3
         $temp_auth_match1 = (2 * $this->ngram($new_auth1, $new_auth2, 2) + $this->ngram($new_auth1, $new_auth2, 3)) / 3;

         // use mean of versions with and without diacritical marks (to lessen their effect by 50%);
         // when stripping changed nothing the second score is the first one, so skip the recomputation
         if ($new_auth1 == $new_auth1b && $new_auth2 == $new_auth2b) {
             $temp_auth_match2 = $temp_auth_match1;
         } else {
             $temp_auth_match2 = (2 * $this->ngram($new_auth1b, $new_auth2b, 2) + $this->ngram($new_auth1b, $new_auth2b, 3)) / 3;
         }

         // These two trace lines must be double-quoted: with single quotes the
         // variable names were logged literally instead of their values.
         $this->debug['compare_auth'][] = "2b (temp_auth_match1:{$temp_auth_match1}) (temp_auth_match2:{$temp_auth_match2})";
         $this_auth_match = ($temp_auth_match1 + $temp_auth_match2) / 2;
         $this->debug['compare_auth'][] = "2c (this_auth_match:{$this_auth_match})";
     }

     $this_auth_match = round($this_auth_match, 4);
     $this->debug['compare_auth'][] = "Return: {$this_auth_match}";

     return $this_auth_match;
 }
                    $norm = new Normalize();
                    $nm = new NearMatch();
                    $search_genus_name = $norm->normalize($genus_desc);
                    $near_match_genus = $nm->near_match($genus_desc);
                    $query = sprintf("INSERT INTO `genlist%s` (`GENUS_ID`, `GENUS`, `AUTHORITY`, `GEN_LENGTH`, `NEAR_MATCH_GENUS`, `SEARCH_GENUS_NAME`) VALUES  ('%s','%s','%s',%s,'%s','%s')", mysql_escape_string($postfix), mysql_escape_string($genus_id_desc), mysql_escape_string($genus_desc), mysql_escape_string($authority_desc), mysql_escape_string($genus_length), mysql_escape_string($near_match_genus), mysql_escape_string($search_genus_name));
                    $db->query($query);
                } elseif (trim($data[3]) == '') {
                    // Used to slow down the script for shared hosted sites
                    usleep(20000);
                    // Create Species
                    $species_id_desc = $data[$species_id];
                    $species_desc = $data[$species];
                    $species_length = strlen($species_desc);
                    $genus_id_desc = $data[1];
                    $genus_desc = $master[$data[1]];
                    $norm = new Normalize();
                    $nm = new NearMatch();
                    $search_species_name = $norm->normalize($species_desc);
                    $near_match_species = $nm->near_match($species_desc);
                    $authority_desc = $data[$authority];
                    $query = sprintf("INSERT INTO `splist%s` (`SPECIES_ID`, `GENUS_ORIG`, `SPECIES`, `GENUS_ID`, `AUTHORITY`, `SP_LENGTH`, `NEAR_MATCH_SPECIES`, `SEARCH_SPECIES_NAME`) VALUES ('%s','%s','%s','%s','%s',%s,'%s','%s') ", mysql_escape_string($postfix), mysql_escape_string($species_id_desc), mysql_escape_string($genus_desc), mysql_escape_string($species_desc), mysql_escape_string($genus_id_desc), mysql_escape_string($authority_desc), mysql_escape_string($species_length), mysql_escape_string($near_match_species), mysql_escape_string($search_species_name));
                    $db->query($query);
                }
                unset($query);
            }
            // end while
            fclose($handle);
            print round(memory_get_usage() * 0.0009) . "KB - Final Memory Used<br>";
        }
    }
} else {
示例#4
0
 /**
  * Function to treat the word: reduces it to a phonetic "near match" key.
  *
  * Steps, in order:
  *   1. optionally run the word through Normalize::normalize();
  *   2. replace one leading "soundalike" letter group (only the first table entry
  *      that matches is applied; this test is case-sensitive);
  *   3. set the first letter aside and apply the soundalike replacements to the
  *      remainder (case-insensitively), then put the first letter back;
  *   4. collapse every run of a repeated character into a single character;
  *   5. optionally, when the result is longer than 4 characters, turn a trailing
  *      IS, IM or AS into A.
  *
  * Side effects: $this->input receives the argument, $this->output the result, and
  * each stage appends a trace line to $this->debug['TW'] (plus the Normalize
  * instance's debug data under $this->debug['Normalize'] when normalizing).
  *
  * @param string $str2 : the word to treat
  * @param integer $strip_ending : non-zero -> also unify the variant endings (step 5)
  * @param integer $normalize : 1 -> normalize the word , 0 -> no normalization action
  * @return string : the treated word, or '' when $str2 is empty or whitespace only
  */
 public function treat_word($str2, $strip_ending = 0, $normalize = 1)
 {
     $this->input = $str2;
     $this->output = '';
     if ($str2 == NULL || trim($str2) == '') {
         return '';
     }

     if ($normalize) {
         $n = new Normalize();
         $temp2 = $n->normalize($str2);
         $this->debug['Normalize'][] = $n->debug;
     } else {
         $temp2 = $str2;
     }
     $this->debug['TW'][] = "1 (temp2:{$temp2})";

     // Do some selective replacement on the leading letter/s only: ('soundalikes').
     // Order matters: the first prefix that matches wins and the scan stops.
     // NOTE(review): 'CT' => 'Z' and 'CZ' => 'V' stand out next to the other entries,
     // and 'ph' is the only lowercase prefix - values are kept exactly as they were;
     // confirm against the reference Taxamatch algorithm before changing them.
     $leading_soundalikes = array(
         'AE' => 'E',
         'CN' => 'N',
         'CT' => 'Z',
         'CZ' => 'V',
         'DJ' => 'J',
         'EA' => 'E',
         'EU' => 'U',
         'GN' => 'N',
         'KN' => 'N',
         'MC' => 'MAC',
         'MN' => 'N',
         'OE' => 'E',
         'QU' => 'Q',
         'PS' => 'S',
         'PT' => 'T',
         'TS' => 'S',
         'WR' => 'R',
         'X' => 'Z',
         'ph' => 'f',
     );
     foreach ($leading_soundalikes as $prefix => $replacement) {
         $prefix_length = strlen($prefix);
         if (strncmp($temp2, $prefix, $prefix_length) === 0) {
             $temp2 = $replacement . substr($temp2, $prefix_length);
             break;
         }
     }
     $this->debug['TW'][] = "2 (temp2:{$temp2})";

     // Now keep the leading character, then do selected "soundalike" replacements. The
     // following letters are equated: AE, OE, E, U, Y and I; IA and A are equated;
     // K and C;  Z and S; and H is dropped. Also, A and O are equated, MAC and MC are equated, and SC and S.
     $start_letter = substr($temp2, 0, 1);
     // quarantine the leading letter
     $temp2 = substr($temp2, 1);
     // snip off the leading letter
     $this->debug['TW'][] = "3 (start_letter:{$start_letter}) (temp2:{$temp2})";

     // now do the replacements; with array arguments str_ireplace() applies the pairs
     // one after another, each on the result of the previous one, so the order below
     // (two-letter groups before single letters) is significant
     $temp2 = str_ireplace(
         array('AE', 'IA', 'OE', 'OI', 'SC', 'E', 'O', 'U', 'Y', 'K', 'Z', 'H'),
         array('I', 'A', 'I', 'A', 'S', 'I', 'A', 'I', 'I', 'C', 'S', ''),
         $temp2
     );
     $this->debug['TW'][] = "4 (temp2:{$temp2})";

     //add back the leading letter
     $temp2 = $start_letter . $temp2;
     $this->debug['TW'][] = "5 (temp2:{$temp2})";

     // now drop any repeated characters (AA becomes A, BB or BBB becomes B, etc.).
     // The bound is strictly less than the length: the previous "<=" read one
     // position past the end of the string.
     $result2 = '';
     $length = strlen($temp2);
     for ($i = 0; $i < $length; $i++) {
         $next_char = $temp2[$i];
         if ($i === 0 || $next_char !== substr($result2, -1)) {
             $result2 .= $next_char;
         }
     }
     $this->debug['TW'][] = "6 (result2:{$result2}) (temp2:{$temp2})";

     if (strlen($result2) > 4 && $strip_ending) {
         $this->debug['TW'][] = "7  (result2:{$result2})";
         // deal with variant endings -is (includes -us, -ys, -es), -im (was -um), -as (-os)
         // at end of string or word: translate all to -a.
         // One test is enough: after a replacement the word ends in 'A', so none of
         // the other endings can match afterwards.
         if (in_array(substr($result2, -2), array('IS', 'IM', 'AS'), true)) {
             $result2 = substr($result2, 0, -2) . 'A';
         }
         $this->debug['TW'][] = "7a  (result2:{$result2})";
     }

     $this->debug['TW'][] = "Return: ({$result2})";
     $this->output = $result2;
     return $this->output;
 }
示例#5
0
 // database connection
 $db = select_source($table_name);
 // checks for csv file and creates tables with the postfix
 $postfix = '_' . $table_name;
 if (file_exists('../authorities/' . $sourcefile)) {
     createTables($postfix, $db);
     // import data from the csv
     $first = true;
     $sp_index = 1;
     $handle = fopen('../authorities/' . $sourcefile, "r");
     while (($data = fgetcsv($handle, 1000, ",")) !== FALSE) {
         if ($first) {
             $first = false;
             continue;
         }
         $norm = new Normalize();
         $nm = new NearMatch();
         $genus_desc = $data[$genus];
         $gen_length_desc = strlen($genus_desc);
         #					$genus_array = json_decode(file_get_contents(TAXAMATCH_URL . '?cmd=normalize&str=' . $genus_desc),true);
         #					$search_genus_name_desc = $genus_array['data'];
         $search_genus_name_desc = $norm->normalize($genus_desc);
         #					$near_match_genus_array = json_decode(file_get_contents(TAXAMATCH_URL . '?cmd=near_match&str=' . $genus_desc),true);
         #					$near_match_genus_desc = $near_match_genus_array['data'];
         $near_match_genus_desc = $nm->near_match($genus_desc);
         $species_desc = $data[$species];
         $sp_length_desc = strlen($species_desc);
         #					$species_array = json_decode(file_get_contents(TAXAMATCH_URL . '?cmd=normalize&str=' . $species_desc),true);
         #					$search_species_name_desc = $species_array['data'];
         $search_species_name_desc = $norm->normalize($species_desc);
         #					$near_match_species_array = json_decode(file_get_contents(TAXAMATCH_URL . '?cmd=near_match&str=' . $species_desc),true);
示例#6
0
 /**
  * Removes from two lists of author words the entries that match each other.
  *
  * Every word of the first list is compared with every word of the second list,
  * both run through Normalize::normalize_author_string() first. For each pair:
  *   - equal words, or a prefix match where the shorter (prefix) word has at least
  *     3 characters, remove both words;
  *   - a prefix shorter than 3 characters (an abbreviation) removes only the
  *     abbreviated word;
  *   - otherwise self::match_author_words() is consulted and, when its 'match'
  *     entry is truthy, both words are removed.
  * Prefix tests are case-insensitive.
  *
  * @param array $author_words1 : author words of the first authority
  * @param array $author_words2 : author words of the second authority
  * @return array : array($unique_authors1, $unique_authors2) - the original
  *    (un-normalized) words that were not removed, with their original keys
  */
 public static function remove_duplicate_authors($author_words1, $author_words2)
 {
     $unique_authors1 = $author_words1;
     $unique_authors2 = $author_words2;
     foreach ($author_words1 as $key1 => $author1) {
         $author1 = Normalize::normalize_author_string($author1);
         foreach ($author_words2 as $key2 => $author2) {
             // Both flags describe the current pair only. Resetting $author1_matches
             // here (it used to be reset once per outer iteration) stops a match
             // against one word of list 2 from leaking into the comparisons with the
             // following words and removing them as false duplicates.
             $author1_matches = false;
             $author2_matches = false;
             $author2 = Normalize::normalize_author_string($author2);
             if ($author1 == $author2) {
                 $author1_matches = true;
                 $author2_matches = true;
             } elseif (preg_match("/^" . preg_quote($author1, "/") . "/i", $author2)) {
                 // author1 is a prefix of author2
                 $author1_matches = true;
             } elseif (preg_match("/^" . preg_quote($author2, "/") . "/i", $author1)) {
                 // author2 is a prefix of author1
                 $author2_matches = true;
             }
             // equal or one is contained in the other, so consider it a match for both terms
             if (strlen($author1) >= 3 && $author1_matches || strlen($author2) >= 3 && $author2_matches || $author1 == $author2) {
                 unset($unique_authors1[$key1]);
                 unset($unique_authors2[$key2]);
             } elseif ($author1_matches) {
                 // author1 was abbreviation of author2
                 unset($unique_authors1[$key1]);
             } elseif ($author2_matches) {
                 // author2 was abbreviation of author1
                 unset($unique_authors2[$key2]);
             } else {
                 // no match or abbreviation so try a fuzzy match
                 $match = self::match_author_words($author1, $author2);
                 if ($match['match']) {
                     unset($unique_authors1[$key1]);
                     unset($unique_authors2[$key2]);
                 }
             }
         }
     }
     return array($unique_authors1, $unique_authors2);
 }
示例#7
0
require_once 'classes/class.misc.php';
switch ($cmd) {
    case 'normalize':
        require_once 'classes/class.normalize.php';
        $norm = new Normalize();
        $norm->set('debug_flag', $_REQUEST['debug']);
        $data = $norm->normalize($str);
        if ($output == 'xml') {
            $data = $norm->getXML();
        }
        $debug = $norm->debug;
        break;
    case 'normalize_auth':
        require_once 'classes/class.normalize.php';
        $db = select_source($source, $classification);
        $norm = new Normalize($db);
        $norm->set('post_fix', '_' . $source);
        $norm->set('source', $source);
        $data = $norm->normalize_auth($str);
        if ($output == 'xml') {
            $data = $norm->getXML();
        }
        $debug = $norm->debug;
        break;
    case 'treat_word':
        require_once 'classes/class.nearmatch.php';
        $nm = new NearMatch();
        $strip_ending = $strip_ending == '' ? 0 : $strip_ending;
        $normalize = $normalize == '' ? 0 : $normalize;
        $data = $nm->treat_word($str, $strip_ending, $normalize);
        if ($output == 'xml') {
示例#8
0
 /**
  * Stores the watermark margin, e.g. for CSS-like positioning.
  *
  * The arguments are passed to Normalize::margin() and its result is kept in
  * $this->margin.
  *
  * @param  mixed  $x  Either the x position alone or an array holding both values.
  * @param  int    $y  Y position.
  * @return Watermark  This instance, so calls can be chained.
  */
 public function setMargin($x, $y = null)
 {
     $normalizedMargin = Normalize::margin($x, $y);
     $this->margin = $normalizedMargin;

     return $this;
 }