/**
 * Parse a taxonomic name string and run exact, then fuzzy, matching on it.
 *
 * Processing order, as implemented below:
 *  1. Reset all per-search state on the object.
 *  2. Clean the input: "+" and tab become spaces (so "Homo+sapiens" can be
 *     passed through an HTTP GET request), runs of spaces are collapsed,
 *     a leading non-alphabetic prefix is split off, cf./aff./"?" annotations
 *     and "indet."/"sp."-style placeholders are removed.
 *  3. Split off a leading family token and, unless parse_only is set,
 *     look it up exactly and fuzzily (results go to saveFamilyMatches()).
 *  4. Re-hyphenate a space-separated epithet pair when the hyphenated form is
 *     reported by checkSpecificEpithet().
 *  5. Fix the case of the leading tokens and split the text into genus,
 *     species, authorities and up to two infraspecific ranks via Splitter
 *     (skipped when chop_overload is set).
 *  6. Unless parse_only is set: optionally prepare the cache, try an exact
 *     scientific-name lookup, and otherwise cascade fuzzy matching
 *     genus -> species -> infraspecies 1 -> infraspecies 2, recording every
 *     hit through the save*Matches() methods.
 *
 * Legacy remarks carried over from the original header (not verifiable from
 * this method alone - confirm against the README before relying on them):
 *  - base data was described as living in genus, species and
 *    authority-abbreviation tables, with matches written to per-session
 *    temporary tables;
 *  - subsidiary routines named there: normalize, normalize_auth,
 *    reduce_spaces, ngram, compare_auth, near_match, mdld.
 *
 * @param string  $searchtxt   genus, genus+species, or genus+species+authority
 * @param string  $search_mode normal (default) / rapid / no_shaping; stored on
 *                             the object and forwarded to the candidate queries
 * @param boolean $cache       when true (and the object's cache_flag is on) an
 *                             existing cache entry short-circuits the matching
 * @return boolean false when the input is empty after whitespace clean-up,
 *                 true in every other case
 */
public function process($searchtxt, $search_mode = 'normal', $cache = false) {
    $this->input = $searchtxt;
    $this->search_mode = $search_mode;
    $this->searchtxt = $searchtxt;
    $this->debug['process'][] = "1 (searchtxt:{$searchtxt}) (search_mode:{$search_mode})";

    // Reset everything a previous call may have left behind.
    $this->this_search_family = '';
    $this->this_search_genus = '';
    $this->this_search_species = '';
    $this->this_authority = '';
    $this->this_authorities = array();
    $this->this_search_infra1 = '';
    $this->this_search_infra2 = '';
    $this->this_search_rank1 = '';
    $this->this_search_rank2 = '';
    $this->this_start_string = '';
    $this->this_cleaned_txt = '';
    $this->this_family_string = '';
    $this->this_family_unmatched = '';
    $this->this_status_string = '';

    $text_str = $searchtxt;

    // Accept "+" as separator if supplied, transform to space.
    if (strpos($text_str, '+') !== false) {
        $text_str = str_replace('+', ' ', $text_str);
    }
    #$replace=array("%", "<", "{", "}", "&", "_", "\t");
    $replace = array("\t");
    $text_str = str_replace($replace, ' ', $text_str);
    if (strpos($text_str, ' ') !== false) {
        $text_str = preg_replace("/ {2,}/", ' ', $text_str);
    }
    $text_str = trim($text_str);
    $this->debug['process'][] = "1a (text_str:{$text_str})";

    if (is_null($text_str) || $text_str == '') {
        $this->debug['process'][] = "2 Return(false)";
        return false;
    }

    // Split off a leading run of non-alphabetic characters.
    // FIX: the match is anchored at the start, so cut exactly that prefix.
    // The previous str_replace() also deleted every later occurrence of the
    // same characters (e.g. a leading "(" stripped the "(" of "(L.)").
    // The (string) cast covers old PHP versions where substr() returns false
    // when the prefix is the whole string.
    if (preg_match('/^[^[:alpha:]]+/u', $text_str, $start_matches)) {
        $text_str = (string) substr($text_str, strlen($start_matches[0]));
        $this->this_start_string = $start_matches[0];
    }

    // Remember and remove a cf./aff./"vel sp. aff."/"?" annotation.
    if (preg_match("/(?:(?:\\s|^)(?:\\-?cf\\.?|vel\\.? sp\\.? aff\\.?|\\-?aff\\.?)(?:\\s|\$))|(?:\\?+)/i", $text_str, $anno_matches)) {
        $text_str = trim(str_replace($anno_matches[0], ' ', $text_str));
        $this->this_status_string = trim($anno_matches[0]);
    }

    // Close up spaces around hyphens.
    $text_str = str_replace(' -', '-', $text_str);
    $text_str = str_replace('- ', '-', $text_str);
    $this->this_preprocessed_txt = $text_str;

    // Drop "indet.", "unknown", "sp. nov.", "sp." style placeholders.
    $text_str = preg_replace("/(?<=\\s|^)(?:\\S*[^[:alpha:][:space:]])?(indeterminad[ao]|undetermined|unknown|indet\\.?|sp\\.?\\s+nov\\.?|sp\\.?)(?:[^[:alpha:][:space:]]\\S*)?(?=\\s|\$)/i", ' ', $text_str);
    if (strpos($text_str, ' ') !== false) {
        $text_str = preg_replace("/ {2,}/", ' ', $text_str);
    }
    $text_str = trim($text_str);

    // Leading family name ("...aceae", a conserved family name, or the bare
    // word "fam"/"family").
    if (preg_match('/^(((?:[[:alpha:]]+aceae)|Cruciferae|Guttiferae|Umbelliferae|Compositae|Leguminosae|Palmae|Labiatae|Gramineae|Mimosoideae|Papilionoideae|Caesalpinioideae|fam(?:ily)?)((?:[^[:alpha:][:space:]]\\S*)?))(?=\\s+|$)/i', $text_str, $fam_matches)) {
        // FIX: remove only the matched prefix (see the note on the leading
        // prefix above); str_replace() removed every occurrence of the token.
        $text_str = trim((string) substr($text_str, strlen($fam_matches[0])));
        $this->this_family_string = $fam_matches[1];
        #$this->this_search_family=$fam_matches[2];
        $this->this_search_family = mb_strtoupper(mb_substr($fam_matches[2], 0, 1)) . mb_strtolower(mb_substr($fam_matches[2], 1));
        $this->this_family_unmatched = $fam_matches[3];

        // A bare "fam"/"family" is a label, not a name to search for.
        if (preg_match("/^fam(ily)?\$/i", $this->this_search_family)) {
            $this->this_family_unmatched = $this->this_search_family . $this->this_family_unmatched;
            $this->this_search_family = '';
        }

        if (!$this->parse_only && $this->this_search_family) {
            // Exact family hits.
            $searchFamilyName = $this->db->searchFamilyName($this->this_search_family);
            if (isset($searchFamilyName)) {
                foreach ($searchFamilyName as $returnedFamilyName) {
                    $this->saveFamilyMatches($returnedFamilyName->nameID, $this->this_search_family, 0, 'Y');
                }
            }

            // Fuzzy family hits.
            $nm = new NearMatch();
            $this_near_match_family = $nm->near_match($this->this_search_family);
            $this_family_start = substr($this->this_search_family, 0, 3);
            $this_family_length = strlen($this->this_search_family);
            $family_res = $this->db->family_cur($this->search_mode, $this_near_match_family, $this_family_length, $this_family_start);
            // FIX: !empty() instead of count() so a null result is skipped
            // rather than raising a warning/TypeError.
            if (!empty($family_res)) {
                foreach ($family_res as $drec) {
                    $family_match = $this->match_family($this->this_search_family, $drec->search_family_name);
                    if ($family_match['match']) {
                        $phonetic_flag = $family_match['phonetic_match'] ? 'Y' : null;
                        $this->saveFamilyMatches($drec->family_id, $drec->family, $family_match['edit_distance'], $phonetic_flag);
                    }
                } // end foreach
            }
        }
    }

    // Interactive "quit" tokens: nothing to match.
    if ($text_str == 'exit' || $text_str == 'end' || $text_str == 'q' || $text_str == '.') {
        return true;
    }

    // Unhyphenated trinomial: if "second-third" is reported as a specific
    // epithet, put the hyphen back.
    if (preg_match('/^([[:alpha:]]+) ([[:alpha:]]+)[\\.\\s]([[:alpha:]]+)(.*)/', $text_str, $matches)) {
        $specific_epithet_str = "{$matches['2']}-{$matches['3']}";
        $check_res = $this->db->checkSpecificEpithet(array($specific_epithet_str));
        foreach ($check_res as $ck) {
            if ($ck->count > 0 && mb_strtolower($ck->specificEpithet) == mb_strtolower($specific_epithet_str)) {
                $text_str = str_ireplace_first("{$matches['2']} {$matches['3']}", "{$matches['2']}-{$matches['3']}", $text_str);
                $this->this_preprocessed_txt = str_ireplace_first("{$matches['2']} {$matches['3']}", "{$matches['2']}-{$matches['3']}", $this->this_preprocessed_txt);
            }
        }
    }
    $this->this_cleaned_txt = $text_str;

    // Case clean-up of the leading purely alphabetic tokens: capitalise the
    // first token, lower-case following ALL-CAPS tokens, and stop at the first
    // alphabetic token that is not all upper case.
    $token = explode(" ", $text_str);
    for ($i = 0; $i < count($token); $i++) {
        if (preg_match('/^[[:alpha:]]+\\.?$/u', $token[$i])) {
            if ($i == 0) {
                $token[$i] = mb_strtoupper(mb_substr($token[$i], 0, 1)) . mb_strtolower(mb_substr($token[$i], 1));
            } elseif (mb_strtoupper($token[$i]) == $token[$i]) {
                $token[$i] = mb_strtolower($token[$i]);
            } else {
                break;
            }
        }
    }
    $text_str = implode(" ", $token);

    // Clearing the temporary tables
    //$this->db->clearTempTables();

    // Normalizing the search text
    $n = new Normalize($this->db);
    $this->debug['process'][] = "3 (text_str:{$text_str})";

    // FIX: these locals are read further down (cache key, debug output, fuzzy
    // matching) but were only assigned inside the branch below, leaving them
    // undefined whenever chop_overload is set.
    $this_search_genus = '';
    $this_search_species = '';
    $this_authority = '';

    if (!$this->chop_overload) {
        // Split the cleaned text into genus, species, authorities and
        // infraspecific parts.
        $splitter = new Splitter($n, $text_str);
        $this->this_search_genus = $this_search_genus = $splitter->get('genus');
        $this->this_search_species = $this_search_species = $splitter->get('species');
        $this->this_authorities = $splitter->get('authors');
        $this->this_authority = $this_authority = end($this->this_authorities);

        // "gen"/"genus" and "sp"/"spp"/"species" are placeholders, not names.
        if (preg_match("/^gen(us)?\$/i", $this->this_search_genus)) {
            $this->this_search_genus = '';
        }
        if (preg_match("/^sp(p|ecies)?\$/i", $this->this_search_species)) {
            $this->this_search_species = '';
        }

        // Each infraspecies entry is read as [rank, epithet]; a rank of 'n/a'
        // is treated as "no rank given".
        $infraspecies = $splitter->get('infraspecies');
        if (isset($infraspecies)) {
            if (isset($infraspecies[0])) {
                if ($infraspecies[0][0] != 'n/a') {
                    $this->this_search_rank1 = $infraspecies[0][0];
                }
                $this->this_search_infra1 = $infraspecies[0][1];
            }
            if (isset($infraspecies[1])) {
                if ($infraspecies[1][0] != 'n/a') {
                    $this->this_search_rank2 = $infraspecies[1][0];
                }
                $this->this_search_infra2 = $infraspecies[1][1];
            }
        }
        if (NAME_PARSER == 'gni') {
            $this->gni_parser_result = $splitter->parsed_response;
        }
    }

    if ($this->parse_only) {
        return true;
    }

    // cache_flag switch determines if caching is allowed for the source.
    // FIX: when no genus was parsed, no cache key/path can be built. The
    // original still passed the undefined variables to mkdir_recursive(),
    // Cache and setKey(); the cache is now only set up (and only consulted)
    // when a key exists.
    $cache_ready = false;
    if ($this->cache_flag == true) {
        $cache_key = null;
        $cache_path = null;
        if ($this_search_genus != '' && $this_search_species != '' && $this_authority != '') {
            $cache_key = $this_search_genus . '-' . $this_search_species . '-' . $this_authority . '_' . $search_mode;
            $cache_path = $this->cache_path . $this->db->source . "/authority/";
        } elseif ($this_search_genus != '' && $this_search_species != '') {
            $cache_key = $this_search_genus . '-' . $this_search_species . '_' . $search_mode;
            $cache_path = $this->cache_path . $this->db->source . "/species/";
        } elseif ($this_search_genus != '') {
            $cache_key = $this_search_genus . '_' . $search_mode;
            $cache_path = $this->cache_path . $this->db->source . "/genus/";
        }
        if ($cache_key !== null) {
            $this->mkdir_recursive($cache_path);
            $this->_cache = new Cache($cache_path);
            $this->_cache->setKey($cache_key);
            $cache_ready = true;
        }
    }

    $cache_loop_flag = false;
    if ($cache == true && $cache_ready && $this->_cache->cache_exists()) {
        $cache_loop_flag = true;
    }

    if (!$cache_loop_flag) {
        // Rebuild the name from its parsed parts for the exact lookup.
        $search_str = $this->this_search_genus;
        if ($this->this_search_species) {
            $search_str .= ' ' . $this_search_species;
        }
        if ($this->this_search_infra1) {
            if ($this->this_search_rank1 != '') {
                $search_str .= ' ' . $this->this_search_rank1;
            }
            $search_str .= ' ' . $this->this_search_infra1;
        }
        if ($this->this_search_infra2) {
            if ($this->this_search_rank2 != '') {
                $search_str .= ' ' . $this->this_search_rank2;
            }
            $search_str .= ' ' . $this->this_search_infra2;
        }

        // Exact lookup on both the cleaned text and the rebuilt name; any hit
        // ends processing.
        $searchScientificName = $this->db->searchScientificName(array($text_str, $search_str));
        if (isset($searchScientificName)) {
            $has_match = 0;
            foreach ($searchScientificName as $returnedScientificName) {
                if ($returnedScientificName->specificEpithet != '') {
                    $has_match = 1;
                    if ($returnedScientificName->nameRank != 'species' && $returnedScientificName->nameRank != 'nothospecies') {
                        if ($returnedScientificName->infraspecificEpithet2 || $this->this_search_infra2) {
                            $this->saveInfra2Matches($returnedScientificName->nameID, $returnedScientificName->scientificName, 0, 0, 0, 0, 0, 'Y');
                        } elseif ($returnedScientificName->infraspecificEpithet || $this->this_search_infra1) {
                            $this->saveInfra1Matches($returnedScientificName->nameID, $returnedScientificName->scientificName, 0, 0, 0, 0, 'Y');
                        }
                    } else {
                        $this->saveSpeciesMatches($returnedScientificName->nameID, $returnedScientificName->scientificName, 0, 0, 0, 'Y');
                    }
                } elseif ($returnedScientificName->genus != '') {
                    $has_match = 1;
                    $this->saveGenusMatches($returnedScientificName->nameID, $returnedScientificName->genus, 0, 'Y');
                }
            }
            if ($has_match) {
                return true;
            }
        }

        $this->debug['process'][] = "3a (this_search_genus:{$this_search_genus}) (this_search_species:{$this_search_species}) (this_authority:{$this_authority})";

        $nm = new NearMatch();
        $this_near_match_genus = $nm->near_match($this_search_genus);
        $this_near_match_species = '';
        $this->debug['process'][] = "3b (this_near_match_genus:{$this_near_match_genus})";

        //TODO refactor inside of a method
        $this_genus_start = substr($this_search_genus, 0, 3);
        $this_genus_end = substr($this_search_genus, -3);
        $this_genus_length = strlen($this_search_genus);
        //TODO_END
        $this->debug['process'][] = "3c (this_search_genus,{$this_search_genus}) (this_genus_start:{$this_genus_start}) (this_genus_end:{$this_genus_end}) (this_genus_length:{$this_genus_length})";

        if ($this_search_species != '') {
            $this_near_match_species = $nm->near_match($this_search_species, 'epithet_only');
            $this_species_length = strlen($this_search_species);
            $this->debug['process'][] = "4 (this_search_species:{$this_search_species}) (this_near_match_species:{$this_near_match_species}) (this_species_length:{$this_species_length})";
        }

        // Genus stage: fetch pre-filtered candidate genera, then test each one
        // with match_genera().
        $genus_res = $this->db->genus_cur3($this->search_mode, $this_near_match_genus, $this_near_match_species, $this_genus_length, $this_genus_start, $this_genus_end);
        # $this->debug['process'][] = array("5 (genus_res)" => $genus_res);
        $genus_matches = array();
        // FIX: !empty() instead of count() so a null result is skipped.
        if (!empty($genus_res)) {
            // EJS -- attempt to reduce the amount of species_cur
            // this will be the naive approach
            foreach ($genus_res as $drec) {
                $genus_match = $this->match_genera($this_search_genus, $drec->search_genus_name);
                if ($genus_match['match']) {
                    // don't include a genus already in the array
                    if (!array_key_exists($drec->genus_id, $genus_matches)) {
                        $phonetic_flag = $genus_match['phonetic_match'] ? 'Y' : null;
                        $this->saveGenusMatches($drec->genus_id, $drec->genus, $genus_match['edit_distance'], $phonetic_flag);
                        $this->genera_tested++;
                    }
                    $genus_matches[$drec->genus_id] = $genus_match;
                }
            } // end foreach
        }

        // Species stage: only species under the matched genera are tested.
        $species_matches = array();
        if ($this_search_species != '' && count($genus_matches)) {
            $species_res = $this->db->species_cur_in2(array_keys($genus_matches), $this_species_length);
            if (isset($species_res)) {
                foreach ($species_res as $drec) {
                    $species_epithets_match = $this->match_species_epithets($this_search_species, $drec->search_species_name);
                    $genus_match = $genus_matches[$drec->genus_id];
                    $binomials_match = $this->match_matches(array($genus_match, $species_epithets_match));
                    if ($binomials_match['match']) {
                        if (!array_key_exists($drec->species_id, $species_matches)) {
                            $binomial_phonetic_flag = $binomials_match['phonetic_match'] ? 'Y' : null;
                            $this->saveSpeciesMatches($drec->species_id, $drec->genus_species, $genus_match['edit_distance'], $species_epithets_match['edit_distance'], $binomials_match['edit_distance'], $binomial_phonetic_flag);
                            $this->species_tested++;
                        }
                        // Keep the parent match so deeper ranks can combine it.
                        $species_epithets_match['genus_match'] = $genus_match;
                        $species_matches[$drec->species_id] = $species_epithets_match;
                    }
                } // EJS -- end
            }
        }

        // First infraspecific rank: only under the matched species.
        $infra1_matches = array();
        if ($this->this_search_infra1 != '' && count($species_matches)) {
            $this_infra1 = $this->this_search_infra1;
            $this_infra1_length = strlen($this_infra1);
            $infra1_res = $this->db->infra1_cur_in(array_keys($species_matches), $this_infra1_length);
            if (isset($infra1_res)) {
                foreach ($infra1_res as $drec) {
                    $infra1_match = $this->match_species_epithets($this_infra1, $drec->search_infra1_name);
                    $species_match = $species_matches[$drec->species_id];
                    $genus_match = $species_match["genus_match"];
                    $binomials_match = $this->match_matches(array($genus_match, $species_match, $infra1_match));
                    if ($binomials_match['match']) {
                        if (!array_key_exists($drec->infra1_id, $infra1_matches)) {
                            $binomial_phonetic_flag = $binomials_match['phonetic_match'] ? 'Y' : null;
                            $this->saveInfra1Matches($drec->infra1_id, $drec->species_infra1, $genus_match['edit_distance'], $species_match['edit_distance'], $infra1_match['edit_distance'], $binomials_match['edit_distance'], $binomial_phonetic_flag);
                        }
                        $infra1_match["species_match"] = $species_match;
                        $infra1_matches[$drec->infra1_id] = $infra1_match;
                    }
                } // EJS -- end
            }
        }

        // Second infraspecific rank.
        $infra2_matches = array();
        if ($this->this_search_infra2 != '' && count($infra1_matches)) {
            $this_infra2 = $this->this_search_infra2;
            $this_infra2_length = strlen($this_infra2);
            // NOTE(review): candidates are requested by species ids although
            // they are joined to $infra1_matches below - confirm whether
            // infra2_cur_in() expects infra1 ids instead.
            $infra2_res = $this->db->infra2_cur_in(array_keys($species_matches), $this_infra2_length);
            if (isset($infra2_res)) {
                foreach ($infra2_res as $drec) {
                    // FIX: because the query is keyed by species, a row may
                    // belong to an infra1 that did not match; skip it instead
                    // of reading an undefined index and combining null matches.
                    if (!isset($infra1_matches[$drec->infra1_id])) {
                        continue;
                    }
                    $infra2_match = $this->match_species_epithets($this_infra2, $drec->search_infra2_name);
                    $infra1_match = $infra1_matches[$drec->infra1_id];
                    $species_match = $infra1_match['species_match'];
                    $genus_match = $species_match["genus_match"];
                    $binomials_match = $this->match_matches(array($genus_match, $species_match, $infra1_match, $infra2_match));
                    if ($binomials_match['match']) {
                        if (!array_key_exists($drec->infra2_id, $infra2_matches)) {
                            $binomial_phonetic_flag = $binomials_match['phonetic_match'] ? 'Y' : null;
                            // NOTE(review): this saves infra1_id / species_infra1
                            // and passes the infra2 edit distance before the
                            // infra1 one; by analogy with saveInfra1Matches()
                            // this looks like a copy-paste slip - confirm
                            // against saveInfra2Matches() before changing.
                            $this->saveInfra2Matches($drec->infra1_id, $drec->species_infra1, $genus_match['edit_distance'], $species_match['edit_distance'], $infra2_match['edit_distance'], $infra1_match['edit_distance'], $binomials_match['edit_distance'], $binomial_phonetic_flag);
                        }
                        $infra2_match["infra1_match"] = $infra1_match;
                        $infra2_matches[$drec->infra2_id] = $infra2_match;
                    }
                } // EJS -- end
            }
        }
    } // End Cache Loop Flag

    return true;
}
$near_match_genus = $nm->near_match($genus_desc); $query = sprintf("INSERT INTO `genlist%s` (`GENUS_ID`, `GENUS`, `AUTHORITY`, `GEN_LENGTH`, `NEAR_MATCH_GENUS`, `SEARCH_GENUS_NAME`) VALUES ('%s','%s','%s',%s,'%s','%s')", mysql_escape_string($postfix), mysql_escape_string($genus_id_desc), mysql_escape_string($genus_desc), mysql_escape_string($authority_desc), mysql_escape_string($genus_length), mysql_escape_string($near_match_genus), mysql_escape_string($search_genus_name)); $db->query($query); } elseif (trim($data[3]) == '') { // Used to slow down the script for shared hosted sites usleep(20000); // Create Species $species_id_desc = $data[$species_id]; $species_desc = $data[$species]; $species_length = strlen($species_desc); $genus_id_desc = $data[1]; $genus_desc = $master[$data[1]]; $norm = new Normalize(); $nm = new NearMatch(); $search_species_name = $norm->normalize($species_desc); $near_match_species = $nm->near_match($species_desc); $authority_desc = $data[$authority]; $query = sprintf("INSERT INTO `splist%s` (`SPECIES_ID`, `GENUS_ORIG`, `SPECIES`, `GENUS_ID`, `AUTHORITY`, `SP_LENGTH`, `NEAR_MATCH_SPECIES`, `SEARCH_SPECIES_NAME`) VALUES ('%s','%s','%s','%s','%s',%s,'%s','%s') ", mysql_escape_string($postfix), mysql_escape_string($species_id_desc), mysql_escape_string($genus_desc), mysql_escape_string($species_desc), mysql_escape_string($genus_id_desc), mysql_escape_string($authority_desc), mysql_escape_string($species_length), mysql_escape_string($near_match_species), mysql_escape_string($search_species_name)); $db->query($query); } unset($query); } // end while fclose($handle); print round(memory_get_usage() * 0.0009) . "KB - Final Memory Used<br>"; } } } else { print ' A valid name parameter has to be supplied.'; } function select_source($source)
/**
 * Compare two author words by edit distance and by phonetic key.
 *
 * The words match when their phonetic keys (NearMatch::near_match) are equal,
 * or when all of the following hold:
 *  - the edit distance is at most 3,
 *  - the shorter word is more than twice as long as the edit distance
 *    (min. 51% "good" characters),
 *  - for an edit distance of 2 or more, both words start with the same
 *    character.
 *
 * @param string $author1 first author word
 * @param string $author2 second author word
 * @return array 'match' (bool), 'phonetic_match' (bool) and 'edit_distance'
 *               (value returned by DamerauLevenshteinMod::distance)
 */
public static function match_author_words($author1, $author2) {
    $phonetics = new NearMatch();
    $key1 = $phonetics->near_match($author1);
    $key2 = $phonetics->near_match($author2);

    $distance = DamerauLevenshteinMod::distance($author1, $author2, 2, 3);

    $sounds_alike = ($key1 == $key2);
    $shortest = min(strlen($author1), strlen($author2));

    // Author post-filter; the first-character test only applies for ED 2+.
    $close_enough = $distance <= 3
        && $shortest > $distance * 2
        && ($distance < 2 || substr($author1, 0, 1) == substr($author2, 0, 1));

    return array(
        'match' => ($close_enough || $sounds_alike),
        'phonetic_match' => $sounds_alike,
        'edit_distance' => $distance
    );
}
$handle = fopen('../authorities/' . $sourcefile, "r"); while (($data = fgetcsv($handle, 1000, ",")) !== FALSE) { if ($first) { $first = false; continue; } $norm = new Normalize(); $nm = new NearMatch(); $genus_desc = $data[$genus]; $gen_length_desc = strlen($genus_desc); # $genus_array = json_decode(file_get_contents(TAXAMATCH_URL . '?cmd=normalize&str=' . $genus_desc),true); # $search_genus_name_desc = $genus_array['data']; $search_genus_name_desc = $norm->normalize($genus_desc); # $near_match_genus_array = json_decode(file_get_contents(TAXAMATCH_URL . '?cmd=near_match&str=' . $genus_desc),true); # $near_match_genus_desc = $near_match_genus_array['data']; $near_match_genus_desc = $nm->near_match($genus_desc); $species_desc = $data[$species]; $sp_length_desc = strlen($species_desc); # $species_array = json_decode(file_get_contents(TAXAMATCH_URL . '?cmd=normalize&str=' . $species_desc),true); # $search_species_name_desc = $species_array['data']; $search_species_name_desc = $norm->normalize($species_desc); # $near_match_species_array = json_decode(file_get_contents(TAXAMATCH_URL . '?cmd=near_match&str=' . $species_desc),true); # $near_match_species_desc = $near_match_species_array['data']; $near_match_species_desc = $nm->near_match($species_desc); if ($authority_abbr) { $authority_abbr_desc = $data[$authority_abbr]; $authority_desc = $data[$authority]; $query = sprintf(" INSERT INTO auth_abbrev%s (`AUTH_ABBR`, `AUTH_FULL`) VALUES ('%s','%s') ", mysql_escape_string($postfix), mysql_escape_string($authority_abbr_desc), mysql_escape_string($authority_desc)); $db->query($query); } else { $authority_desc = $data[$authority];
break; case 'treat_word': require_once 'classes/class.nearmatch.php'; $nm = new NearMatch(); $strip_ending = $strip_ending == '' ? 0 : $strip_ending; $normalize = $normalize == '' ? 0 : $normalize; $data = $nm->treat_word($str, $strip_ending, $normalize); if ($output == 'xml') { $data = $nm->getXML(); } $debug = $nm->debug; break; case 'near_match': require_once 'classes/class.nearmatch.php'; $nm = new NearMatch(); $data = $nm->near_match($str); if ($output == 'xml') { $data = $nm->getXML(); } $debug = $nm->debug; break; case 'compare_auth': require_once 'classes/class.taxamatch.php'; $tm = new Taxamatch(); $data = $tm->compare_auth($str, $str2); if ($output == 'xml') { $data = $tm->getXML(); } $debug = $tm->debug; break; case 'ngram':