/**
 * Score how closely a matched name resembles the queried name.
 *
 * The score starts at 3 and is adjusted as follows:
 *  - when both names have the same number of space-separated tokens:
 *    +0.5 if the whole-string edit distance is <= 3 or $type contains
 *    "full", and -1.5 for every token pair whose first characters differ;
 *  - when $type contains "sound" or "look": minus (edit distance / 20);
 *  - minus twice the edit distance between the first tokens, relative to
 *    the length of the first token of $matched;
 *  - minus, for every distinct trailing token of $name, the smallest error
 *    found against the trailing tokens of $matched (999 when $matched has
 *    no trailing tokens at all);
 *  - minus a small penalty that depends on how the token counts differ.
 *
 * @param string      $matched Name returned by the search; 'N/A' or an empty value scores 0.
 * @param string      $name    Name that was searched for.
 * @param string|null $type    Match-type label; only tested for "full", "sound" and "look".
 *
 * @return int|float Similarity score (higher is closer; may be negative).
 */
function nameSimilarity($matched, $name, $type = null)
{
    if ($matched == 'N/A' || empty($matched)) {
        return 0;
    }

    // Normalise to strings so the string functions below never receive null.
    $matched = (string) $matched;
    $name = (string) $name;
    $type = (string) $type;

    $parts1 = explode(" ", $matched);
    $parts2 = explode(" ", $name);
    $count1 = count($parts1);
    $count2 = count($parts2);
    $unique1 = count(array_unique($parts1));
    $unique2 = count(array_unique($parts2));

    $score = 3;

    if ($count1 === $count2) {
        if (levenshtein($matched, $name) <= 3 || preg_match('/full/i', $type)) {
            $score += 0.5;
        }
        for ($pidx = 0; $pidx < $count1; $pidx++) {
            // Tokens can be empty (consecutive spaces); read the initial safely.
            $initial1 = isset($parts1[$pidx][0]) ? $parts1[$pidx][0] : '';
            $initial2 = isset($parts2[$pidx][0]) ? $parts2[$pidx][0] : '';
            if ($initial1 != $initial2) {
                $score -= 1.5;
            }
        }
    }

    if (preg_match('/sound|look/i', $type)) {
        $score -= levenshtein($name, $matched) / 20;
    }

    // Penalty for differing token counts.
    $penalty = 0;
    if ($count1 != $count2 && $unique1 != $unique2) {
        $penalty = 0.01;
    }
    if ($unique1 > $unique2) {
        $penalty += 0.5;
    } elseif ($unique1 < $unique2 && $unique1 == $count1) {
        // $matched has fewer distinct tokens and no repeated token of its own.
        $penalty -= 0.2;
    }

    // First-token error, relative to the matched token's length. max(1, ...)
    // prevents a division by zero when $matched starts with a space.
    $score -= 2 * levenshtein($parts1[0], $parts2[0]) / max(1, strlen($parts1[0]));

    $sub_parts1 = array_slice($parts1, 1);
    $sub_parts2 = array_slice($parts2, 1);
    $sub_count1 = count($sub_parts1);
    $sub_count2 = count($sub_parts2);

    // Smallest error seen so far for each trailing token of $name.
    $min_errs = array();
    foreach ($sub_parts2 as $sp2_idx => $sp2) {
        if (!isset($min_errs[$sp2])) {
            $min_errs[$sp2] = 999.0;
        }
        $sp2_initial = isset($sp2[0]) ? $sp2[0] : '';

        foreach ($sub_parts1 as $sp1_idx => $sp1) {
            $distance = levenshtein($sp1, $sp2);
            $sp1_initial = isset($sp1[0]) ? $sp1[0] : '';

            if ($distance <= 3 && treat_word($sp1_initial) == treat_word($sp2_initial)) {
                // Close spelling with equivalent initials: relative edit distance.
                $err = (double) $distance / (double) max(1, strlen($sp1));
            } else {
                // No plausible pairing. With unequal token counts the error
                // shrinks for pairs nearer the start; the factor is always >= 1.
                if ($sub_count1 != $sub_count2) {
                    $factor = $sub_count1 + $sub_count2 - ($sp1_idx + $sp2_idx + 1);
                } else {
                    $factor = 1;
                }
                $err = 1 / $factor;
            }
            $min_errs[$sp2] = min($min_errs[$sp2], $err);
        }
    }

    $total_err = 0;
    foreach (array_unique($sub_parts2) as $sp2) {
        $total_err += $min_errs[$sp2];
    }
    $score -= $total_err;

    return $score - $penalty;
}
/**
 * Search the endpoint $ep for records matching a scientific name, using
 * progressively looser strategies, and return the accumulated matches.
 *
 * Strategies, each tagged with its TYPE_* constant:
 *  - Type 1: exact canonical name, then spell-check suggestions for it;
 *  - Type 2: first token plus any of the following tokens, their spell-check
 *    suggestions, and a spell-corrected first token;
 *  - Type 3: matches on the treat_word() forms of the tokens.
 * Type 2 and Type 3 run only when $best == 'no' or when the results gathered
 * so far contain a non-empty entry under the '' key.
 * NOTE(review): that '' entry is presumably how extract_results() reports
 * "nothing matched yet" — confirm against its implementation.
 *
 * Results are accumulated by extract_results(), which is reset on entry.
 *
 * @param string $name    Name to look up.
 * @param mixed  $against Passed through unchanged to extract_results().
 * @param string $best    'no' forces every strategy to run.
 * @param string $ep      Endpoint base URL; '/select?wt=json&q=*:*' is appended.
 *
 * @return array|false False when $ep is empty. Otherwise an array keyed by
 *                     matched name, each value being the result row merged
 *                     with 'name' and 'name_cleaned'. A name with no second
 *                     token yields a single 'N/A' placeholder row. An empty
 *                     array is returned when nothing matched.
 */
function queryNames($name, $against, $best, $ep)
{
    if (empty($ep)) {
        return false;
    }

    $ep .= '/select?wt=json&q=*:*';
    $spell_base = $ep . "&rows=0&spellcheck.q=";

    // Clear whatever extract_results() accumulated during a previous lookup.
    extract_results("", "", true);

    // $mix2: tokens 2 and 3 of the cleaned name; $sound_mix2: their treat_word() forms.
    $mix2 = array();
    $sound_mix2 = array();
    $suggestions = array();
    $long_suggestions = array();

    $name_cleaned = canonical_form($name, true);
    $parts = explode(" ", $name_cleaned);
    $lpa2 = $parts[0];
    $lpb2 = isset($parts[1]) ? $parts[1] : null;
    $lpc2 = isset($parts[2]) ? $parts[2] : null;

    $spa2 = treat_word($lpa2);
    $spb2 = treat_word($lpb2);
    $spc2 = treat_word($lpc2);

    if (empty($parts[1])) {
        // A single-token name cannot be matched: return a placeholder row.
        return array('N/A' => array(
            'name' => $name,
            'name_cleaned' => $name_cleaned,
            'matched' => 'N/A',
            'matched_clean' => 'N/A',
            'accepted_namecode' => array(),
            'namecode' => array(),
            'source' => array(),
            'url_id' => array(),
            'a_url_id' => array(),
            'kingdom' => array(),
            'phylum' => array(),
            'class' => array(),
            'order' => array(),
            'family' => array(),
            'higher_than_family' => array(),
            'type' => 'N/A',
        ));
    }

    $mix2[] = $parts[1];
    if (!empty($parts[2])) {
        $mix2[] = $parts[2];
    }
    if (!empty($spb2)) {
        $sound_mix2[] = $spb2;
    }
    if (!empty($spc2)) {
        $sound_mix2[] = $spc2;
    }

    // Filter fragments shared by several queries below.
    $genus_fq = '&fq=latin_part_a:' . urlencode($lpa2);
    $bc_fq = '&fq=latin_part_bc:(' . urlencode(implode(' OR ', $mix2)) . ")";

    // ---- Type 1: exact canonical name ----
    $query_url_1 = $ep . '&fq=canonical_name:"' . urlencode($name_cleaned) . '"';
    extract_results($query_url_1, TYPE_1, false, $against);

    // Type 1 with a minor spelling error in the tokens after the first.
    $suggestion = extract_suggestion($spell_base . urlencode(implode(" ", $mix2)), TYPE_1_E);
    if (!empty($suggestion)) {
        $query_url_1_err = $ep . '&fq=canonical_name:"' . urlencode("{$lpa2} {$suggestion}") . '"';
        extract_results($query_url_1_err, TYPE_1_E, false, $against);
    }

    // Type 1 with a spelling suggestion for the whole cleaned name.
    $long_suggestion = extract_suggestion($spell_base . urlencode($name_cleaned), TYPE_1_E);
    if (!empty($long_suggestion)) {
        // The first token is urlencoded here like everywhere else.
        $query_url_1_err = $ep . $genus_fq . '&fq=canonical_name:"' . urlencode($long_suggestion) . '"';
        extract_results($query_url_1_err, TYPE_1_E, false, $against);
    }

    $all_matched_tmp = extract_results();

    if (!empty($all_matched_tmp['']) || $best == 'no') {
        // ---- Type 2: first token plus any of the following tokens ----
        extract_results($ep . $genus_fq . $bc_fq, TYPE_2, false, $against);

        // Collect spelling suggestions per token and per "first-token token" pair.
        foreach (array_unique($mix2) as $p) {
            $suggestion = extract_suggestion($spell_base . urlencode($p), TYPE_2_E);
            if (!empty($suggestion)) {
                $suggestions[] = $suggestion;
            }
            $long_suggestion = extract_suggestion($spell_base . urlencode("{$lpa2} {$p}"), TYPE_2_E);
            if (!empty($long_suggestion)) {
                $long_suggestions[] = $long_suggestion;
            }
        }

        if (!empty($suggestions)) {
            $suggestions = array_unique(array_merge($suggestions, $mix2));
            $query_url_2_err = $ep . $genus_fq
                . '&fq=latin_part_bc:(' . urlencode(implode(' OR ', $suggestions)) . ")";
            extract_results($query_url_2_err, TYPE_2_E, false, $against);
        }

        if (!empty($long_suggestions) && count($mix2) > 1) {
            foreach ($long_suggestions as $long_suggestion) {
                $query_url_2_err = $ep . '&fq=canonical_name:"' . urlencode($long_suggestion) . '"';
                extract_results($query_url_2_err, TYPE_2_E, false, $against);
            }
        }

        // Possible spelling error in the first token: ask for a suggestion for
        // it alone, then fall back to the first word suggested for the whole
        // name, then for each "first-token token" pair.
        $suggestion = extract_suggestion($spell_base . urlencode($lpa2), TYPE_2_GE);
        if (is_null($suggestion)) {
            $fallback_queries = array($name_cleaned);
            foreach ($mix2 as $mp) {
                // String concatenation; the original used arithmetic '+' here.
                $fallback_queries[] = $lpa2 . ' ' . $mp;
            }
            foreach ($fallback_queries as $fallback_query) {
                $raw = extract_suggestion($spell_base . urlencode($fallback_query), TYPE_2_GE);
                // An empty first word means "no suggestion", so the next
                // fallback is tried instead of stopping on "".
                $words = explode(" ", (string) $raw);
                if ($words[0] !== '') {
                    $suggestion = $words[0];
                    break;
                }
            }
        }

        // Only query with a corrected first token when one was actually suggested.
        if (!is_null($suggestion) && $suggestion !== '') {
            $query_url_2_genus_err = $ep . '&fq=latin_part_a:' . urlencode($suggestion) . $bc_fq;

            if (treat_word($lpa2, true) == treat_word($suggestion, true)) {
                extract_results($query_url_2_genus_err, TYPE_2_GS, false, $against);
            } elseif (levenshtein($lpa2, $suggestion) == 1 && strlen($lpa2) == strlen($suggestion)) {
                // Same length and distance 1: a single substituted character.
                // Accept it only when similar_char() approves the pair.
                $len = strlen($lpa2);
                for ($i = 0; $i < $len; $i++) {
                    if ($lpa2[$i] != $suggestion[$i]) {
                        $next_a = isset($lpa2[$i + 1]) ? $lpa2[$i + 1] : '';
                        $next_b = isset($suggestion[$i + 1]) ? $suggestion[$i + 1] : '';
                        if (similar_char($lpa2[$i], $suggestion[$i], $next_a, $next_b)) {
                            extract_results($query_url_2_genus_err, TYPE_2_GL, false, $against);
                        }
                    }
                }
            } elseif (levenshtein($lpa2, $suggestion) == 1) {
                // Distance 1 with different lengths: one inserted or deleted character.
                extract_results($query_url_2_genus_err, TYPE_2_GL2, false, $against);
            }
        }

        $all_matched_tmp = extract_results();
    }

    if (!empty($all_matched_tmp['']) || $best == 'no') {
        // ---- Type 3: matches on treat_word() forms ----
        $sound = treat_word($name_cleaned);
        $query_url_3 = $ep . '&fq=sound_name:"' . urlencode($sound) . '"';
        extract_results($query_url_3, TYPE_3_S, false, $against);

        // Type 3 mix: first token plus any of the following tokens.
        $sound_genus_fq = '&fq=sound_part_a:' . urlencode($spa2);
        $query_url_3 = $ep . $sound_genus_fq
            . '&fq=sound_part_bc:(' . urlencode(implode(' OR ', $sound_mix2)) . ")";
        extract_results($query_url_3, TYPE_3_S2, false, $against);

        // Same again using treat_word($token, true) on the following tokens.
        $sound_mix2_strip_ending = array();
        foreach ($mix2 as $token) {
            $sound_mix2_strip_ending[] = treat_word($token, true);
        }
        $strip_bc_fq = '&fq=sound_part_bc_strip_ending:('
            . urlencode(implode(' OR ', $sound_mix2_strip_ending)) . ")";

        // NOTE(review): TYPE3_S3 lacks the underscore used by the other
        // TYPE_3_* constants — confirm it is defined under this exact name.
        extract_results($ep . $sound_genus_fq . $strip_bc_fq, TYPE3_S3, false, $against);

        $query_url_3_strip_all_ending = $ep
            . '&fq=sound_part_a_strip_ending:' . urlencode(treat_word($spa2, true))
            . $strip_bc_fq;
        extract_results($query_url_3_strip_all_ending, TYPE_3_GUESS, false, $against);

        $all_matched_tmp = extract_results();
    }

    // Index rows by matched name and attach the original and cleaned query.
    // Initialised up front so an empty result set returns array(), not an
    // undefined variable.
    $all_matched = array();
    if (is_array($all_matched_tmp)) {
        foreach ($all_matched_tmp as $m) {
            $all_matched[$m['matched']] = array_merge(
                array('name' => $name, 'name_cleaned' => $name_cleaned),
                $m
            );
        }
    }

    return $all_matched;
}
$rec['sound_genus'] = $frags[0]; $rec['sound_part_a_strip_ending'] = treat_word($frags[0], true); $rec['nameSpell'][] = $frags[0]; if (!empty($frags[1])) { $rec['latin_part_bc'][] = $frags[1]; $rec['nameSpell'][] = $frags[1]; $rec['nameSpell'][] = $frags[0] . " " . $frags[1]; $rec['sound_part_bc'][] = treat_word($frags[1]); $rec['sound_part_bc_strip_ending'][] = treat_word($frags[1], true); } else { continue; } if (!empty($frags[2])) { $rec['latin_part_bc'][] = $frags[2]; $rec['nameSpell'][] = $frags[2]; $rec['nameSpell'][] = $frags[1] . " " . $frags[2]; $rec['nameSpell'][] = $frags[0] . " " . $frags[2]; $rec['nameSpell'][] = $frags[0] . " " . $frags[1] . " " . $frags[2]; $rec['sound_part_bc'][] = treat_word($frags[2]); $rec['sound_part_bc_strip_ending'][] = treat_word($frags[2], true); } $ret[] = $rec; if ($counter % 1000 == 999) { submitJson($ret); $ret = array(); } $counter++; } if (!empty($ret)) { submitJson($ret); }