Esempio n. 1
0
/**
 * Score how closely a candidate name resembles the queried name.
 *
 * Both names are split on single spaces. The score starts at 3 and is then
 * adjusted by:
 *  - a 0.5 bonus when both names have the same number of tokens and either the
 *    whole strings are within edit distance 3 or $type contains "full";
 *  - a 1.5 deduction for every token position whose first byte differs
 *    (only evaluated when the token counts are equal);
 *  - a deduction of (edit distance / 20) when $type contains "sound" or "look";
 *  - twice the relative edit distance of the first tokens;
 *  - the summed per-token error of the remaining tokens of $name;
 *  - a small penalty/bonus that depends on the number of distinct tokens.
 *
 * Empty tokens (caused by leading or doubled spaces) are tolerated: their
 * length is treated as 1 when used as a divisor, so the function never divides
 * by zero.
 *
 * NOTE(review): when $matched has no tokens after the first one, every
 * remaining token of $name keeps the 999.0 sentinel and the result drops far
 * below zero. This mirrors the previous behaviour; confirm it is intended.
 *
 * @param string      $matched Candidate name; 'N/A' or an empty value scores 0.
 * @param string      $name    Name the candidate is compared against.
 * @param string|null $type    Match-type label, tested case-insensitively for
 *                             "full", "sound" and "look".
 *
 * @return int|float Similarity score; higher means more similar.
 */
function nameSimilarity($matched, $name, $type = null)
{
    if ($matched == 'N/A') {
        return 0;
    }
    if (empty($matched)) {
        return 0;
    }

    // preg_match() must not receive null (deprecated since PHP 8.1).
    $type = (string) $type;

    $matched_parts = explode(" ", $matched);
    $name_parts = explode(" ", $name);
    $matched_count = count($matched_parts);
    $name_count = count($name_parts);
    $whole_distance = levenshtein($matched, $name);

    $score = 3;
    if ($matched_count === $name_count) {
        if ($whole_distance <= 3 || preg_match('/full/i', $type)) {
            $score += 0.5;
        }
        for ($i = 0; $i < $matched_count; $i++) {
            // substr() yields '' for an empty token instead of an offset warning.
            $matched_initial = (string) substr($matched_parts[$i], 0, 1);
            $name_initial = (string) substr($name_parts[$i], 0, 1);
            if ($matched_initial !== $name_initial) {
                $score -= 1.5;
            }
        }
    }
    if (preg_match('/sound|look/i', $type)) {
        $score -= $whole_distance / 20;
    }

    // Penalty derived from how many distinct tokens each name has.
    $matched_unique = count(array_unique($matched_parts));
    $name_unique = count(array_unique($name_parts));
    $penalty = 0;
    if ($matched_count != $name_count && $matched_unique != $name_unique) {
        $penalty = 0.01;
    }
    if ($matched_unique > $name_unique) {
        $penalty += 0.5;
    } elseif ($matched_unique < $name_unique) {
        // The 0.2 bonus is only granted when the candidate repeats no token.
        $penalty -= ($matched_unique < $matched_count) ? 0.0 : 0.2;
    }

    // Relative edit distance of the first tokens, weighted twice.
    $head_length = max(1, strlen($matched_parts[0]));
    $score -= 2 * levenshtein($matched_parts[0], $name_parts[0]) / $head_length;

    // For every remaining token of $name keep the smallest error found against
    // any remaining token of $matched.
    $matched_rest = array_slice($matched_parts, 1);
    $name_rest = array_slice($name_parts, 1);
    $rest_counts_differ = count($matched_rest) != count($name_rest);
    $min_errs = array();
    foreach ($name_rest as $name_idx => $name_part) {
        if (!isset($min_errs[$name_part])) {
            $min_errs[$name_part] = 999.0;
        }
        foreach ($matched_rest as $matched_idx => $matched_part) {
            $distance = levenshtein($matched_part, $name_part);
            $matched_initial = (string) substr($matched_part, 0, 1);
            $name_initial = (string) substr($name_part, 0, 1);
            if ($distance <= 3 && treat_word($matched_initial) == treat_word($name_initial)) {
                // Close tokens: error is the edit distance relative to the candidate token.
                $err = (double) $distance / (double) max(1, strlen($matched_part));
            } else {
                // Unrelated tokens: fixed error, reduced for earlier positions
                // when the two names have a different number of remaining tokens.
                if ($rest_counts_differ) {
                    $factor = count($matched_rest) + count($name_rest) - ($matched_idx + $name_idx + 1);
                } else {
                    $factor = 1;
                }
                $err = 1 / $factor;
            }
            $min_errs[$name_part] = min($min_errs[$name_part], $err);
        }
    }

    // Each distinct remaining token of $name contributes once.
    $total_err = 0;
    foreach (array_unique($name_rest) as $name_part) {
        $total_err += $min_errs[$name_part];
    }
    $score -= $total_err;

    return $score - $penalty;
}
Esempio n. 2
0
/**
 * Query a Solr-style search endpoint for candidates matching a scientific name.
 *
 * The name is normalised with canonical_form() and split on spaces into a
 * first word (part a) and up to two further words (parts b and c). Queries are
 * then issued in three stages through extract_results(), which is presumably
 * accumulating the hits between calls (it is reset at the start and read back
 * with no arguments — confirm against its definition):
 *
 *  1. exact canonical name, plus spell-check suggestions for it;
 *  2. part a combined with any of parts b/c, spell-check suggestions for the
 *     individual parts, and spell-check suggestions for part a itself;
 *  3. sound-alike forms produced by treat_word().
 *
 * Stages 2 and 3 run only while the '' entry of the accumulated results is
 * non-empty, or unconditionally when $best is 'no'.
 *
 * @param string $name    Raw name to look up.
 * @param mixed  $against Passed through unchanged to extract_results().
 * @param string $best    'no' forces every stage to run.
 * @param string $ep      Endpoint base URL without the "/select" suffix,
 *                        e.g. "http://localhost:8983/solr/taxa".
 *
 * @return array|false False when $ep is empty; an array with a single 'N/A'
 *                     entry when the cleaned name has no second word;
 *                     otherwise the results keyed by their 'matched' value,
 *                     each merged with 'name' and 'name_cleaned' (an empty
 *                     array when nothing was collected).
 */
function queryNames($name, $against, $best, $ep)
{
    if (empty($ep)) {
        return false;
    }
    $ep .= '/select?wt=json&q=*:*';
    $spell_ep = $ep . "&rows=0&spellcheck.q=";

    // Start from a clean slate before issuing any query.
    extract_results("", "", true);

    // $mix2 holds latin parts b and c; $sound_mix2 their treat_word() forms.
    $mix2 = array();
    $sound_mix2 = array();
    $suggestions = array();
    $long_suggestions = array();
    $all_matched = array();

    $name_cleaned = canonical_form($name, true);
    $parts = explode(" ", $name_cleaned);
    $lpa2 = $parts[0];
    $lpb2 = isset($parts[1]) ? $parts[1] : null;
    $lpc2 = isset($parts[2]) ? $parts[2] : null;
    $spa2 = treat_word($lpa2);
    $spb2 = treat_word($lpb2);
    $spc2 = treat_word($lpc2);

    if (empty($parts[1])) {
        // A single-word name cannot be matched; report it as 'N/A'.
        return array(
            'N/A' => array(
                'name' => $name,
                'name_cleaned' => $name_cleaned,
                'matched' => 'N/A',
                'matched_clean' => 'N/A',
                'accepted_namecode' => array(),
                'namecode' => array(),
                'source' => array(),
                'url_id' => array(),
                'a_url_id' => array(),
                'kingdom' => array(),
                'phylum' => array(),
                'class' => array(),
                'order' => array(),
                'family' => array(),
                'higher_than_family' => array(),
                'type' => 'N/A',
            ),
        );
    }
    $mix2[] = $parts[1];
    if (!empty($parts[2])) {
        $mix2[] = $parts[2];
    }
    if (!empty($spb2)) {
        $sound_mix2[] = $spb2;
    }
    if (!empty($spc2)) {
        $sound_mix2[] = $spc2;
    }

    // Filter shared by every "part a + (b OR c)" query below.
    $bc_filter = '&fq=latin_part_bc:(' . urlencode(implode(' OR ', $mix2)) . ")";

    // Type 1: exact canonical name.
    $query_url_1 = $ep . '&fq=canonical_name:"' . urlencode($name_cleaned) . '"';
    extract_results($query_url_1, TYPE_1, false, $against);

    // Type 1 with a minor spelling error in parts b/c.
    $suggestion = extract_suggestion($spell_ep . urlencode(implode(" ", $mix2)), TYPE_1_E);
    if (!empty($suggestion)) {
        $query_url_1_err = $ep . '&fq=canonical_name:"' . urlencode("{$lpa2} {$suggestion}") . '"';
        extract_results($query_url_1_err, TYPE_1_E, false, $against);
    }

    // Type 1 with a suggestion for the whole cleaned name, restricted to part a.
    $long_suggestion = extract_suggestion($spell_ep . urlencode($name_cleaned), TYPE_1_E);
    if (!empty($long_suggestion)) {
        $query_url_1_err = $ep . '&fq=latin_part_a:' . urlencode($lpa2)
            . '&fq=canonical_name:"' . urlencode("{$long_suggestion}") . '"';
        extract_results($query_url_1_err, TYPE_1_E, false, $against);
    }

    $all_matched_tmp = extract_results();

    if (!empty($all_matched_tmp['']) || $best == 'no') {
        // Type 2: part a with any of parts b/c.
        $query_url_2 = $ep . '&fq=latin_part_a:' . urlencode($lpa2) . $bc_filter;
        extract_results($query_url_2, TYPE_2, false, $against);

        // Type 2 with a minor spelling error: collect suggestions per part.
        foreach (array_unique($mix2) as $p) {
            $suggestion = extract_suggestion($spell_ep . urlencode($p), TYPE_2_E);
            if (!empty($suggestion)) {
                $suggestions[] = $suggestion;
            }
            $long_suggestion = extract_suggestion($spell_ep . urlencode("{$lpa2} {$p}"), TYPE_2_E);
            if (!empty($long_suggestion)) {
                $long_suggestions[] = $long_suggestion;
            }
        }
        if (!empty($suggestions)) {
            $suggestions = array_unique(array_merge($suggestions, $mix2));
            $query_url_2_err = $ep . '&fq=latin_part_a:' . urlencode($lpa2)
                . '&fq=latin_part_bc:(' . urlencode(implode(' OR ', $suggestions)) . ")";
            extract_results($query_url_2_err, TYPE_2_E, false, $against);
        }
        if (!empty($long_suggestions) && count($mix2) > 1) {
            foreach ($long_suggestions as $long_suggestion) {
                $query_url_2_err = $ep . '&fq=canonical_name:"' . urlencode($long_suggestion) . '"';
                extract_results($query_url_2_err, TYPE_2_E, false, $against);
            }
        }

        // Spelling error in part a: ask for a suggestion for part a alone, then
        // for the whole name, then for part a paired with each of parts b/c,
        // keeping only the first word of the multi-word suggestions.
        $suggestion = extract_suggestion($spell_ep . urlencode($lpa2), TYPE_2_GE);
        if (empty($suggestion)) {
            $words = explode(" ", (string) extract_suggestion($spell_ep . urlencode($name_cleaned), TYPE_2_GE));
            $suggestion = $words[0];
            if (empty($suggestion)) {
                foreach ($mix2 as $mp) {
                    $words = explode(" ", (string) extract_suggestion($spell_ep . urlencode("{$lpa2} {$mp}"), TYPE_2_GE));
                    $suggestion = $words[0];
                    if (!empty($suggestion)) {
                        break;
                    }
                }
            }
        }

        // Without a suggestion there is nothing to compare part a against.
        if (!empty($suggestion)) {
            $query_url_2_genus_err = $ep . '&fq=latin_part_a:' . urlencode($suggestion) . $bc_filter;
            if (treat_word($lpa2, true) == treat_word($suggestion, true)) {
                extract_results($query_url_2_genus_err, TYPE_2_GS, false, $against);
            } elseif (levenshtein($lpa2, $suggestion) == 1 && strlen($lpa2) == strlen($suggestion)) {
                // Same length, one substitution: accept it only if similar_char()
                // approves the differing characters (given the following ones).
                $len = strlen($lpa2);
                for ($i = 0; $i < $len; $i++) {
                    if ($lpa2[$i] != $suggestion[$i]) {
                        $next_a = isset($lpa2[$i + 1]) ? $lpa2[$i + 1] : null;
                        $next_s = isset($suggestion[$i + 1]) ? $suggestion[$i + 1] : null;
                        if (similar_char($lpa2[$i], $suggestion[$i], $next_a, $next_s)) {
                            extract_results($query_url_2_genus_err, TYPE_2_GL, false, $against);
                        }
                    }
                }
            } elseif (levenshtein($lpa2, $suggestion) == 1) {
                // One insertion or deletion.
                extract_results($query_url_2_genus_err, TYPE_2_GL2, false, $against);
            }
        }

        $all_matched_tmp = extract_results();
    }

    if (!empty($all_matched_tmp['']) || $best == 'no') {
        // Type 3: sound-alike form of the whole name.
        $sound = treat_word($name_cleaned);
        $query_url_3 = $ep . '&fq=sound_name:"' . urlencode($sound) . '"';
        extract_results($query_url_3, TYPE_3_S, false, $against);

        // Type 3 mix: sound-alike part a with any sound-alike part b/c.
        $sound_a_filter = '&fq=sound_part_a:' . urlencode($spa2);
        $query_url_3 = $ep . $sound_a_filter
            . '&fq=sound_part_bc:(' . urlencode(implode(' OR ', $sound_mix2)) . ")";
        extract_results($query_url_3, TYPE_3_S2, false, $against);

        // Same again using treat_word()'s second-argument variant for parts b/c,
        // then for part a as well.
        $sound_mix2_strip_ending = array_map("treat_word", $mix2, array_fill(0, count($mix2), true));
        $strip_bc_filter = '&fq=sound_part_bc_strip_ending:('
            . urlencode(implode(' OR ', $sound_mix2_strip_ending)) . ")";

        $query_url_3_strip_bc_ending = $ep . $sound_a_filter . $strip_bc_filter;
        // NOTE(review): TYPE3_S3 breaks the TYPE_3_* naming used by its
        // siblings; confirm the constant is really defined under this name.
        extract_results($query_url_3_strip_bc_ending, TYPE3_S3, false, $against);

        $query_url_3_strip_all_ending = $ep . '&fq=sound_part_a_strip_ending:'
            . urlencode(treat_word($spa2, true)) . $strip_bc_filter;
        extract_results($query_url_3_strip_all_ending, TYPE_3_GUESS, false, $against);

        $all_matched_tmp = extract_results();
    }

    // Index the results by matched name and attach the queried name to each.
    foreach ($all_matched_tmp as $m) {
        $all_matched[$m['matched']] = array_merge(array('name' => $name, 'name_cleaned' => $name_cleaned), $m);
    }

    return $all_matched;
}
Esempio n. 3
0
    $rec['sound_genus'] = $frags[0];
    $rec['sound_part_a_strip_ending'] = treat_word($frags[0], true);
    $rec['nameSpell'][] = $frags[0];
    if (!empty($frags[1])) {
        $rec['latin_part_bc'][] = $frags[1];
        $rec['nameSpell'][] = $frags[1];
        $rec['nameSpell'][] = $frags[0] . " " . $frags[1];
        $rec['sound_part_bc'][] = treat_word($frags[1]);
        $rec['sound_part_bc_strip_ending'][] = treat_word($frags[1], true);
    } else {
        continue;
    }
    if (!empty($frags[2])) {
        $rec['latin_part_bc'][] = $frags[2];
        $rec['nameSpell'][] = $frags[2];
        $rec['nameSpell'][] = $frags[1] . " " . $frags[2];
        $rec['nameSpell'][] = $frags[0] . " " . $frags[2];
        $rec['nameSpell'][] = $frags[0] . " " . $frags[1] . " " . $frags[2];
        $rec['sound_part_bc'][] = treat_word($frags[2]);
        $rec['sound_part_bc_strip_ending'][] = treat_word($frags[2], true);
    }
    $ret[] = $rec;
    if ($counter % 1000 == 999) {
        submitJson($ret);
        $ret = array();
    }
    $counter++;
}
if (!empty($ret)) {
    submitJson($ret);
}