예제 #1
0
/**
 * Scores how similar two lists of name parts are, regardless of their order.
 *
 * Parts of $a1 that also occur in $a2 (as decided by array_intersect) each
 * count as one full match. Every remaining part of $a1 is then paired with the
 * best-scoring remaining part of $a2 via compareComplex(); that score divided
 * by 100 is added as a partial match and the chosen part of $a2 is removed so
 * it cannot be paired twice. The total is divided by the size of the longer
 * list and scaled by 100.
 *
 * @param array $a1 first list of name parts
 * @param array $a2 second list of name parts
 *
 * @return int|float similarity score; 0 when both lists are empty
 */
function compareComplexMulti($a1, $a2)
{
    $weightTotal = max(count($a1), count($a2));
    if ($weightTotal === 0) {
        // Nothing to compare; also avoids dividing by zero below.
        return 0;
    }

    $intersection = array_intersect($a1, $a2);
    $weightIdentical = count($intersection);
    $weightSimilar = 0;

    // array_diff() preserves the original keys, so both lists may be sparse.
    $a1Available = array_diff($a1, $intersection);
    $a2Available = array_diff($a2, $intersection);

    foreach ($a1Available as $v1) {
        if (count($a2Available) === 0) {
            // Every candidate in $a2 is already paired; the rest score nothing.
            break;
        }

        // Scores are keyed by the candidate's key in $a2Available so the
        // winning candidate can be removed by that same key afterwards.
        $comparison = array();
        foreach ($a2Available as $k2 => $v2) {
            if ($v1 == $v2) {
                // Loosely equal values that array_intersect() did not pair.
                // NOTE(review): the score 1 is kept from the original code, but
                // the "/ 100" below suggests scores are on a 0-100 scale, so
                // 100 may have been intended - confirm against compareComplex().
                $comparison[$k2] = 1;
                break;
            }
        }
        if (count($comparison) === 0) {
            foreach ($a2Available as $k2 => $v2) {
                $comparison[$k2] = compareComplex($v1, $v2);
            }
        }

        // Pick the best-scoring candidate: sort descending, take the first key.
        arsort($comparison);
        reset($comparison);
        $index = key($comparison);

        $weightSimilar += $comparison[$index] / 100;
        unset($a2Available[$index]);
    }

    return ($weightIdentical + $weightSimilar) / $weightTotal * 100;
}
예제 #2
0
/**
 * Decides whether two contact records appear to describe the same person.
 *
 * Checks, in order:
 *  1. any cleaned name (primary or secondary) shared by both contacts;
 *  2. similarity of the first usernames, as scored by compareComplex(),
 *     above $thresholdUsernameSimilarity;
 *  3. the best word-by-word similarity over all name pairs, as scored by
 *     compareComplexMulti(), above $thresholdNameSimilarity.
 *
 * cleanName() is used as an array_walk callback and its return value is
 * ignored, so it is assumed to normalise its argument in place - confirm.
 *
 * @param array $c1 contact read via the keys 'email', 'name', 'secondaryNames', 'usernames'
 * @param array $c2 contact with the same keys
 *
 * @return int 1 when the contacts are considered a match, 0 otherwise
 */
function compareTwoContacts($c1, $c2)
{
    global $thresholdUsernameSimilarity, $thresholdNameSimilarity;
    logMsg('DEBUG', "Comparing " . $c1['email'] . " with " . $c2['email']);

    // Merge the primary name with the secondary names; a missing list is
    // treated as empty so count() below never receives null.
    $c1Names = isset($c1['secondaryNames']) ? (array) $c1['secondaryNames'] : array();
    $c2Names = isset($c2['secondaryNames']) ? (array) $c2['secondaryNames'] : array();
    if (!empty($c1['name'])) {
        $c1Names[] = $c1['name'];
    }
    if (!empty($c2['name'])) {
        $c2Names[] = $c2['name'];
    }

    $haveNames = count($c1Names) && count($c2Names);
    $commonNames = array();
    if ($haveNames) {
        array_walk($c1Names, 'cleanName');
        array_walk($c2Names, 'cleanName');
        // Names present on both contacts after cleaning.
        $commonNames = array_intersect($c1Names, $c2Names);
    }

    // A missing first username stays null, which is falsy in the check below.
    $c1Username = isset($c1['usernames'][0]) ? $c1['usernames'][0] : null;
    cleanName($c1Username);
    $c2Username = isset($c2['usernames'][0]) ? $c2['usernames'][0] : null;
    cleanName($c2Username);

    if (!empty($commonNames)) {
        // full names match
        return 1;
    }

    if ($c1Username && $c2Username) {
        $usernameSimilarity = compareComplex($c1Username, $c2Username);
        if ($usernameSimilarity && $usernameSimilarity > $thresholdUsernameSimilarity) {
            // very high username similarity
            logMsg('DEBUG', "Username similarity between " . $c1Username . " and " . $c2Username . " is {$usernameSimilarity}");
            return 1;
        }
    }

    if (!$haveNames) {
        return 0;
    }

    // Best similarity over every pair of names. The score is deliberately not
    // reset per $c1Name, so a strong match on an earlier name is not lost.
    $nameSimilarity = 0;
    foreach ($c1Names as $c1Name) {
        if (!$c1Name) {
            continue;
        }
        $c1NamesSplit = explode(' ', $c1Name);
        foreach ($c2Names as $c2Name) {
            if (!$c2Name) {
                continue;
            }
            $c2NamesSplit = explode(' ', $c2Name);
            $similarity = compareComplexMulti($c1NamesSplit, $c2NamesSplit);
            if ($similarity > $nameSimilarity) {
                logMsg('DEBUG', "Name similarity between " . $c1Name . " and " . $c2Name . " is {$similarity}");
                $nameSimilarity = $similarity;
            }
        }
    }

    // Names are judged against the name threshold, not the username one.
    return $nameSimilarity > $thresholdNameSimilarity ? 1 : 0;
}