/**
 * Compute an order-independent similarity score between two token lists
 * (e.g. the words of two names).
 *
 * Tokens present in both lists count as one full point each. Every remaining
 * token of $a1 is then paired greedily with its best-scoring remaining token
 * of $a2; each token of $a2 is used at most once. The sum is normalised by
 * the length of the longer list.
 *
 * compareComplex() is defined elsewhere; its result is divided by 100 here,
 * so it is assumed to return a 0..100 percentage — TODO confirm.
 *
 * @param array $a1 first list of tokens
 * @param array $a2 second list of tokens
 *
 * @return int|float similarity scaled by 100; 0 when both lists are empty
 */
function compareComplexMulti($a1, $a2)
{
    $weightTotal = max(count($a1), count($a2));
    if ($weightTotal === 0) {
        // two empty lists: nothing to compare, and dividing by zero must be avoided
        return 0;
    }

    // similarity of names, no matter of name order
    $intersection = array_intersect($a1, $a2);
    $weightIdentical = count($intersection);

    // array_diff() preserves the original keys, so both arrays may be sparse.
    // It has also removed every token that has an exact counterpart, so all
    // remaining pairs need the fuzzy comparison.
    $a1Available = array_diff($a1, $intersection);
    $a2Available = array_diff($a2, $intersection);

    $weightSimilar = 0;
    foreach ($a1Available as $v1) {
        if (!count($a2Available)) {
            // no candidates left to pair with
            break;
        }

        // find the best remaining candidate, remembering its real key in
        // $a2Available so exactly that candidate is consumed afterwards
        $found = false;
        $bestKey = null;
        $bestScore = 0;
        foreach ($a2Available as $key => $v2) {
            // calculate similarity based on weights
            $score = compareComplex($v1, $v2);
            if (!$found || $score > $bestScore) {
                $found = true;
                $bestKey = $key;
                $bestScore = $score;
            }
        }

        $weightSimilar += $bestScore / 100;
        unset($a2Available[$bestKey]);
    }

    return ($weightIdentical + $weightSimilar) / $weightTotal * 100;
}
/**
 * Decide whether two contact records appear to describe the same person.
 *
 * Checks, in order:
 *   1. any cleaned full name (primary or secondary) shared by both contacts;
 *   2. similarity of the first usernames above $thresholdUsernameSimilarity;
 *   3. best word-wise name similarity over all name pairs above
 *      $thresholdNameSimilarity.
 *
 * Relies on logMsg(), cleanName(), compareComplex() and compareComplexMulti(),
 * which are defined elsewhere. cleanName() is invoked the way the original
 * code does (directly and through array_walk), so it presumably normalises
 * its argument by reference — verify against its definition.
 *
 * @param array $c1 contact with keys 'email', 'name', 'secondaryNames', 'usernames'
 * @param array $c2 contact with the same keys
 *
 * @return int 1 when the contacts are considered a match, 0 otherwise
 */
function compareTwoContacts($c1, $c2)
{
    global $thresholdUsernameSimilarity, $thresholdNameSimilarity;

    logMsg('DEBUG', "Comparing " . $c1['email'] . " with " . $c2['email']);

    // merge primary name with secondary names; tolerate a missing list
    $c1Names = (isset($c1['secondaryNames']) && is_array($c1['secondaryNames']))
        ? $c1['secondaryNames']
        : array();
    $c2Names = (isset($c2['secondaryNames']) && is_array($c2['secondaryNames']))
        ? $c2['secondaryNames']
        : array();
    if (!empty($c1['name'])) {
        $c1Names[] = $c1['name'];
    }
    if (!empty($c2['name'])) {
        $c2Names[] = $c2['name'];
    }

    // if we have names
    $commonNames = array();
    if (count($c1Names) && count($c2Names)) {
        // clean names
        array_walk($c1Names, 'cleanName');
        array_walk($c2Names, 'cleanName');

        // drop empty names so two blank entries never count as an exact match
        $c1Names = array_filter($c1Names);
        $c2Names = array_filter($c2Names);

        // check for exact items
        $commonNames = array_intersect($c1Names, $c2Names);
    }

    // only the first username of each contact is compared
    $c1Username = isset($c1['usernames'][0]) ? $c1['usernames'][0] : null;
    cleanName($c1Username);
    $c2Username = isset($c2['usernames'][0]) ? $c2['usernames'][0] : null;
    cleanName($c2Username);

    // check similarity
    if (!empty($commonNames)) {
        // full names match
        return 1;
    }

    if ($c1Username && $c2Username
        && ($usernameSimilarity = compareComplex($c1Username, $c2Username))
        && $usernameSimilarity > $thresholdUsernameSimilarity
    ) {
        // very high username similarity
        logMsg('DEBUG', "Username similarity between " . $c1Username . " and " . $c2Username . " is {$usernameSimilarity}");
        return 1;
    }

    if (!count($c1Names) || !count($c2Names)) {
        return 0;
    }

    // Names are judged against their own threshold; fall back to the username
    // threshold when the name threshold has not been configured.
    $nameThreshold = isset($thresholdNameSimilarity)
        ? $thresholdNameSimilarity
        : $thresholdUsernameSimilarity;

    // check names similarity: keep the best score over ALL name pairs
    $nameSimilarity = 0;
    foreach ($c1Names as $c1Name) {
        $c1NamesSplit = explode(' ', $c1Name);
        foreach ($c2Names as $c2Name) {
            $c2NamesSplit = explode(' ', $c2Name);
            $similarity = compareComplexMulti($c1NamesSplit, $c2NamesSplit);
            if ($similarity > $nameSimilarity) {
                logMsg('DEBUG', "Name similarity between " . $c1Name . " and " . $c2Name . " is {$similarity}");
                $nameSimilarity = $similarity;
            }
        }
    }

    return $nameSimilarity > $nameThreshold ? 1 : 0;
}