/**
 * Calculate the Daitch-Mokotoff soundex for a word.
 *
 * The word is upper-cased, rewritten using self::$transformNameTable, and then
 * consumed left to right. At each position the longest chunk (at most
 * self::MAXCHAR bytes) that has an entry in self::$dmsounds is used; bytes that
 * match no entry are skipped.
 *
 * As used here, a self::$dmsounds entry is laid out as:
 *   [0]        flag; any value other than '0' marks the chunk as a vowel
 *   [1],[2],[3] codes for state 1 (start of word), 2 (before a vowel), 3 (other)
 *   [4..6], …  further triplets holding alternative codes for the same chunk
 * An empty code means "ignore this sound in this state". Each alternative
 * triplet multiplies the number of candidate codes, which is why a list is returned.
 *
 * @param string $name
 *
 * @return string[] List of possible DM codes for the word. Each code is exactly
 *                  6 characters (zero-padded or truncated). The list is empty
 *                  when the word contains no recognisable sounds.
 */
private static function daitchMokotoffWord($name)
{
    // Apply special transformation rules to the input string
    $name = I18N::strtoupper($name);
    foreach (self::$transformNameTable as $transformRule) {
        $name = str_replace($transformRule[0], $transformRule[1], $name);
    }

    // Initialize
    $name_script = I18N::textScript($name);
    // For these scripts, a repeated sound is kept rather than collapsed (see below)
    $noVowels = $name_script == 'Hebr' || $name_script == 'Arab';

    $lastPos         = strlen($name) - 1; // Positions are byte offsets
    $currPos         = 0;
    $state           = 1; // 1: start of input string, 2: before vowel, 3: other
    $result          = array(); // accumulate complete 6-digit D-M codes here
    $partialResult   = array(); // accumulate incomplete D-M codes here
    $partialResult[] = array('!'); // initialize 1st partial result ('!' stops "duplicate sound" check)

    // Loop through the input string.
    // Stop when the string is exhausted or when no more partial results remain
    while (count($partialResult) !== 0 && $currPos <= $lastPos) {
        // Find the DM coding table entry for the chunk at the current position
        $thisEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
        while ($thisEntry != '') {
            if (isset(self::$dmsounds[$thisEntry])) {
                break;
            }
            $thisEntry = substr($thisEntry, 0, -1); // Not in table: try a shorter chunk
        }
        if ($thisEntry === '') {
            $currPos++; // Not in table: advance pointer to next byte
            continue; // and try again
        }

        $soundTableEntry = self::$dmsounds[$thisEntry];
        $workingResult   = $partialResult;
        $partialResult   = array();
        $currPos += strlen($thisEntry);

        // Not at beginning of input string: the state depends on what follows this chunk.
        // (After the first matched chunk, $state has been advanced past 1 by the
        // triplet loop below, so this branch is taken for every later chunk.)
        if ($state != 1) {
            if ($currPos <= $lastPos) {
                // Determine whether the next chunk is a vowel
                $nextEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
                while ($nextEntry != '') {
                    if (isset(self::$dmsounds[$nextEntry])) {
                        break;
                    }
                    $nextEntry = substr($nextEntry, 0, -1); // Not in table: try a shorter chunk
                }
            } else {
                $nextEntry = '';
            }
            if ($nextEntry != '' && self::$dmsounds[$nextEntry][0] != '0') {
                // Next chunk is a vowel
                $state = 2;
            } else {
                // Next chunk is not a vowel, is not in the table, or we are at the end of the word
                $state = 3;
            }
        }

        // Visit the code for the current state in every triplet of the table entry
        while ($state < count($soundTableEntry)) {
            // empty means 'ignore this sound in this state'
            if ($soundTableEntry[$state] == '') {
                foreach ($workingResult as $workingEntry) {
                    $tempEntry = $workingEntry;
                    $tempEntry[count($tempEntry) - 1] .= '!'; // Prevent false 'doubles'
                    $partialResult[] = $tempEntry;
                }
            } else {
                foreach ($workingResult as $workingEntry) {
                    if ($soundTableEntry[$state] !== $workingEntry[count($workingEntry) - 1]) {
                        // Incoming sound isn't a duplicate of the previous sound
                        $workingEntry[] = $soundTableEntry[$state];
                    } else {
                        // Incoming sound is a duplicate of the previous sound
                        // For Hebrew and Arabic, we need to create a pair of D-M sound codes,
                        // one of the pair with only a single occurrence of the duplicate sound,
                        // the other with both occurrences
                        // NOTE(review): only the "both occurrences" variant is produced here;
                        // the single-occurrence variant is not added. Confirm which is intended.
                        if ($noVowels) {
                            $workingEntry[] = $soundTableEntry[$state];
                        }
                    }

                    if (count($workingEntry) < 7) {
                        $partialResult[] = $workingEntry;
                    } else {
                        // This is the 6th code in the sequence
                        // We're looking for 7 entries because the first is '!' and doesn't count
                        $tempResult = str_replace('!', '', implode('', $workingEntry));
                        // Only return codes from recognisable sounds.
                        // Compare against '' explicitly: the string '0' is falsy in PHP.
                        if ($tempResult !== '') {
                            $result[] = substr($tempResult . '000000', 0, 6);
                        }
                    }
                }
            }
            $state = $state + 3; // Advance to next triplet while keeping the same basic state
        }
    }

    // Zero-fill and copy all remaining partial results
    foreach ($partialResult as $workingEntry) {
        $tempResult = str_replace('!', '', implode('', $workingEntry));
        // Only return codes from recognisable sounds.
        // Compare against '' explicitly: a word whose whole encoding is the single
        // code '0' is a valid result ('000000'), but '0' is falsy in PHP and would
        // be silently discarded by a plain truthiness test.
        if ($tempResult !== '') {
            $result[] = substr($tempResult . '000000', 0, 6);
        }
    }

    return $result;
}