示例#1
0
 /**
  * Calculate the Daitch-Mokotoff soundex for a word.
  *
  * The word is upper-cased, rewritten using self::$transformNameTable, and then
  * scanned left to right. At each position the longest chunk (at most
  * self::MAXCHAR bytes) that is a key of self::$dmsounds is consumed. Each table
  * entry is read as: index 0 = vowel flag (compared against '0'), followed by
  * triplets of codes indexed by state (1: start of word, 2: before a vowel,
  * 3: anything else). Every additional triplet is an alternative pronunciation
  * and forks the set of partial codes.
  *
  * Codes are truncated or zero-padded to six digits. No de-duplication is
  * performed here, so the same code may appear more than once.
  *
  * @param string $name
  *
  * @return string[] List of possible DM codes for the word.
  */
 private static function daitchMokotoffWord($name)
 {
     // Apply special transformation rules to the input string
     $name = I18N::strtoupper($name);
     foreach (self::$transformNameTable as $transformRule) {
         $name = str_replace($transformRule[0], $transformRule[1], $name);
     }

     // Initialize
     $name_script = I18N::textScript($name);
     $noVowels = $name_script == 'Hebr' || $name_script == 'Arab';
     // Positions are byte offsets: the sound table is probed with byte-wise substr()
     $lastPos = strlen($name) - 1;
     $currPos = 0;
     // 1: start of input string, 2: before vowel, 3: other
     $state = 1;
     // Accumulate complete 6-digit D-M codes here
     $result = array();
     // Accumulate incomplete D-M codes here.
     // The initial '!' of the 1st partial result stops the "duplicate sound" check
     $partialResult = array(array('!'));

     // Loop through the input string.
     // Stop when the string is exhausted or when no more partial results remain
     while (count($partialResult) !== 0 && $currPos <= $lastPos) {
         // Find the DM coding table entry for the chunk at the current position,
         // starting with the maximum length chunk
         $thisEntry = substr($name, $currPos, self::MAXCHAR);
         while ($thisEntry != '') {
             if (isset(self::$dmsounds[$thisEntry])) {
                 break;
             }
             // Not in table: try a shorter chunk
             $thisEntry = substr($thisEntry, 0, -1);
         }
         if ($thisEntry === '') {
             // Not in table: advance pointer to next byte and try again
             $currPos++;
             continue;
         }

         $soundTableEntry = self::$dmsounds[$thisEntry];
         $workingResult = $partialResult;
         $partialResult = array();
         $currPos += strlen($thisEntry);

         // Not at beginning of input string.
         // (After the first coded chunk, $state has been advanced past 1 by the
         // triplet loop below, so this branch is taken for every later chunk.)
         if ($state != 1) {
             if ($currPos <= $lastPos) {
                 // Determine whether the next chunk is a vowel,
                 // starting with the maximum length chunk
                 $nextEntry = substr($name, $currPos, self::MAXCHAR);
                 while ($nextEntry != '') {
                     if (isset(self::$dmsounds[$nextEntry])) {
                         break;
                     }
                     // Not in table: try a shorter chunk
                     $nextEntry = substr($nextEntry, 0, -1);
                 }
             } else {
                 $nextEntry = '';
             }
             if ($nextEntry != '' && self::$dmsounds[$nextEntry][0] != '0') {
                 // Next chunk is a vowel
                 $state = 2;
             } else {
                 // Next chunk is not a vowel, is unknown, or we are at the end of the input
                 $state = 3;
             }
         }

         // Walk every alternative triplet for this chunk, using the same basic state
         while ($state < count($soundTableEntry)) {
             if ($soundTableEntry[$state] == '') {
                 // Empty means 'ignore this sound in this state'
                 foreach ($workingResult as $workingEntry) {
                     $tempEntry = $workingEntry;
                     // Prevent false 'doubles': the marker makes the last code
                     // compare unequal to an identical code that follows
                     $tempEntry[count($tempEntry) - 1] .= '!';
                     $partialResult[] = $tempEntry;
                 }
             } else {
                 foreach ($workingResult as $workingEntry) {
                     if ($soundTableEntry[$state] !== $workingEntry[count($workingEntry) - 1]) {
                         // Incoming sound isn't a duplicate of the previous sound
                         $workingEntry[] = $soundTableEntry[$state];
                     } else {
                         // Incoming sound is a duplicate of the previous sound.
                         // For Hebrew and Arabic the duplicate is kept; for other
                         // scripts it is dropped.
                         // NOTE(review): the original comment spoke of creating a pair
                         // of codes (with and without the duplicate), but only the
                         // variant with both occurrences is produced - confirm intent.
                         if ($noVowels) {
                             $workingEntry[] = $soundTableEntry[$state];
                         }
                     }
                     if (count($workingEntry) < 7) {
                         $partialResult[] = $workingEntry;
                     } else {
                         // This is the 6th code in the sequence
                         // We're looking for 7 entries because the first is '!' and doesn't count
                         $tempResult = str_replace('!', '', implode('', $workingEntry));
                         // Only return codes from recognisable sounds.
                         // Compare against '' explicitly: the string '0' is falsy in PHP.
                         if ($tempResult !== '') {
                             $result[] = substr($tempResult . '000000', 0, 6);
                         }
                     }
                 }
             }
             // Advance to next triplet while keeping the same basic state
             $state = $state + 3;
         }
     }

     // Zero-fill and copy all remaining partial results
     foreach ($partialResult as $workingEntry) {
         $tempResult = str_replace('!', '', implode('', $workingEntry));
         // Only return codes from recognisable sounds.
         // A word made only of vowel sounds accumulates the single code '0', which
         // PHP treats as falsy; test for the empty string so it is kept as 000000.
         if ($tempResult !== '') {
             $result[] = substr($tempResult . '000000', 0, 6);
         }
     }

     return $result;
 }