Exemplo n.º 1
0
 /**
  * Perform an intelligent detection based on clusterLanguages()
  *
  * WARNING: this method is EXPERIMENTAL. It is not recommended for common
  * use, and it may disappear or its functionality may change in future
  * releases without notice.
  *
  * This compares the sample text to the top level of clusters. If the
  * sample is similar to the cluster it will drop down and compare it to the
  * languages in the cluster, and so on until it hits a leaf node.
  *
  * This should find the language in considerably fewer compares
  * (the equivalent of a binary search), however clusterLanguages() is costly
  * and the loss of accuracy from this technique is significant.
  *
  * This method may need to be 'fuzzier' in order to become more accurate.
  *
  * This function could be more useful if the universe of possible languages
  * was very large, however in such cases some method of Bayesian inference
  * might be more helpful.
  *
  * @param string $str input string
  *
  * @return array language scores (only those compared)
  * @throws Text_LanguageDetect_Exception
  * @see    clusterLanguages()
  */
 public function clusteredSearch($str)
 {
     // reject input the parser reports as invalid
     if (!Text_LanguageDetect_Parser::validateString($str)) {
         return array();
     }

     // clusterLanguages() will return a cached result if possible
     // so it's safe to call it every time
     $result = $this->clusterLanguages();
     $dendogram_start = $result['open_forks'];
     $dendogram_data = $result['fork_data'];
     $dendogram_alias = $result['name_map'];

     // build the trigram ranking of the sample text
     $sample_obj = new Text_LanguageDetect_Parser($str);
     $sample_obj->prepareTrigram();
     $sample_obj->setPadStart(!$this->_perl_compatible);
     $sample_obj->analyze();
     $sample_result = $sample_obj->getTrigramRanks();
     $sample_count = count($sample_result);

     // nothing to compare against if the sample produced no trigrams
     if ($sample_count === 0) {
         return array();
     }

     // start from an explicit array so the sort below never receives an
     // undefined variable when there are no starting forks
     $scores = array();
     foreach ($dendogram_start as $lang) {
         // a node name may be an alias for the key used in the language db
         $lang_key = isset($dendogram_alias[$lang]) ? $dendogram_alias[$lang] : $lang;
         $scores[$lang] = $this->_normalize_score(
             $this->_distance($this->_lang_db[$lang_key], $sample_result),
             $sample_count
         );
     }

     // no starting forks means there is nothing to descend into
     if (count($scores) === 0) {
         return array();
     }

     // best match first: ascending in perl-compatible mode,
     // descending otherwise
     if ($this->_perl_compatible) {
         asort($scores);
     } else {
         arsort($scores);
     }

     // of starting forks, the first key is the most similar to the sample
     reset($scores);
     $cur_key = key($scores);

     // walk down the tree, always following the better-scoring child,
     // until a node without fork data (a leaf) is reached
     while (isset($dendogram_data[$cur_key])) {
         $lang1 = $dendogram_data[$cur_key]['bestfit'];
         $lang2 = $dendogram_data[$cur_key]['otherfit'];

         foreach (array($lang1, $lang2) as $lang) {
             $lang_key = isset($dendogram_alias[$lang]) ? $dendogram_alias[$lang] : $lang;
             $scores[$lang] = $this->_normalize_score(
                 $this->_distance($this->_lang_db[$lang_key], $sample_result),
                 $sample_count
             );
             //todo: does not need to do same comparison again
         }

         // use the same notion of "better" as the initial sort above:
         // the lower score wins in perl-compatible mode, the higher one
         // otherwise; on a tie the second child is followed
         if ($this->_perl_compatible) {
             $first_wins = $scores[$lang1] < $scores[$lang2];
         } else {
             $first_wins = $scores[$lang1] > $scores[$lang2];
         }
         $cur_key = $first_wins ? $lang1 : $lang2;
     }

     // rather than sorting the result, preserve it so that you can see
     // which paths the algorithm decided to take along the tree
     // but sometimes the last item is only the second highest
     $keys = array_keys($scores);
     $total = count($keys);

     // the reorder only makes sense when there are two items to compare
     if ($total >= 2) {
         $last_key = $keys[$total - 1];
         $prev_key = $keys[$total - 2];
         $last_score = $scores[$last_key];
         $prev_score = $scores[$prev_key];

         if ($this->_perl_compatible) {
             $misplaced = $last_score > $prev_score;
         } else {
             $misplaced = $last_score < $prev_score;
         }

         if ($misplaced) {
             // swaps the 2nd-to-last item for the last item by
             // removing it and appending it again
             unset($scores[$prev_key]);
             $scores[$prev_key] = $prev_score;
         }
     }

     if (!$this->_perl_compatible) {
         // second param (preserve keys) requires php > 4.0.3
         $scores = array_reverse($scores, true);
     }

     return $scores;
 }