/**
 * Perform an intelligent detection based on clusterLanguages()
 *
 * WARNING: this method is EXPERIMENTAL. It is not recommended for common
 * use, and it may disappear or its functionality may change in future
 * releases without notice.
 *
 * This compares the sample text to the top level of clusters. If the
 * sample is similar to the cluster it will drop down and compare it to the
 * languages in the cluster, and so on until it hits a leaf node.
 *
 * This should find the language in considerably fewer compares
 * (the equivalent of a binary search), however clusterLanguages() is costly
 * and the loss of accuracy from this technique is significant.
 *
 * This method may need to be 'fuzzier' in order to become more accurate.
 *
 * This function could be more useful if the universe of possible languages
 * was very large, however in such cases some method of Bayesian inference
 * might be more helpful.
 *
 * The returned array is deliberately not fully sorted: it preserves the
 * order in which nodes were compared so the path taken through the tree
 * stays visible. In the default mode the array is reversed before being
 * returned, so the final winner is the first element; in Perl-compatible
 * mode the insertion order is kept and the final winner is the last element.
 *
 * @param string $str input string
 *
 * @return array language scores (only those compared); empty if the input
 *               is invalid, yields no trigrams, or there are no clusters
 * @throws Text_LanguageDetect_Exception
 * @see    clusterLanguages()
 */
public function clusteredSearch($str)
{
    // input check
    if (!Text_LanguageDetect_Parser::validateString($str)) {
        return array();
    }

    // clusterLanguages() will return a cached result if possible
    // so it's safe to call it every time
    $result = $this->clusterLanguages();

    $dendrogram_start = $result['open_forks'];
    $dendrogram_data  = $result['fork_data'];
    $dendrogram_alias = $result['name_map'];

    $sample_obj = new Text_LanguageDetect_Parser($str);
    $sample_obj->prepareTrigram();
    $sample_obj->setPadStart(!$this->_perl_compatible);
    $sample_obj->analyze();
    $sample_result = $sample_obj->getTrigramRanks();
    $sample_count  = count($sample_result);

    // input check
    if ($sample_count === 0) {
        return array();
    }

    // score every top-level fork against the sample
    $scores = array();
    foreach ($dendrogram_start as $lang) {
        $scores[$lang] = $this->_scoreClusterNode(
            $lang, $dendrogram_alias, $sample_result, $sample_count
        );
    }

    // nothing to descend into: avoid sorting/reading an empty score list
    if (count($scores) === 0) {
        return array();
    }

    // best match first: the lowest score wins in Perl-compatible mode,
    // the highest score wins otherwise
    if ($this->_perl_compatible) {
        asort($scores);
    } else {
        arsort($scores);
    }

    // of the starting forks, this key is the most similar to the sample
    reset($scores);
    $cur_key = key($scores);

    // walk down the tree, always following the child that fits better
    while (isset($dendrogram_data[$cur_key])) {
        $lang1 = $dendrogram_data[$cur_key]['bestfit'];
        $lang2 = $dendrogram_data[$cur_key]['otherfit'];

        // todo: one of these comparisons may repeat work already done
        // for the parent node
        foreach (array($lang1, $lang2) as $lang) {
            $scores[$lang] = $this->_scoreClusterNode(
                $lang, $dendrogram_alias, $sample_result, $sample_count
            );
        }

        // "better" must follow the same direction as the sort above,
        // otherwise Perl-compatible mode would descend into the worse
        // branch. On a tie $lang2 is chosen in both modes.
        if ($this->_perl_compatible) {
            $lang1_wins = $scores[$lang1] < $scores[$lang2];
        } else {
            $lang1_wins = $scores[$lang1] > $scores[$lang2];
        }

        $cur_key = $lang1_wins ? $lang1 : $lang2;
    }

    // rather than sorting the result, preserve it so that you can see
    // which paths the algorithm decided to take along the tree
    // but sometimes the last item is only the second best, in which case
    // the last two entries are swapped. This needs at least two entries.
    if (count($scores) > 1) {
        $last_score = end($scores);
        $prev_score = prev($scores); // pointer now on the 2nd-to-last item
        $prev_key   = key($scores);

        if ($this->_perl_compatible) {
            $out_of_order = $last_score > $prev_score;
        } else {
            $out_of_order = $last_score < $prev_score;
        }

        if ($out_of_order) {
            // re-inserting the 2nd-to-last item moves it to the end
            unset($scores[$prev_key]);
            $scores[$prev_key] = $prev_score;
        }
    }

    if (!$this->_perl_compatible) {
        // second param (preserve keys) requires php > 4.0.3
        $scores = array_reverse($scores, true);
    }

    return $scores;
}

/**
 * Score one dendrogram node against the analysed sample
 *
 * Resolves the node name through the alias map (falling back to the node
 * name itself when no alias exists), then returns the normalized distance
 * between that language database entry and the sample.
 *
 * @param int|string $node          dendrogram node name
 * @param array      $alias         map of node name to language database key
 * @param array      $sample_result trigram ranks of the sample
 * @param int        $sample_count  number of trigrams in the sample
 *
 * @return mixed normalized score as produced by _normalize_score()
 */
private function _scoreClusterNode($node, $alias, $sample_result, $sample_count)
{
    $lang_key = isset($alias[$node]) ? $alias[$node] : $node;

    return $this->_normalize_score(
        $this->_distance($this->_lang_db[$lang_key], $sample_result),
        $sample_count
    );
}