예제 #1
0
 /**
  * Guesses which keyboard layout a phrase was typed in ("from") and which
  * layout it should be converted to ("to").
  *
  * Steps:
  *  1. Build the candidate language list: the caller's $lang array, or a
  *     list taken from CLanguage::GetList() with "en" always first.
  *  2. Let every candidate language short-circuit the guess through
  *     PreGuessLanguage().
  *  3. Keep the language(s) whose ConvertToScancode() yields the most
  *     non-empty entries for the text.
  *  4. Order those source languages with CSearchLanguage::cmp, build every
  *     "from=>to" pair against all candidates, order the pairs with the same
  *     comparator and take the first one.
  *
  * @param string      $text Phrase to analyse.
  * @param array|false $lang List of language ids to consider. Any non-array
  *                          value means "use the list from CLanguage".
  *
  * @return array|false array("from" => ..., "to" => ...) for the chosen pair,
  *                     or whatever array PreGuessLanguage() returned.
  *                     False when: the text is empty; fewer than two
  *                     candidate languages exist; PreGuessLanguage()
  *                     returned true; no language converted at least two
  *                     characters; or converting the text with the chosen
  *                     pair leaves it unchanged.
  */
 function GuessLanguage($text, $lang = false)
 {
     // Explicit null check keeps strlen() away from null (deprecated since
     // PHP 8.1); the outcome for null is the same as before: false.
     if ($text === null || strlen($text) <= 0) {
         return false;
     }

     static $cache = array();

     if (is_array($lang)) {
         $arLanguages = $lang;
     } else {
         // The language lookup only runs when the caller did not supply
         // a list, and only once per request thanks to the static cache.
         if (empty($cache)) {
             // English is always considered and always comes first.
             $cache[] = "en";
             $rsLanguages = CLanguage::GetList($b = "", $o = "");
             while ($arLanguage = $rsLanguages->Fetch()) {
                 if ($arLanguage["LID"] != "en") {
                     $cache[] = $arLanguage["LID"];
                 }
             }
         }
         $arLanguages = $cache;
     }

     // A layout switch needs at least two layouts to choose between.
     if (count($arLanguages) < 2) {
         return false;
     }

     // Give customized languages a chance to guess: an array is returned
     // to the caller as the final answer, true vetoes any guess.
     foreach ($arLanguages as $langId) {
         $ob = CSearchLanguage::GetLanguage($langId);
         $res = $ob->PreGuessLanguage($text, $langId);
         if (is_array($res)) {
             return $res;
         } elseif ($res === true) {
             return false;
         }
     }

     // First detect the language(s) used to type the phrase: the one(s)
     // with the highest number of non-empty scancodes. Ties are all kept.
     $languages_from = array();
     $max_len = 0;
     foreach ($arLanguages as $langId) {
         $ob = CSearchLanguage::GetLanguage($langId);
         $arCodes = $ob->ConvertToScancode($text, true);
         $convertedCount = count(array_filter($arCodes));
         if ($convertedCount > $max_len) {
             $max_len = $convertedCount;
             $languages_from = array($langId => $arCodes);
         } elseif ($convertedCount == $max_len) {
             $languages_from[$langId] = $arCodes;
         }
     }

     // Fewer than two converted characters is too little to judge.
     if ($max_len < 2 || count($languages_from) <= 0) {
         return false;
     }

     // If more than one language is detected as input, order them with
     // CSearchLanguage::cmp.
     // Row layout: [has trigrams, trigram check, language object,
     // scancodes, insertion index].
     // NOTE(review): which slots cmp actually compares is not visible
     // here - confirm before reordering them.
     // NOTE(review): this stage asks HasTrigrams() while the pair stage
     // below asks HasBigrammInfo(); kept as found - confirm it is intended.
     $arDetectionFrom = array();
     $i = 0;
     foreach ($languages_from as $langFrom => $arScanCodes) {
         $ob = CSearchLanguage::GetLanguage($langFrom);
         $arDetectionFrom[$langFrom] = array(
             $ob->HasTrigrams(),
             $ob->CheckTrigrams($arScanCodes),
             $ob,
             $arScanCodes,
             $i,
         );
         $i++;
     }
     uasort($arDetectionFrom, array("CSearchLanguage", "cmp"));

     // Now score every "from=>to" pair, walking source languages in the
     // order established above.
     // Row layout: [has bigram info, trigram check of the source scancodes
     // against the target language, target language object, scancodes,
     // conversion changes the text, insertion index, pair key].
     $arDetection = array();
     $i = 0;
     foreach (array_keys($arDetectionFrom) as $langFrom) {
         $arScanCodes = $languages_from[$langFrom];
         foreach ($arLanguages as $langTo) {
             $lang_from_to = $langFrom . "=>" . $langTo;
             $ob = CSearchLanguage::GetLanguage($langTo);
             $hasBigramInfo = $ob->HasBigrammInfo();
             $trigramCheck = $ob->CheckTrigrams($arScanCodes);
             $alt_text = CSearchLanguage::ConvertKeyboardLayout($text, $langFrom, $langTo);
             $arDetection[$lang_from_to] = array(
                 $hasBigramInfo,
                 $trigramCheck,
                 $ob,
                 $arScanCodes,
                 $alt_text !== $text,
                 $i,
                 $lang_from_to,
             );
             $i++;
         }
     }
     uasort($arDetection, array("CSearchLanguage", "cmp"));

     // The first pair after sorting is the winner; reset() makes the
     // internal pointer position explicit before reading the key.
     reset($arDetection);
     $language_from_to = key($arDetection);
     list($language_from, $language_to) = explode("=>", $language_from_to);

     // A winning pair that does not change the text is not a suggestion.
     $alt_text = CSearchLanguage::ConvertKeyboardLayout($text, $language_from, $language_to);
     if ($alt_text === $text) {
         return false;
     }

     return array("from" => $language_from, "to" => $language_to);
 }