Ejemplo n.º 1
0
 /**
  * Scores each language against the character uni-grams found in $text.
  *
  * The text is loaded into a CStringUTF8, cleaned (user tags, hash tags, URLs,
  * unused characters and repeated separators are stripped, then the text is
  * lower-cased) and turned into a uni-gram histogram. For every histogram
  * entry, its frequency is added to the score of each language whose
  * reference profile contains a matching element. LNG_CHINESE is scored by a
  * numeric range test on the entry's first character instead of a profile.
  *
  * @param mixed $text Text passed straight to CStringUTF8::utf8_TextToArray().
  * @return array Summed frequencies, indexed by the language constants.
  */
 public function computeProbabilities($text)
 {
     // Load and normalise the text before counting characters.
     $cleaned = new CStringUTF8();
     $cleaned->utf8_TextToArray($text);
     $cleaned->stripUserTags();
     $cleaned->stripHashTags();
     $cleaned->stripURLs();
     $cleaned->stripUnusedChars();
     $cleaned->stripMultipleSeparator();
     $cleaned->uppercaseToLowercase();

     // Character uni-gram histogram of the cleaned text.
     $unigrams = $cleaned->histogramNgramChars(1);

     // Pre-zeroed score slots (sum of matching frequencies per language).
     $scores = array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

     $entry_count = count($unigrams->m_vector);
     for ($i = 0; $i < $entry_count; $i++) {
         $entry = $unigrams->m_vector[$i];

         // Range that the original code labels "Chinese characters".
         // NOTE(review): the bounds look like packed UTF-8 byte sequences rather
         // than code points - confirm against what uniord() returns.
         $ordinal = $this->uniord($entry->m_element[0]);
         if ($ordinal >= 0xe4b8a5 && $ordinal <= 0xe9bea0) {
             $scores[LNG_CHINESE] += $entry->m_frequency;
         }

         for ($lang = 0; $lang < NUMBER_LANGUAGES; $lang++) {
             // Chinese has been handled by the range test above.
             if ($lang == LNG_CHINESE) {
                 continue;
             }

             $reference = $this->m_language_profiles[$lang]->m_vector;
             $reference_count = count($reference);
             for ($j = 0; $j < $reference_count; $j++) {
                 // utf8_strcmp() is treated as truthy on a match; an entry is
                 // credited at most once per language.
                 if (utf8_strcmp($reference[$j]->m_element, $entry->m_element)) {
                     $scores[$lang] += $entry->m_frequency;
                     break;
                 }
             }
         }
     }

     return $scores;
 }
Ejemplo n.º 2
0
 /**
  * Scores each language by counting the words of $text found in its profile.
  *
  * The text is loaded into a CStringUTF8, cleaned (user tags, hash tags, URLs
  * and unused characters are stripped, words are separated, repeated
  * separators are collapsed, then the text is lower-cased) and split into
  * words. Each word adds one point to every language whose reference profile
  * contains a matching element.
  *
  * @param mixed $text Text passed straight to CStringUTF8::utf8_TextToArray().
  * @return array Match counts, indexed by language number.
  */
 public function computeProbabilities($text)
 {
     // Load and normalise the text before splitting it into words.
     $cleaned = new CStringUTF8();
     $cleaned->utf8_TextToArray($text);
     $cleaned->stripUserTags();
     $cleaned->stripHashTags();
     $cleaned->stripURLs();
     $cleaned->stripUnusedChars();
     $cleaned->separateWords();
     $cleaned->stripMultipleSeparator();
     $cleaned->uppercaseToLowercase();

     // Word list extracted from the cleaned text.
     $tokens = $cleaned->getWords();

     // Pre-zeroed score slots (number of matching words per language).
     $scores = array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

     $token_count = count($tokens);
     for ($t = 0; $t < $token_count; $t++) {
         for ($lang = 0; $lang < NUMBER_LANGUAGES; $lang++) {
             $lexicon = $this->m_language_profiles[$lang]->m_vector;
             $lexicon_count = count($lexicon);
             for ($k = 0; $k < $lexicon_count; $k++) {
                 // utf8_strcmp() is treated as truthy on a match; a word is
                 // credited at most once per language.
                 if (utf8_strcmp($lexicon[$k]->m_element, $tokens[$t])) {
                     $scores[$lang]++;
                     break;
                 }
             }
         }
     }

     return $scores;
 }
 /**
  * Builds a frequency histogram of the character n-grams in $this->m_string.
  *
  * Every window of $NGRAM consecutive elements is compared against the
  * elements already recorded; a match increments that element's frequency,
  * otherwise a new HistogramElement with frequency 1 is appended.
  *
  * @param int $NGRAM Number of consecutive elements per n-gram.
  * @return CHistogram Histogram whose m_vector lists each distinct n-gram, in
  *                    order of first appearance, with its frequency.
  */
 public function histogramNgramChars($NGRAM)
 {
     $result = new CHistogram();

     // Exclusive upper bound for a window start so the window stays in range.
     $limit = count($this->m_string) - ($NGRAM - 1);

     // The scan starts at index 1, so m_string[0] never begins an n-gram.
     // NOTE(review): presumably deliberate given how m_string is laid out -
     // confirm against utf8_TextToArray().
     for ($start = 1; $start < $limit; $start++) {
         // Collect the elements of the current window.
         $gram = array();
         for ($offset = 0; $offset < $NGRAM; $offset++) {
             $gram[] = $this->m_string[$start + $offset];
         }

         // Linear search for this n-gram among the ones already recorded;
         // utf8_strcmp() is treated as truthy on a match.
         $found = false;
         $known = count($result->m_vector);
         for ($slot = 0; $slot < $known && !$found; $slot++) {
             if (utf8_strcmp($result->m_vector[$slot]->m_element, $gram)) {
                 $result->m_vector[$slot]->m_frequency++;
                 $found = true;
             }
         }

         if ($found) {
             continue;
         }

         // First occurrence of this n-gram.
         $result->m_vector[] = new HistogramElement($gram, 1);
     }

     return $result;
 }
Ejemplo n.º 4
0
 /**
  * Picks the most likely language of $text within a pre-selected class.
  *
  * The text is cleaned (user tags, hash tags, URLs, unused characters and
  * repeated separators are stripped, then it is lower-cased) and reduced to a
  * character uni-gram histogram. Each candidate language of the class gets a
  * score: the summed frequency of the histogram entries that also occur in
  * that language's reference character file. The candidate with the highest
  * strictly positive score wins (the earliest listed candidate wins ties);
  * when every score is zero the class fallback is returned.
  *
  * Supported classes:
  *  - 3: Russian, fallback Bulgarian
  *  - 2: Persian, Urdu, fallback Arabic
  *  - 4: sixteen reference files (German ... Czech), fallback English
  *
  * Side effect: $this->m_language_profiles is replaced by the loaded
  * reference profile(s) - a single CHistogram for class 3, an array of
  * CHistogram objects (in candidate order) for classes 2 and 4.
  *
  * @param mixed $text            Text passed to CStringUTF8::utf8_TextToArray().
  * @param mixed $promising_class Class number; compared loosely, so "3" and 3
  *                               select the same class.
  * @return mixed One of the LNG_* constants, or null when $promising_class is
  *               not 2, 3 or 4.
  */
 public function languageIdentification($text, $promising_class)
 {
     $input_text = new CStringUTF8();
     $input_text->utf8_TextToArray($text);
     // pre-process the text
     $input_text->stripUserTags();
     $input_text->stripHashTags();
     $input_text->stripURLs();
     $input_text->stripUnusedChars();
     $input_text->stripMultipleSeparator();
     $input_text->uppercaseToLowercase();
     // create a histogram of characters uni-grams
     $histogram = $input_text->histogramNgramChars(1);

     // Candidate table for the requested class: (reference file, language).
     // The order matters: it is the order profiles are stored in, and the
     // earliest candidate wins when two scores are equal.
     // switch compares loosely, like the == tests it replaces, and the cases
     // are tried in the same order (3, then 2, then 4).
     $single_profile = false;
     switch ($promising_class) {
         case 3:
             $candidates = array(
                 array("./LID_tools/References/Languages/russian.txt", LNG_RUSSIAN),
             );
             $fallback = LNG_BULGARIAN;
             // This class stores one CHistogram rather than an array of them.
             $single_profile = true;
             break;
         case 2:
             $candidates = array(
                 array("./LID_tools/References/Languages/persian.txt", LNG_PERSIAN),
                 array("./LID_tools/References/Languages/urdu.txt", LNG_URDU),
             );
             $fallback = LNG_ARABIC;
             break;
         case 4:
             $candidates = array(
                 array("./LID_tools/References/Languages/german.txt", LNG_GERMAN),
                 array("./LID_tools/References/Languages/swedish.txt", LNG_SWEDISH),
                 array("./LID_tools/References/Languages/finnish.txt", LNG_FINNISH),
                 array("./LID_tools/References/Languages/albanian.txt", LNG_ALBANIAN),
                 array("./LID_tools/References/Languages/french.txt", LNG_FRENCH),
                 array("./LID_tools/References/Languages/irish.txt", LNG_IRISH),
                 array("./LID_tools/References/Languages/italian.txt", LNG_ITALIAN),
                 array("./LID_tools/References/Languages/spanish.txt", LNG_SPANISH),
                 array("./LID_tools/References/Languages/portuguese.txt", LNG_PORTUGUESE),
                 array("./LID_tools/References/Languages/hungarian.txt", LNG_HUNGARIAN),
                 array("./LID_tools/References/Languages/norwegian.txt", LNG_NORWEGIAN),
                 array("./LID_tools/References/Languages/danish.txt", LNG_DANISH),
                 array("./LID_tools/References/Languages/turkish.txt", LNG_TURKISH),
                 array("./LID_tools/References/Languages/polish.txt", LNG_POLISH),
                 array("./LID_tools/References/Languages/icelandic.txt", LNG_ICELANDIC),
                 array("./LID_tools/References/Languages/czech.txt", LNG_CZECH),
             );
             $fallback = LNG_ENGLISH;
             break;
         default:
             // Unsupported class: no language can be chosen. This was an
             // implicit fall-through before; the result is still null.
             return null;
     }

     // Create the (still empty) profiles and publish them on the instance
     // before loading, so the instance state matches the previous behaviour
     // even if loading a file fails part-way.
     $profiles = array();
     foreach ($candidates as $index => $candidate) {
         $profiles[$index] = new CHistogram();
     }
     $this->m_language_profiles = $single_profile ? $profiles[0] : $profiles;
     foreach ($candidates as $index => $candidate) {
         $profiles[$index]->loadCharsFromFile($candidate[0]);
     }

     // Score every candidate and keep the best strictly positive score.
     $entries = $histogram->m_vector;
     $entry_count = count($entries);
     $best_score = 0;
     $best_language = $fallback;
     foreach ($profiles as $index => $profile) {
         $reference = $profile->m_vector;
         $reference_count = count($reference);
         $score = 0;
         for ($char = 0; $char < $entry_count; $char++) {
             for ($char_ref = 0; $char_ref < $reference_count; $char_ref++) {
                 // utf8_strcmp() is treated as truthy on a match; each
                 // histogram entry is credited at most once per profile.
                 if (utf8_strcmp($reference[$char_ref]->m_element, $entries[$char]->m_element)) {
                     $score += $entries[$char]->m_frequency;
                     break;
                 }
             }
         }
         // Strict comparison: ties keep the earlier candidate, and an
         // all-zero result keeps the fallback.
         if ($score > $best_score) {
             $best_score = $score;
             $best_language = $candidates[$index][1];
         }
     }

     return $best_language;
 }