Пример #1
0
 /**
  * Main function to get the STEM of a word
  * The word in param MUST BE IN UTF-8
  *
  * @param string $word
  * @throws \Exception
  * @return NULL|string
  */
 public function stem($word)
 {
     // The whole pipeline works on UTF-8 text, so reject anything else up front.
     // (The throw ends the method; no return is needed after it.)
     if (!Utf8::check($word)) {
         throw new \Exception('Word must be in UTF-8');
     }

     $this->word = Utf8::strtolower($word);

     // Region helpers defined elsewhere in the class; they run on the lower-cased word.
     $this->rv();
     $this->r1();
     $this->r2();

     $this->step0();

     // Snapshot taken after step 0, used to detect whether the later steps changed the word.
     $beforeStep1 = $this->word;
     $this->step1();

     // Do step 2a if no ending was removed by step 1.
     // Strict comparison on purpose: loose == compares numeric-looking strings by value.
     if ($this->word === $beforeStep1) {
         $this->step2a();
         // Do step 2b if step 2a was done, but failed to remove a suffix.
         if ($this->word === $beforeStep1) {
             $this->step2b();
         }
     }

     $this->step3();
     $this->finish();

     return $this->word;
 }
Пример #2
0
 /**
  * Main function to get the STEM of a word
  * The word in param MUST BE IN UTF-8
  *
  * @param string $word
  * @throws \Exception
  * @return NULL|string
  */
 public function stem($word)
 {
     // The whole pipeline works on UTF-8 text, so reject anything else up front.
     // (The throw ends the method; no return is needed after it.)
     if (!Utf8::check($word)) {
         throw new \Exception('Word must be in UTF-8');
     }

     $this->word = Utf8::strtolower($word);

     // First, remove all umlaut and acute accents.
     $this->word = Utf8::str_replace(
         array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'),
         array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'),
         $this->word
     );

     $this->plainVowels = implode('', self::$vowels);
     // Character class matching any single vowel, built once and reused below.
     $vowelClass = '[' . $this->plainVowels . ']';

     // Put initial y, y after a vowel, and i between vowels into upper case.
     $this->word = preg_replace('#^y#u', 'Y', $this->word);
     $this->word = preg_replace('#(' . $vowelClass . ')y#u', '$1Y', $this->word);
     $this->word = preg_replace('#(' . $vowelClass . ')i(' . $vowelClass . ')#u', '$1I$2', $this->word);

     // R1 and R2 are first set up in the standard way...
     $this->r1();
     $this->r2();

     // ...but then R1 is adjusted so that the region before it contains at least 3 letters.
     if ($this->r1Index < 3) {
         $this->r1Index = 3;
         $this->r1 = Utf8::substr($this->word, 3);
     }

     // Do each of steps 1, 2, 3 and 4. Step 2's return value is handed on to step 3b.
     $this->step1();
     $removedE = $this->step2();
     $this->step3a();
     $this->step3b($removedE);
     $this->step4();
     $this->finish();

     return $this->word;
 }
Пример #3
0
 /**
  * Main function to get the STEM of a word
  * The word in param MUST BE IN UTF-8
  *
  * @param string $word
  * @throws \Exception
  * @return NULL|string
  */
 public function stem($word)
 {
     // The whole pipeline works on UTF-8 text, so reject anything else up front.
     // (The throw ends the method; no return is needed after it.)
     if (!Utf8::check($word)) {
         throw new \Exception('Word must be in UTF-8');
     }

     $this->word = Utf8::strtolower($word);
     // Rewrite the tilde vowels as two-character sequences: ã -> a~, õ -> o~.
     $this->word = Utf8::str_replace(array('ã', 'õ'), array('a~', 'o~'), $this->word);

     // Region helpers defined elsewhere in the class.
     $this->rv();
     $this->r1();
     $this->r2();

     // Snapshot used to detect whether steps 1 and 2 changed the word.
     $beforeStep1 = $this->word;
     $this->step1();

     // Step 2 only runs when step 1 left the word untouched.
     // Strict comparison on purpose: loose == compares numeric-looking strings by value.
     if ($beforeStep1 === $this->word) {
         $this->step2();
     }

     // Step 3 runs when step 1 or step 2 altered the word; otherwise step 4 runs.
     if ($beforeStep1 !== $this->word) {
         $this->step3();
     } else {
         $this->step4();
     }

     $this->step5();
     $this->finish();

     return $this->word;
 }
Пример #4
0
 /**
  * Main function to get the STEM of a word
  * The word in param MUST BE IN UTF-8
  *
  * @param string $word
  * @throws \Exception
  * @return NULL|string
  */
 public function stem($word)
 {
     // The whole pipeline works on UTF-8 text, so reject anything else up front.
     // (The throw ends the method; no return is needed after it.)
     if (!Utf8::check($word)) {
         throw new \Exception('Word must be in UTF-8');
     }

     $this->plainVowels = implode('', self::$vowels);
     // Character class matching any single vowel, built once and reused below.
     $vowelClass = '[' . $this->plainVowels . ']';

     $this->word = Utf8::strtolower($word);

     // First, replace all acute accents by grave accents.
     $this->word = Utf8::str_replace(
         array('á', 'é', 'í', 'ó', 'ú'),
         array('à', 'è', 'ì', 'ò', 'ù'),
         $this->word
     );

     // And, as in French, put u after q, and u, i between vowels into upper case.
     $this->word = preg_replace('#([q])u#u', '$1U', $this->word);
     $this->word = preg_replace('#(' . $vowelClass . ')u(' . $vowelClass . ')#u', '$1U$2', $this->word);
     $this->word = preg_replace('#(' . $vowelClass . ')i(' . $vowelClass . ')#u', '$1I$2', $this->word);

     // Region helpers defined elsewhere in the class.
     $this->rv();
     $this->r1();
     $this->r2();

     $this->step0();

     // Snapshot taken after step 0, used to detect whether step 1 changed the word.
     $beforeStep1 = $this->word;
     $this->step1();

     // Do step 2 if no ending was removed by step 1.
     // Strict comparison on purpose: loose == compares numeric-looking strings by value.
     if ($beforeStep1 === $this->word) {
         $this->step2();
     }

     $this->step3a();
     $this->step3b();
     $this->finish();

     return $this->word;
 }
Пример #5
0
 /**
  * Main function to get the STEM of a word
  * The word in param MUST BE IN UTF-8
  *
  * @param string $word
  * @throws \Exception
  * @return NULL|string
  */
 public function stem($word)
 {
     // The whole pipeline works on UTF-8 text, so reject anything else up front.
     // (The throw ends the method; no return is needed after it.)
     if (!Utf8::check($word)) {
         throw new \Exception('Word must be in UTF-8');
     }

     $this->word = Utf8::strtolower($word);

     $this->plainVowels = implode('', self::$vowels);
     // Character class matching any single vowel, built once and reused below.
     $vowelClass = '[' . $this->plainVowels . ']';

     // First, i and u between vowels are put into upper case (so that they are treated as consonants).
     $this->word = preg_replace('#(' . $vowelClass . ')u(' . $vowelClass . ')#u', '$1U$2', $this->word);
     $this->word = preg_replace('#(' . $vowelClass . ')i(' . $vowelClass . ')#u', '$1I$2', $this->word);

     // Region helpers defined elsewhere in the class.
     $this->rv();
     $this->r1();
     $this->r2();

     $this->step0();

     // Snapshot taken after step 0, used to detect whether steps 1 and 2 changed the word.
     $beforeSteps = $this->word;

     // Repeat step 1 until a pass leaves the word unchanged.
     // Strict comparison on purpose: loose != compares numeric-looking strings by value.
     do {
         $beforePass = $this->word;
         $this->step1();
     } while ($this->word !== $beforePass);

     $this->step2();

     // Do step 3 if no suffix was removed either by step 1 or step 2.
     if ($beforeSteps === $this->word) {
         $this->step3();
     }

     $this->step4();
     $this->finish();

     return $this->word;
 }
Пример #6
0
 /**
  * Main function to get the STEM of a word
  * The word in param MUST BE IN UTF-8
  *
  * @param string $word
  * @throws \Exception
  * @return NULL|string
  */
 public function stem($word)
 {
     // The whole pipeline works on UTF-8 text, so reject anything else up front.
     // (The throw ends the method; no return is needed after it.)
     if (!Utf8::check($word)) {
         throw new \Exception('Word must be in UTF-8');
     }

     $this->plainVowels = implode('', self::$vowels);
     // Character class matching any single vowel, built once and reused below.
     $vowelClass = '[' . $this->plainVowels . ']';

     $this->word = Utf8::strtolower($word);

     // First, replace ß by ss.
     $this->word = Utf8::str_replace('ß', 'ss', $this->word);

     // Put u and y between vowels into upper case.
     $this->word = preg_replace('#(' . $vowelClass . ')y(' . $vowelClass . ')#u', '$1Y$2', $this->word);
     $this->word = preg_replace('#(' . $vowelClass . ')u(' . $vowelClass . ')#u', '$1U$2', $this->word);

     // R1 and R2 are first set up in the standard way...
     $this->r1();
     $this->r2();

     // ...but then R1 is adjusted so that the region before it contains at least 3 letters.
     if ($this->r1Index < 3) {
         $this->r1Index = 3;
         $this->r1 = Utf8::substr($this->word, 3);
     }

     $this->step1();
     $this->step2();
     $this->step3();
     $this->finish();

     return $this->word;
 }
Пример #7
0
 /**
  * Main function to get the STEM of a word
  * The word in param MUST BE IN UTF-8
  *
  * @param string $word
  * @throws \Exception
  * @return NULL|string
  */
 public function stem($word)
 {
     // The whole pipeline works on UTF-8 text, so reject anything else up front.
     // (The throw ends the method; no return is needed after it.)
     if (!Utf8::check($word)) {
         throw new \Exception('Word must be in UTF-8');
     }

     $this->word = Utf8::strtolower($word);
     $this->plainVowels = implode('', self::$vowels);

     // Step 0 runs before the region helpers are called.
     $this->step0();
     $this->rv();
     $this->r1();
     $this->r2();

     // Snapshot kept on the instance to know if step 1, 2a or 2b have altered the word.
     $this->originalWord = $this->word;

     $nextStep = $this->step1();

     // Do step 2a if either no ending was removed by step 1, or if one of endings
     // amment, emment, ment, ments was found.
     // $nextStep stays loosely compared because step1()'s return type is not visible here;
     // the word comparisons are strict so numeric-looking strings are not compared by value.
     if ($nextStep == 2 || $this->originalWord === $this->word) {
         $modified = $this->step2a();
         if (!$modified) {
             $this->step2b();
         }
     }

     // Step 3 runs when the word was altered so far; otherwise step 4 runs.
     if ($this->word !== $this->originalWord) {
         $this->step3();
     } else {
         $this->step4();
     }

     $this->step5();
     $this->step6();
     $this->finish();

     return $this->word;
 }
Пример #8
0
 /**
  * Main function to get the STEM of a word
  * The word in param MUST BE IN UTF-8
  *
  * @param string $word
  * @throws \Exception
  * @return NULL|string
  */
 public function stem($word)
 {
     // The whole pipeline works on UTF-8 text, so reject anything else up front.
     // (The throw ends the method; no return is needed after it.)
     if (!Utf8::check($word)) {
         throw new \Exception('Word must be in UTF-8');
     }

     // Words shorter than three characters are returned exactly as given (not lower-cased).
     if (Utf8::strlen($word) < 3) {
         return $word;
     }

     $this->word = Utf8::strtolower($word);

     // First exception check: a non-null result is returned as the final answer.
     $special = $this->exception1();
     if (null !== $special) {
         return $special;
     }

     $this->plainVowels = implode('', self::$vowels);

     // Remove initial ', if present.
     if (Utf8::substr($this->word, 0, 1) === "'") {
         $this->word = Utf8::substr($this->word, 1);
     }

     // Set initial y, or y after a vowel, to Y.
     // The initial-y test looks at the word as it is now, i.e. after the apostrophe
     // has been stripped, so a word such as 'yes also gets its leading y marked.
     $this->word = preg_replace('#^y#u', 'Y', $this->word);
     $this->word = preg_replace('#([' . $this->plainVowels . '])y#u', '$1Y', $this->word);

     // R1 is set up, then exceptionR1() runs before R2 is set up.
     $this->r1();
     $this->exceptionR1();
     $this->r2();

     $this->step0();
     $this->step1a();

     // Second exception check, made after step 1a: a non-null result is returned as the final answer.
     $special = $this->exception2();
     if (null !== $special) {
         return $special;
     }

     $this->step1b();
     $this->step1c();
     $this->step2();
     $this->step3();
     $this->step4();
     $this->step5();
     $this->finish();

     return $this->word;
 }
Пример #9
0
 /**
  * Main function to get the STEM of a word
  * The word in param MUST BE IN UTF-8
  *
  * @param string $word
  * @throws \Exception
  * @return NULL|string
  */
 public function stem($word)
 {
     // The whole pipeline works on UTF-8 text, so reject anything else up front.
     // (The throw ends the method; no return is needed after it.)
     if (!Utf8::check($word)) {
         throw new \Exception('Word must be in UTF-8');
     }

     $this->word = Utf8::strtolower($word);

     // R2 is not used: R1 is defined in the same way as in the German stemmer...
     $this->r1();

     // ...then R1 is adjusted so that the region before it contains at least 3 letters.
     if ($this->r1Index < 3) {
         $this->r1Index = 3;
         $this->r1 = Utf8::substr($this->word, 3);
     }

     // Do each of steps 1, 2 and 3.
     $this->step1();
     $this->step2();
     $this->step3();

     return $this->word;
 }
Пример #10
0
 static function strtolower($s)
 {
     // Upper-case letters that get an explicit lower-case form before the parent
     // implementation runs: dotted capital İ -> i, plain capital I -> dotless ı.
     // Order matters only in that it mirrors the two sequential replacements.
     $explicitLower = array(
         'İ' => 'i',
         'I' => 'ı',
     );

     foreach ($explicitLower as $upper => $lower) {
         // Skip the replacement entirely when the letter does not occur.
         if (strpos($s, $upper) === false) {
             continue;
         }
         $s = str_replace($upper, $lower, $s);
     }

     // Everything else is lower-cased by the parent class.
     return parent::strtolower($s);
 }