/**
 * Compute the stem of a single word.
 *
 * The word is lower-cased, the region helpers rv(), r1() and r2() are run, and
 * steps 0, 1, 2a/2b and 3 are applied before the final finish() pass.
 *
 * @param string $word the word to stem; it MUST be UTF-8 encoded
 *
 * @throws \Exception when $word is not valid UTF-8
 *
 * @return string the stemmed word (the final value of $this->word)
 */
public function stem($word)
{
    // Everything below assumes UTF-8 input.
    if (!Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = Utf8::strtolower($word);

    $this->rv();
    $this->r1();
    $this->r2();

    $this->step0();

    // Snapshot taken after step 0; used to detect whether a later step changed the word.
    $word = $this->word;
    $this->step1();

    // Do step 2a if no ending was removed by step 1.
    // Strict comparison: "==" would compare numeric-looking strings by value.
    if ($this->word === $word) {
        $this->step2a();

        // Do step 2b if step 2a was done, but failed to remove a suffix.
        if ($this->word === $word) {
            $this->step2b();
        }
    }

    $this->step3();
    $this->finish();

    return $this->word;
}
/**
 * Compute the stem of a single word.
 *
 * The word is lower-cased, umlauts and acute accents are stripped, some y/i
 * letters are upper-cased as markers, R1 (with a minimum offset of 3) and R2
 * are set up, and steps 1, 2, 3a, 3b and 4 are applied before finish().
 *
 * @param string $word the word to stem; it MUST be UTF-8 encoded
 *
 * @throws \Exception when $word is not valid UTF-8
 *
 * @return string the stemmed word (the final value of $this->word)
 */
public function stem($word)
{
    // Everything below assumes UTF-8 input.
    if (!Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = Utf8::strtolower($word);

    // First, remove all umlaut and acute accents.
    $this->word = Utf8::str_replace(
        array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'),
        array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'),
        $this->word
    );

    $this->plainVowels = implode('', self::$vowels);

    // Put initial y, y after a vowel, and i between vowels into upper case.
    $this->word = preg_replace('#^y#u', 'Y', $this->word);
    $this->word = preg_replace('#([' . $this->plainVowels . '])y#u', '$1Y', $this->word);
    $this->word = preg_replace(
        '#([' . $this->plainVowels . '])i([' . $this->plainVowels . '])#u',
        '$1I$2',
        $this->word
    );

    // R1 and R2 are first set up in the standard way...
    $this->r1();
    $this->r2();

    // ...but then R1 is adjusted so that the region before it contains at least 3 letters.
    if ($this->r1Index < 3) {
        $this->r1Index = 3;
        $this->r1 = Utf8::substr($this->word, 3);
    }

    // Do each of steps 1, 2, 3a, 3b and 4.
    $this->step1();
    // step2() reports a value that step3b() needs; it is passed through as-is.
    $removedE = $this->step2();
    $this->step3a();
    $this->step3b($removedE);
    $this->step4();
    $this->finish();

    return $this->word;
}
/**
 * Compute the stem of a single word.
 *
 * The word is lower-cased, 'ã' and 'õ' are rewritten as 'a~' and 'o~', the
 * region helpers rv(), r1() and r2() are run, and steps 1 to 5 are applied
 * (steps 2, 3 and 4 conditionally) before finish().
 *
 * @param string $word the word to stem; it MUST be UTF-8 encoded
 *
 * @throws \Exception when $word is not valid UTF-8
 *
 * @return string the stemmed word (the final value of $this->word)
 */
public function stem($word)
{
    // Everything below assumes UTF-8 input.
    if (!Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = Utf8::strtolower($word);

    // Rewrite the nasalised vowels as a two-character form.
    $this->word = Utf8::str_replace(array('ã', 'õ'), array('a~', 'o~'), $this->word);

    $this->rv();
    $this->r1();
    $this->r2();

    // Snapshot used to detect whether step 1 / step 2 changed the word.
    // Strict comparisons: "==" would compare numeric-looking strings by value.
    $word = $this->word;
    $this->step1();

    // Step 2 only runs when step 1 left the word untouched.
    if ($word === $this->word) {
        $this->step2();
    }

    // Step 3 when step 1 or 2 changed the word, step 4 otherwise.
    if ($word !== $this->word) {
        $this->step3();
    } else {
        $this->step4();
    }

    $this->step5();
    $this->finish();

    return $this->word;
}
/**
 * Compute the stem of a single word.
 *
 * The word is lower-cased, acute accents are replaced by grave accents, some
 * u/i letters are upper-cased as markers, the region helpers rv(), r1() and
 * r2() are run, and steps 0, 1, 2 (conditionally), 3a and 3b are applied
 * before finish().
 *
 * @param string $word the word to stem; it MUST be UTF-8 encoded
 *
 * @throws \Exception when $word is not valid UTF-8
 *
 * @return string the stemmed word (the final value of $this->word)
 */
public function stem($word)
{
    // Everything below assumes UTF-8 input.
    if (!Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->plainVowels = implode('', self::$vowels);

    $this->word = Utf8::strtolower($word);

    // First, replace all acute accents by grave accents.
    $this->word = Utf8::str_replace(
        array('á', 'é', 'í', 'ó', 'ú'),
        array('à', 'è', 'ì', 'ò', 'ù'),
        $this->word
    );

    // Then put u after q, and u, i between vowels, into upper case.
    $this->word = preg_replace('#([q])u#u', '$1U', $this->word);
    $this->word = preg_replace(
        '#([' . $this->plainVowels . '])u([' . $this->plainVowels . '])#u',
        '$1U$2',
        $this->word
    );
    $this->word = preg_replace(
        '#([' . $this->plainVowels . '])i([' . $this->plainVowels . '])#u',
        '$1I$2',
        $this->word
    );

    $this->rv();
    $this->r1();
    $this->r2();

    $this->step0();

    // Snapshot taken after step 0 to detect whether step 1 changed the word.
    $word = $this->word;
    $this->step1();

    // Do step 2 if no ending was removed by step 1.
    // Strict comparison: "==" would compare numeric-looking strings by value.
    if ($word === $this->word) {
        $this->step2();
    }

    $this->step3a();
    $this->step3b();
    $this->finish();

    return $this->word;
}
/**
 * Compute the stem of a single word.
 *
 * The word is lower-cased, u and i between vowels are upper-cased as markers,
 * the region helpers rv(), r1() and r2() are run, step 0 is applied, step 1 is
 * repeated until it stops changing the word, and steps 2, 3 (conditionally)
 * and 4 are applied before finish().
 *
 * @param string $word the word to stem; it MUST be UTF-8 encoded
 *
 * @throws \Exception when $word is not valid UTF-8
 *
 * @return string the stemmed word (the final value of $this->word)
 */
public function stem($word)
{
    // Everything below assumes UTF-8 input.
    if (!Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = Utf8::strtolower($word);

    $this->plainVowels = implode('', self::$vowels);

    // First, i and u between vowels are put into upper case (so that they are treated as consonants).
    $this->word = preg_replace(
        '#([' . $this->plainVowels . '])u([' . $this->plainVowels . '])#u',
        '$1U$2',
        $this->word
    );
    $this->word = preg_replace(
        '#([' . $this->plainVowels . '])i([' . $this->plainVowels . '])#u',
        '$1I$2',
        $this->word
    );

    $this->rv();
    $this->r1();
    $this->r2();

    $this->step0();

    // Snapshot taken after step 0; compared again after steps 1 and 2.
    $beforeSteps = $this->word;

    // Repeat step 1 until a pass leaves the word unchanged.
    // Strict comparisons: "!=" / "==" would compare numeric-looking strings by value.
    do {
        $beforePass = $this->word;
        $this->step1();
    } while ($this->word !== $beforePass);

    $this->step2();

    // Do step 3 if no suffix was removed either by step 1 or step 2.
    if ($beforeSteps === $this->word) {
        $this->step3();
    }

    $this->step4();
    $this->finish();

    return $this->word;
}
/**
 * Compute the stem of a single word.
 *
 * The word is lower-cased, 'ß' is replaced by 'ss', y and u between vowels are
 * upper-cased as markers, R1 (with a minimum offset of 3) and R2 are set up,
 * and steps 1, 2 and 3 are applied before finish().
 *
 * @param string $word the word to stem; it MUST be UTF-8 encoded
 *
 * @throws \Exception when $word is not valid UTF-8
 *
 * @return string the stemmed word (the final value of $this->word)
 */
public function stem($word)
{
    // Everything below assumes UTF-8 input.
    if (!Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->plainVowels = implode('', self::$vowels);

    $this->word = Utf8::strtolower($word);

    // First, replace ß by ss.
    $this->word = Utf8::str_replace('ß', 'ss', $this->word);

    // Put u and y between vowels into upper case.
    $this->word = preg_replace(
        '#([' . $this->plainVowels . '])y([' . $this->plainVowels . '])#u',
        '$1Y$2',
        $this->word
    );
    $this->word = preg_replace(
        '#([' . $this->plainVowels . '])u([' . $this->plainVowels . '])#u',
        '$1U$2',
        $this->word
    );

    // R1 and R2 are first set up in the standard way...
    $this->r1();
    $this->r2();

    // ...but then R1 is adjusted so that the region before it contains at least 3 letters.
    if ($this->r1Index < 3) {
        $this->r1Index = 3;
        $this->r1 = Utf8::substr($this->word, 3);
    }

    $this->step1();
    $this->step2();
    $this->step3();
    $this->finish();

    return $this->word;
}
/**
 * Compute the stem of a single word.
 *
 * The word is lower-cased, step 0 is applied, the region helpers rv(), r1()
 * and r2() are run, and steps 1, 2a/2b, 3 or 4, 5 and 6 are applied before
 * finish().
 *
 * @param string $word the word to stem; it MUST be UTF-8 encoded
 *
 * @throws \Exception when $word is not valid UTF-8
 *
 * @return string the stemmed word (the final value of $this->word)
 */
public function stem($word)
{
    // Everything below assumes UTF-8 input.
    if (!Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = Utf8::strtolower($word);

    $this->plainVowels = implode('', self::$vowels);

    $this->step0();

    $this->rv();
    $this->r1();
    $this->r2();

    // Snapshot used to know whether step 1, 2a or 2b have altered the word.
    $this->originalWord = $this->word;

    $nextStep = $this->step1();

    // Do step 2a if either no ending was removed by step 1, or if one of the
    // endings amment, emment, ment, ments was found.
    // NOTE(review): step1()'s return type is not visible here, so the loose
    // "== 2" is kept on purpose; the word comparisons are string-to-string
    // and use strict equality.
    if ($nextStep == 2 || $this->originalWord === $this->word) {
        $modified = $this->step2a();

        // Step 2b only runs when step 2a reported no modification.
        if (!$modified) {
            $this->step2b();
        }
    }

    // Step 3 when the word was altered so far, step 4 otherwise.
    if ($this->word !== $this->originalWord) {
        $this->step3();
    } else {
        $this->step4();
    }

    $this->step5();
    $this->step6();
    $this->finish();

    return $this->word;
}
/**
 * Compute the stem of a single word.
 *
 * Words shorter than three characters are returned unchanged. Otherwise the
 * word is lower-cased, checked against exception1(), stripped of a leading
 * apostrophe, has some y letters upper-cased as markers, and then goes
 * through r1()/exceptionR1()/r2() and steps 0 to 5, with a second exception
 * check (exception2()) after step 1a.
 *
 * @param string $word the word to stem; it MUST be UTF-8 encoded
 *
 * @throws \Exception when $word is not valid UTF-8
 *
 * @return string the stemmed word, or the value supplied by exception1() /
 *                exception2() when one of them returns non-null
 */
public function stem($word)
{
    // Everything below assumes UTF-8 input.
    if (!Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    // Very short words are returned exactly as given (not lower-cased).
    if (Utf8::strlen($word) < 3) {
        return $word;
    }

    $this->word = Utf8::strtolower($word);

    // exceptions
    if (null !== ($special = $this->exception1())) {
        return $special;
    }

    $this->plainVowels = implode('', self::$vowels);

    // Remove initial ', if present.
    if (Utf8::substr($this->word, 0, 1) === "'") {
        $this->word = Utf8::substr($this->word, 1);
    }

    // Set initial y, or y after a vowel, to Y.
    // The initial-y check must look at the word AFTER the apostrophe was
    // removed; testing the pre-strip first character missed words like "'yes".
    $this->word = preg_replace('#^y#u', 'Y', $this->word);
    $this->word = preg_replace('#([' . $this->plainVowels . '])y#u', '$1Y', $this->word);

    $this->r1();
    $this->exceptionR1();
    $this->r2();

    $this->step0();
    $this->step1a();

    // exceptions 2
    if (null !== ($special = $this->exception2())) {
        return $special;
    }

    $this->step1b();
    $this->step1c();
    $this->step2();
    $this->step3();
    $this->step4();
    $this->step5();
    $this->finish();

    return $this->word;
}
/**
 * Compute the stem of a single word.
 *
 * The word is lower-cased, R1 is set up (with a minimum offset of 3), and
 * steps 1, 2 and 3 are applied. R2 is not used and finish() is not called.
 *
 * @param string $word the word to stem; it MUST be UTF-8 encoded
 *
 * @throws \Exception when $word is not valid UTF-8
 *
 * @return string the stemmed word (the final value of $this->word)
 */
public function stem($word)
{
    // Everything below assumes UTF-8 input.
    if (!Utf8::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = Utf8::strtolower($word);

    // R2 is not used: R1 is defined in the same way as in the German stemmer...
    $this->r1();

    // ...then R1 is adjusted so that the region before it contains at least 3 letters.
    if ($this->r1Index < 3) {
        $this->r1Index = 3;
        $this->r1 = Utf8::substr($this->word, 3);
    }

    // Do each of steps 1, 2 and 3.
    $this->step1();
    $this->step2();
    $this->step3();

    return $this->word;
}
/**
 * Lower-case a string with special handling for the dotted and dotless i.
 *
 * Before delegating to the parent implementation, 'İ' is mapped to 'i' and
 * 'I' is mapped to 'ı' (presumably the Turkish casing rules), in that order.
 *
 * @param string $s the string to lower-case
 *
 * @return mixed whatever parent::strtolower() returns for the adjusted string
 */
static function strtolower($s)
{
    // Ordered replacement map; each entry is applied only when its key occurs in $s.
    $replacements = array(
        'İ' => 'i',
        'I' => 'ı',
    );

    foreach ($replacements as $from => $to) {
        if (false !== strpos($s, $from)) {
            $s = str_replace($from, $to, $s);
        }
    }

    return parent::strtolower($s);
}