/**
 * Tell whether the letters starting at a given offset of the current word form a short syllable.
 *
 * A short syllable is either
 *   (a) a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, or
 *   (b) a vowel at the beginning of the word followed by a non-vowel.
 *
 * So rap, trap, entrap end with a short syllable, and ow, on, at are classed as short syllables.
 * But uproot, bestow, disturb do not end with a short syllable.
 *
 * @param int $from      Offset of the first letter to look at. A negative value is counted from the
 *                       end of the word and clamped to 0 if it reaches before the first letter.
 * @param int $nbLetters Pass 2 to try form (b) first (only accepted at offset 0); with any other
 *                       value only the three-letter form (a) is tried.
 *
 * @return bool True when a short syllable is found at the offset, false otherwise.
 */
private function searchShortSyllabe($from, $nbLetters)
{
    $wordLength = Utf8::strlen($this->word);

    // Resolve a negative offset against the end of the word, never going before its start.
    if ($from < 0) {
        $from = max(0, $wordLength + $from);
    }

    $twoLetterForm = ($nbLetters == 2);

    // Form (b) only exists at the very beginning of the word.
    if ($twoLetterForm && $from != 0) {
        return false;
    }

    $letter1 = Utf8::substr($this->word, $from, 1);
    $letter2 = Utf8::substr($this->word, $from + 1, 1);

    // (b) vowel at the start of the word followed by a non-vowel.
    if ($twoLetterForm && in_array($letter1, self::$vowels) && !in_array($letter2, self::$vowels)) {
        return true;
    }

    // (a) non-vowel, vowel, then a non-vowel other than w, x or Y.
    // Note that this test is also reached when the two-letter test above did not match.
    $letter3 = Utf8::substr($this->word, $from + 2, 1);
    $excludedThird = array_merge(self::$vowels, array('x', 'Y', 'w'));

    return !in_array($letter1, self::$vowels)
        && in_array($letter2, self::$vowels)
        && !in_array($letter3, $excludedThird);
}
/**
 * Compute the RV region of the current word and store it in $this->rv / $this->rvIndex.
 *
 * Used by spanish, italian, portuguese, etc (but not by french).
 *
 * - If the second letter is a consonant, RV is the region after the next following vowel.
 * - If the first two letters are vowels, RV is the region after the next consonant.
 * - Otherwise (consonant-vowel case) RV is the region after the third letter.
 * - RV is the end of the word (empty string, index = word length) if these positions cannot be found.
 *
 * @return bool|null True when the word is shorter than three letters or a position was found;
 *                   null when the scan reached the end of the word without finding one.
 */
protected function rv()
{
    $wordLength = Utf8::strlen($this->word);

    // Default: RV is empty and sits at the end of the word.
    $this->rv = '';
    $this->rvIndex = $wordLength;

    if ($wordLength < 3) {
        return true;
    }

    $firstIsVowel = in_array(Utf8::substr($this->word, 0, 1), static::$vowels);
    $secondIsVowel = in_array(Utf8::substr($this->word, 1, 1), static::$vowels);

    // Consonant-vowel start: RV is simply the region after the third letter.
    if (!$firstIsVowel && $secondIsVowel) {
        $this->rvIndex = 3;
        $this->rv = Utf8::substr($this->word, 3);

        return true;
    }

    // Remaining cases scan from the third letter on:
    //  - second letter is a consonant  -> stop at the next vowel,
    //  - first two letters are vowels  -> stop at the next consonant.
    $stopOnVowel = !$secondIsVowel;

    for ($offset = 2; $offset < $wordLength; $offset++) {
        $isVowel = in_array(Utf8::substr($this->word, $offset, 1), static::$vowels);

        if ($isVowel === $stopOnVowel) {
            $this->rvIndex = $offset + 1;
            $this->rv = Utf8::substr($this->word, $offset + 1);

            return true;
        }
    }

    // Nothing found: RV keeps its default (end of word) and no explicit result is produced.
    return null;
}
/**
 * Step 5
 *
 * If the word ends with one of "e é ê" in RV, delete it, and if it is then preceded by gu (or ci)
 * with the u (or i) in RV, delete the u (or i) as well.
 * Otherwise, if the word ends with ç, the trailing ç is replaced by c.
 *
 * @return bool True when the word was handled by one of the two rules, false when neither applied.
 */
public function step5()
{
    if ($this->searchIfInRv(array('e', 'é', 'ê')) === false) {
        // No final e/é/ê in RV: fall back to the cedilla rule.
        if ($this->search(array('ç')) === false) {
            return false;
        }

        $this->word = preg_replace('#(ç)$#u', 'c', $this->word);

        return true;
    }

    // Drop the final e / é / ê.
    $this->word = Utf8::substr($this->word, 0, -1);

    // When what is left ends in gu or ci and its second letter lies in RV, drop that letter too.
    $position2 = $this->search(array('gu', 'ci'));
    if ($position2 !== false && $this->inRv($position2 + 1)) {
        $this->word = Utf8::substr($this->word, 0, -1);
    }

    return true;
}
/**
 * Step 4: undouble vowel
 *
 * If the word ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double
 * a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
 *
 * @return false|null False when the word does not end in CVD and is left untouched;
 *                    no value (null) once the doubled vowel has been reduced.
 */
private function step4()
{
    // Take the last four letters apart: C, VV, D.
    $consonantBefore = Utf8::substr($this->word, -4, 1);
    $doubledVowel = Utf8::substr($this->word, -3, 2);
    $finalLetter = Utf8::substr($this->word, -1, 1);

    // D must be a non-vowel other than I.
    $forbiddenFinals = array_merge(self::$vowels, array('I'));
    if (in_array($finalLetter, $forbiddenFinals)) {
        return false;
    }

    // V must be one of the doubled vowels.
    if (!in_array($doubledVowel, array('aa', 'ee', 'oo', 'uu'))) {
        return false;
    }

    // C must be a non-vowel.
    if (in_array($consonantBefore, self::$vowels)) {
        return false;
    }

    // Rebuild the ending with a single vowel: stem + C + V + D.
    $this->word = Utf8::substr($this->word, 0, -4)
        . $consonantBefore
        . Utf8::substr($doubledVowel, 0, 1)
        . $finalLetter;
}
/**
 * Step 3a
 *
 * Delete a final a, e, i, o, à, è, ì or ò if it is in RV, and a preceding i if it is in RV.
 *
 * @return bool True when a final vowel was removed, false when the word was left untouched.
 */
private function step3a()
{
    $finalVowels = array('a', 'e', 'i', 'o', 'à', 'è', 'ì', 'ò');

    if ($this->searchIfInRv($finalVowels) === false) {
        return false;
    }

    // Remove the final vowel...
    $this->word = Utf8::substr($this->word, 0, -1);

    // ...and the i that preceded it, provided that i is itself in RV.
    $precedingI = $this->searchIfInRv(array('i'));
    if ($precedingI !== false) {
        $this->word = Utf8::substr($this->word, 0, -1);
    }

    return true;
}
/**
 * Step 3: residual suffix
 *
 * Search for the longest among the following suffixes in RV, and perform the action indicated:
 *  - os a o á í ó : delete if in RV;
 *  - e é          : delete if in RV, and if preceded by gu with the u in RV delete the u.
 *
 * Results of searchIfInRv() are compared strictly against false, so that an offset of 0 can never
 * be mistaken for "no match".
 *
 * @return bool True when a suffix of the first group was removed, or when e/é was removed together
 *              with the u of a preceding gu; false otherwise (this includes the case where only
 *              e/é was removed).
 */
private function step3()
{
    // os a o á í ó
    // delete if in RV
    $position = $this->searchIfInRv(array('os', 'a', 'o', 'á', 'í', 'ó'));
    if ($position !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);

        return true;
    }

    // e é
    // delete if in RV, and if preceded by gu with the u in RV delete the u
    $position = $this->searchIfInRv(array('e', 'é'));
    if ($position !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);

        $position2 = $this->searchIfInRv(array('u'));
        if ($position2 !== false) {
            // The u only goes away when it is the second letter of "gu".
            $before = Utf8::substr($this->word, $position2 - 1, 1);
            if ($before === 'g') {
                $this->word = Utf8::substr($this->word, 0, $position2);

                return true;
            }
        }
    }

    return false;
}
/**
 * Step 4: Removal of final vowel
 *
 * Search for the longest among the suffixes "a e i ie ă" and, if it is in RV, delete it.
 *
 * @return bool Always true, whether or not a suffix was removed.
 */
public function step4()
{
    // "ie" is listed ahead of "e" and "i" in the candidate list.
    $position = $this->search(array('a', 'ie', 'e', 'i', 'ă'));

    if ($position !== false && $this->inRv($position)) {
        $this->word = Utf8::substr($this->word, 0, $position);
    }

    return true;
}
/**
 * Step 3: d-suffixes
 *
 * Looks for one of the following suffix groups at the end of the word, in this order, and applies
 * the matching rule:
 *  - end ung      : delete if in R2; if then preceded by ig, delete it if in R2 and not preceded by e;
 *  - ig ik isch   : delete if in R2 and not preceded by e;
 *  - lich heit    : delete if in R2; if then preceded by er or en, delete it if in R1;
 *  - keit         : delete if in R2; if then preceded by lich or ig, delete it if in R2.
 *
 * Every result of search() is compared strictly against false, so that a suffix found at offset 0
 * is treated as a match in all four groups alike.
 *
 * @return bool True when one of the suffix groups was found at the end of the word (even if the
 *              region conditions prevented any deletion), false when none was found.
 */
public function step3()
{
    // end ung
    // delete if in R2
    // if preceded by ig, delete if in R2 and not preceded by e
    $position = $this->search(array('end', 'ung'));
    if ($position !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        $position2 = $this->search(array('ig'));
        if ($position2 !== false) {
            $letter = Utf8::substr($this->word, $position2 - 1, 1);
            if ($this->inR2($position2) && $letter !== 'e') {
                $this->word = Utf8::substr($this->word, 0, $position2);
            }
        }

        return true;
    }

    // ig ik isch
    // delete if in R2 and not preceded by e
    $position = $this->search(array('ig', 'ik', 'isch'));
    if ($position !== false) {
        $letter = Utf8::substr($this->word, $position - 1, 1);
        if ($this->inR2($position) && $letter !== 'e') {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        return true;
    }

    // lich heit
    // delete if in R2
    // if preceded by er or en, delete if in R1
    $position = $this->search(array('lich', 'heit'));
    if ($position !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        $position2 = $this->search(array('er', 'en'));
        if ($position2 !== false && $this->inR1($position2)) {
            $this->word = Utf8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // keit
    // delete if in R2
    // if preceded by lich or ig, delete if in R2
    $position = $this->search(array('keit'));
    if ($position !== false) {
        if ($this->inR2($position)) {
            $this->word = Utf8::substr($this->word, 0, $position);
        }

        $position2 = $this->search(array('lich', 'ig'));
        if ($position2 !== false && $this->inR2($position2)) {
            $this->word = Utf8::substr($this->word, 0, $position2);
        }

        return true;
    }

    return false;
}
/**
 * Compute the RV region of the current word and store it in $this->rv / $this->rvIndex.
 *
 * If the word begins with two vowels, RV is the region after the third letter,
 * otherwise the region after the first vowel not at the beginning of the word,
 * or the end of the word if these positions cannot be found.
 * (Exceptionally, par, col or tap, at the beginning of a word is also taken to define RV as the
 * region to their right.)
 *
 * @return bool True when the word is shorter than three letters or an RV position was found,
 *              false when no position was found (RV then stays empty, at the end of the word).
 */
protected function rv()
{
    $wordLength = Utf8::strlen($this->word);

    // Default: empty RV located at the end of the word.
    $this->rv = '';
    $this->rvIndex = $wordLength;

    if ($wordLength < 3) {
        return true;
    }

    $startsWithTwoVowels = in_array(Utf8::substr($this->word, 0, 1), self::$vowels)
        && in_array(Utf8::substr($this->word, 1, 1), self::$vowels);
    $hasSpecialPrefix = in_array(Utf8::substr($this->word, 0, 3), array('par', 'col', 'tap'));

    // Two leading vowels and the par/col/tap exception both place RV after the third letter.
    if ($startsWithTwoVowels || $hasSpecialPrefix) {
        $this->rv = Utf8::substr($this->word, 3);
        $this->rvIndex = 3;

        return true;
    }

    // Otherwise RV starts right after the first vowel that is not the first letter.
    $offset = 1;
    while ($offset < $wordLength) {
        if (in_array(Utf8::substr($this->word, $offset, 1), self::$vowels)) {
            $this->rv = Utf8::substr($this->word, $offset + 1);
            $this->rvIndex = $offset + 1;

            return true;
        }
        $offset++;
    }

    return false;
}
/**
 * Step 3:
 * Search for the longest among the following suffixes in R1, and if found, delete.
 *
 *   leg eleg ig eig lig elig els lov elov slov hetslov
 *
 * The word is modified in place; no value is returned.
 */
private function step3()
{
    // Candidates are listed so that a longer suffix comes before any shorter one it ends with.
    $suffixes = array(
        'hetslov',
        'eleg',
        'elov',
        'slov',
        'elig',
        'eig',
        'lig',
        'els',
        'lov',
        'leg',
        'ig',
    );

    $position = $this->searchIfInR1($suffixes);
    if ($position === false) {
        return;
    }

    $this->word = Utf8::substr($this->word, 0, $position);
}
/**
 * Step 3:
 * Search for the longest among the following suffixes in R1, and perform the action indicated.
 *
 *   lig ig els : delete
 *   löst       : replace with lös
 *   fullt      : replace with full
 *
 * @return true|null True when one of the suffixes was found in R1 and handled;
 *                   no value (null) when none was found.
 */
private function step3()
{
    // lig ig els
    // delete
    $position = $this->searchIfInR1(array('lig', 'ig', 'els'));
    if ($position !== false) {
        $this->word = Utf8::substr($this->word, 0, $position);

        return true;
    }

    // löst -> lös, fullt -> full: both replacements amount to dropping the final t.
    foreach (array('löst', 'fullt') as $suffix) {
        if ($this->searchIfInR1(array($suffix)) !== false) {
            $this->word = Utf8::substr($this->word, 0, -1);

            return true;
        }
    }
}