/**
 * Reduces a Russian word to its stem.
 *
 * The routine follows the step structure described in its own comments
 * (RV region, then steps 1-4: perfective gerund / adjectival / verb / noun
 * endings, trailing "И", derivational "ОСТЬ"/"ОСТ", superlative and "НН"/"Ь").
 * All endings are compared against upper-case literals, so the word is
 * expected to be upper-cased by the caller.
 *
 * The ending patterns are read from the globals $STEMMING_RU_PERFECTIVE_GERUND,
 * $STEMMING_RU_ADJECTIVAL1/2, $STEMMING_RU_VERB1/2 and $STEMMING_RU_NOUN, and
 * the vowel set from $STEMMING_RU_VOWELS; the BX_UTF_PCRE_MODIFIER constant is
 * appended to every pattern built here. All of them are defined elsewhere.
 *
 * Character handling is written so that it works both for single-byte and
 * for multi-byte (UTF-8) text: lengths are always derived from the literals
 * themselves instead of assuming one byte per letter.
 *
 * @param string $word  Word to stem.
 * @param int    $flags Bit mask; when bit 1 is set, surname handling is tried first.
 *
 * @return string|array The stem, or (surname handling only) an array of two
 *                      candidate stems.
 */
function stemming_ru($word, $flags = 0)
{
    global $STEMMING_RU_VOWELS;
    global $STEMMING_RU_PERFECTIVE_GERUND;
    global $STEMMING_RU_ADJECTIVAL1;
    global $STEMMING_RU_ADJECTIVAL2;
    global $STEMMING_RU_VERB1;
    global $STEMMING_RU_VERB2;
    global $STEMMING_RU_NOUN;

    // The 33rd letter (YO) is rarely used; it is folded into YE before stemming.
    $word = str_replace("Ё", "Е", $word);

    // Exceptions: word => stem to return for it.
    static $STEMMING_RU_EX = array(
        "БЕЗЕ" => "БЕЗЕ",
        "БЫЛЬ" => "БЫЛЬ",
        "МЕНЮ" => "МЕНЮ",
        "ГРАНАТ" => "ГРАНАТ",
        "ГРАНИТ" => "ГРАНИТ",
        "ТЕРМИНАЛ" => "ТЕРМИНАЛ",
        "ИЛИ" => "ИЛИ",
        "РУКАВ" => "РУКАВ",
        "ПРИЕМ" => "ПРИЕМ",
        "ОХРАНА" => "ОХРАН",
    );
    if (isset($STEMMING_RU_EX[$word])) {
        // Return the mapped stem, not the input: the table may map a word
        // to something other than itself.
        return $STEMMING_RU_EX[$word];
    }

    // Attempt to stem Russian surnames (http://www.gramma.ru/SPR/?id=2.8).
    // Two candidate stems are returned; the recursive calls run without flags.
    if ($flags & 1) {
        if (preg_match("/(ОВ|ЕВ)\$/", $word)) {
            return array(stemming_ru($word . "А"), stemming_ru($word));
        }
        $found = array();
        if (preg_match("/(ОВ|ЕВ)(А|У|ЫМ|Е)\$/", $word, $found)) {
            return array(
                stemming_ru($word),
                stemming_ru(substr($word, 0, -strlen($found[2]))),
            );
        }
    }

    // RV is the part of the word after its first vowel. A word that has no
    // vowel, or nothing after the first vowel, is returned unchanged.
    // From here on $word holds the untouched prefix and $rv the part being stemmed.
    $found = array();
    if (!preg_match("/^(.*?[{$STEMMING_RU_VOWELS}])(.+)\$/" . BX_UTF_PCRE_MODIFIER, $word, $found)) {
        return $word;
    }
    $rv = $found[2];
    $word = $found[1];

    // Step 1: look for a PERFECTIVE GERUND ending; if found, remove it and stop step 1.
    if (preg_match($STEMMING_RU_PERFECTIVE_GERUND, $rv, $found)) {
        switch ($found[0]) {
            case "АВ":
            case "АВШИ":
            case "АВШИСЬ":
            case "ЯВ":
            case "ЯВШИ":
            case "ЯВШИСЬ":
                // Keep the leading letter of the ending. Its size is taken from
                // a literal so that a multi-byte letter is not cut in half.
                $rv = substr($rv, 0, strlen("А") - strlen($found[0]));
                break;
            default:
                $rv = substr($rv, 0, -strlen($found[0]));
        }
    } else {
        // Otherwise drop a reflexive ending, then remove the first matching
        // adjectival, verb or noun ending.
        $rv = preg_replace("/(СЯ|СЬ)\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);

        if (preg_match($STEMMING_RU_ADJECTIVAL1, $rv, $found)) {
            // Only capture group 2 is removed for this pattern.
            $rv = substr($rv, 0, -strlen($found[2]));
        } elseif (preg_match($STEMMING_RU_ADJECTIVAL2, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[0]));
        } elseif (preg_match($STEMMING_RU_VERB1, $rv, $found)) {
            // Only capture group 2 is removed for this pattern.
            $rv = substr($rv, 0, -strlen($found[2]));
        } elseif (preg_match($STEMMING_RU_VERB2, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[0]));
        } else {
            $rv = preg_replace($STEMMING_RU_NOUN, "", $rv);
        }
    }

    // Step 2: if the word ends with "И", remove it.
    $letterI = "И";
    if (substr($rv, -strlen($letterI)) === $letterI) {
        $rv = substr($rv, 0, -strlen($letterI));
    }

    // Step 3: remove a DERIVATIONAL ending ("ОСТЬ" / "ОСТ") if it lies entirely in R2.
    if (preg_match("/(ОСТЬ|ОСТ)\$/" . BX_UTF_PCRE_MODIFIER, $rv)) {
        // Work on whole characters so the R1/R2 offsets count letters, not bytes.
        $chars = preg_split("//" . BX_UTF_PCRE_MODIFIER, $rv, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            $chars = array();
        }
        $rv_len = count($chars);

        // R1: skip the leading run of vowels, then one more character.
        $R1 = 0;
        while ($R1 < $rv_len && strpos($STEMMING_RU_VOWELS, $chars[$R1]) !== false) {
            $R1++;
        }
        if ($R1 < $rv_len) {
            $R1++;
        }

        // R2: from R1 skip non-vowels, then vowels, then one more character.
        $R2 = $R1;
        while ($R2 < $rv_len && strpos($STEMMING_RU_VOWELS, $chars[$R2]) === false) {
            $R2++;
        }
        while ($R2 < $rv_len && strpos($STEMMING_RU_VOWELS, $chars[$R2]) !== false) {
            $R2++;
        }
        if ($R2 < $rv_len) {
            $R2++;
        }

        // The ending (4 or 3 letters) must start at or after R2.
        $longEnding = "ОСТЬ";
        $shortEnding = "ОСТ";
        if (substr($rv, -strlen($longEnding)) === $longEnding && $rv_len >= $R2 + 4) {
            $rv = substr($rv, 0, -strlen($longEnding));
        } elseif (substr($rv, -strlen($shortEnding)) === $shortEnding && $rv_len >= $R2 + 3) {
            $rv = substr($rv, 0, -strlen($shortEnding));
        }
    }

    // Step 4: remove a SUPERLATIVE ending, then undouble a trailing "НН";
    // only if nothing was undoubled, remove a trailing soft sign "Ь".
    $rv = preg_replace("/(ЕЙШЕ|ЕЙШ)\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
    $r = preg_replace("/НН\$/" . BX_UTF_PCRE_MODIFIER, "Н", $rv);
    if ($r == $rv) {
        $rv = preg_replace("/Ь\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
    } else {
        $rv = $r;
    }

    return $word . $rv;
}
/**
 * Reduces a Russian word to its stem.
 *
 * The routine follows the step structure described in its own comments
 * (RV region, then steps 1-4: perfective gerund / adjectival / verb / noun
 * endings, trailing "И", derivational "ОСТЬ"/"ОСТ", superlative and "НН"/"Ь").
 * All endings are compared against upper-case literals, so the word is
 * expected to be upper-cased by the caller.
 *
 * The ending patterns are read from the globals $STEMMING_RU_PERFECTIVE_GERUND,
 * $STEMMING_RU_ADJECTIVAL1/2, $STEMMING_RU_VERB1/2 and $STEMMING_RU_NOUN, and
 * the vowel set from $STEMMING_RU_VOWELS; the BX_UTF_PCRE_MODIFIER constant is
 * appended to every pattern built here. All of them are defined elsewhere.
 *
 * Character handling is written so that it works both for single-byte and
 * for multi-byte (UTF-8) text: lengths are always derived from the literals
 * themselves instead of assuming one byte per letter.
 * NOTE(review): the Cyrillic literals below must be stored in the same
 * encoding as the words passed in and as the global patterns - confirm when
 * saving this file.
 *
 * @param string $word  Word to stem.
 * @param int    $flags Bit mask; when bit 1 is set, surname handling is tried first.
 *
 * @return string|array The stem, or (surname handling only) an array of two
 *                      candidate stems.
 */
function stemming_ru($word, $flags = 0)
{
    global $STEMMING_RU_VOWELS;
    global $STEMMING_RU_PERFECTIVE_GERUND;
    global $STEMMING_RU_ADJECTIVAL1;
    global $STEMMING_RU_ADJECTIVAL2;
    global $STEMMING_RU_VERB1;
    global $STEMMING_RU_VERB2;
    global $STEMMING_RU_NOUN;

    // The 33rd letter (YO) is rarely used; it is folded into YE before stemming.
    $word = str_replace("Ё", "Е", $word);

    // Exceptions: these words are returned as they are.
    static $STEMMING_RU_EX = array(
        "БЕЗЕ" => true,
        "БЫЛЬ" => true,
        "МЕНЮ" => true,
        "ГРАНАТ" => true,
        "ГРАНИТ" => true,
        "ТЕРМИНАЛ" => true,
        "ИЛИ" => true,
        "РУКАВ" => true,
        "ПРИЕМ" => true,
    );
    if (isset($STEMMING_RU_EX[$word])) {
        return $word;
    }

    // Attempt to stem Russian surnames (http://www.gramma.ru/SPR/?id=2.8).
    // Two candidate stems are returned; the recursive calls run without flags.
    if ($flags & 1) {
        if (preg_match("/(ОВ|ЕВ)\$/", $word)) {
            return array(stemming_ru($word . "А"), stemming_ru($word));
        }
        $found = array();
        if (preg_match("/(ОВ|ЕВ)(А|У|ЫМ|Е)\$/", $word, $found)) {
            return array(
                stemming_ru($word),
                stemming_ru(substr($word, 0, -strlen($found[2]))),
            );
        }
    }

    // RV is the part of the word after its first vowel. A word that has no
    // vowel, or nothing after the first vowel, is returned unchanged.
    // From here on $word holds the untouched prefix and $rv the part being stemmed.
    $found = array();
    if (!preg_match("/^(.*?[{$STEMMING_RU_VOWELS}])(.+)\$/" . BX_UTF_PCRE_MODIFIER, $word, $found)) {
        return $word;
    }
    $rv = $found[2];
    $word = $found[1];

    // Step 1: look for a PERFECTIVE GERUND ending; if found, remove it and stop step 1.
    if (preg_match($STEMMING_RU_PERFECTIVE_GERUND, $rv, $found)) {
        switch ($found[0]) {
            case "АВ":
            case "АВШИ":
            case "АВШИСЬ":
            case "ЯВ":
            case "ЯВШИ":
            case "ЯВШИСЬ":
                // Keep the leading letter of the ending. Its size is taken from
                // a literal so that a multi-byte letter is not cut in half.
                $rv = substr($rv, 0, strlen("А") - strlen($found[0]));
                break;
            default:
                $rv = substr($rv, 0, -strlen($found[0]));
        }
    } else {
        // Otherwise drop a reflexive ending, then remove the first matching
        // adjectival, verb or noun ending.
        $rv = preg_replace("/(СЯ|СЬ)\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);

        if (preg_match($STEMMING_RU_ADJECTIVAL1, $rv, $found)) {
            // Only capture group 2 is removed for this pattern.
            $rv = substr($rv, 0, -strlen($found[2]));
        } elseif (preg_match($STEMMING_RU_ADJECTIVAL2, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[0]));
        } elseif (preg_match($STEMMING_RU_VERB1, $rv, $found)) {
            // Only capture group 2 is removed for this pattern.
            $rv = substr($rv, 0, -strlen($found[2]));
        } elseif (preg_match($STEMMING_RU_VERB2, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[0]));
        } else {
            $rv = preg_replace($STEMMING_RU_NOUN, "", $rv);
        }
    }

    // Step 2: if the word ends with "И", remove it.
    $letterI = "И";
    if (substr($rv, -strlen($letterI)) === $letterI) {
        $rv = substr($rv, 0, -strlen($letterI));
    }

    // Step 3: remove a DERIVATIONAL ending ("ОСТЬ" / "ОСТ") if it lies entirely in R2.
    if (preg_match("/(ОСТЬ|ОСТ)\$/" . BX_UTF_PCRE_MODIFIER, $rv)) {
        // Work on whole characters so the R1/R2 offsets count letters, not bytes.
        $chars = preg_split("//" . BX_UTF_PCRE_MODIFIER, $rv, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            $chars = array();
        }
        $rv_len = count($chars);

        // R1: skip the leading run of vowels, then one more character.
        $R1 = 0;
        while ($R1 < $rv_len && strpos($STEMMING_RU_VOWELS, $chars[$R1]) !== false) {
            $R1++;
        }
        if ($R1 < $rv_len) {
            $R1++;
        }

        // R2: from R1 skip non-vowels, then vowels, then one more character.
        $R2 = $R1;
        while ($R2 < $rv_len && strpos($STEMMING_RU_VOWELS, $chars[$R2]) === false) {
            $R2++;
        }
        while ($R2 < $rv_len && strpos($STEMMING_RU_VOWELS, $chars[$R2]) !== false) {
            $R2++;
        }
        if ($R2 < $rv_len) {
            $R2++;
        }

        // The ending (4 or 3 letters) must start at or after R2.
        $longEnding = "ОСТЬ";
        $shortEnding = "ОСТ";
        if (substr($rv, -strlen($longEnding)) === $longEnding && $rv_len >= $R2 + 4) {
            $rv = substr($rv, 0, -strlen($longEnding));
        } elseif (substr($rv, -strlen($shortEnding)) === $shortEnding && $rv_len >= $R2 + 3) {
            $rv = substr($rv, 0, -strlen($shortEnding));
        }
    }

    // Step 4: remove a SUPERLATIVE ending, then undouble a trailing "НН";
    // only if nothing was undoubled, remove a trailing soft sign "Ь".
    $rv = preg_replace("/(ЕЙШЕ|ЕЙШ)\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
    $r = preg_replace("/НН\$/" . BX_UTF_PCRE_MODIFIER, "Н", $rv);
    if ($r == $rv) {
        $rv = preg_replace("/Ь\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
    } else {
        $rv = $r;
    }

    return $word . $rv;
}
/**
 * Reduces a Russian word to its stem.
 *
 * The routine follows the step structure described in its own comments
 * (RV region, then steps 1-4: perfective gerund / adjectival / verb / noun
 * endings, trailing "И", derivational "ОСТЬ"/"ОСТ", superlative and "НН"/"Ь").
 * All endings are compared against upper-case literals, so the word is
 * expected to be upper-cased by the caller.
 *
 * The ending patterns are read from the globals $STEMMING_RU_PERFECTIVE_GERUND,
 * $STEMMING_RU_ADJECTIVAL1/2, $STEMMING_RU_VERB1/2 and $STEMMING_RU_NOUN, and
 * the vowel set from $STEMMING_RU_VOWELS; the BX_UTF_PCRE_MODIFIER constant is
 * appended to every pattern built here. All of them are defined elsewhere.
 *
 * Character handling is written so that it works both for single-byte and
 * for multi-byte (UTF-8) text: lengths are always derived from the literals
 * themselves instead of assuming one byte per letter.
 *
 * @param string $word  Word to stem.
 * @param int    $flags Bit mask; when bit 1 is set, surname handling is tried first.
 *
 * @return string|array The stem, or (surname handling only) an array of two
 *                      candidate stems.
 */
function stemming_ru($word, $flags = 0)
{
    global $STEMMING_RU_VOWELS;
    global $STEMMING_RU_PERFECTIVE_GERUND;
    global $STEMMING_RU_ADJECTIVAL1;
    global $STEMMING_RU_ADJECTIVAL2;
    global $STEMMING_RU_VERB1;
    global $STEMMING_RU_VERB2;
    global $STEMMING_RU_NOUN;

    // The 33rd letter (YO) is rarely used; it is folded into YE before stemming.
    $word = str_replace("Ё", "Е", $word);

    // Exceptions: these words are returned as they are.
    static $STEMMING_RU_EX = array(
        "БЕЗЕ" => true,
        "БЫЛЬ" => true,
        "МЕНЮ" => true,
        "ГРАНАТ" => true,
        "ГРАНИТ" => true,
        "ТЕРМИНАЛ" => true,
        "ИЛИ" => true,
        "РУКАВ" => true,
        "ПРИЕМ" => true,
    );
    if (isset($STEMMING_RU_EX[$word])) {
        return $word;
    }

    // Attempt to stem Russian surnames (http://www.gramma.ru/SPR/?id=2.8).
    // Two candidate stems are returned; the recursive calls run without flags.
    if ($flags & 1) {
        if (preg_match("/(ОВ|ЕВ)\$/", $word)) {
            return array(stemming_ru($word . "А"), stemming_ru($word));
        }
        $found = array();
        if (preg_match("/(ОВ|ЕВ)(А|У|ЫМ|Е)\$/", $word, $found)) {
            return array(
                stemming_ru($word),
                stemming_ru(substr($word, 0, -strlen($found[2]))),
            );
        }
    }

    // RV is the part of the word after its first vowel. A word that has no
    // vowel, or nothing after the first vowel, is returned unchanged.
    // From here on $word holds the untouched prefix and $rv the part being stemmed.
    $found = array();
    if (!preg_match("/^(.*?[{$STEMMING_RU_VOWELS}])(.+)\$/" . BX_UTF_PCRE_MODIFIER, $word, $found)) {
        return $word;
    }
    $rv = $found[2];
    $word = $found[1];

    // Step 1: look for a PERFECTIVE GERUND ending; if found, remove it and stop step 1.
    if (preg_match($STEMMING_RU_PERFECTIVE_GERUND, $rv, $found)) {
        switch ($found[0]) {
            case "АВ":
            case "АВШИ":
            case "АВШИСЬ":
            case "ЯВ":
            case "ЯВШИ":
            case "ЯВШИСЬ":
                // Keep the leading letter of the ending. Its size is taken from
                // a literal so that a multi-byte letter is not cut in half.
                $rv = substr($rv, 0, strlen("А") - strlen($found[0]));
                break;
            default:
                $rv = substr($rv, 0, -strlen($found[0]));
        }
    } else {
        // Otherwise drop a reflexive ending, then remove the first matching
        // adjectival, verb or noun ending.
        $rv = preg_replace("/(СЯ|СЬ)\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);

        if (preg_match($STEMMING_RU_ADJECTIVAL1, $rv, $found)) {
            // Only capture group 2 is removed for this pattern.
            $rv = substr($rv, 0, -strlen($found[2]));
        } elseif (preg_match($STEMMING_RU_ADJECTIVAL2, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[0]));
        } elseif (preg_match($STEMMING_RU_VERB1, $rv, $found)) {
            // Only capture group 2 is removed for this pattern.
            $rv = substr($rv, 0, -strlen($found[2]));
        } elseif (preg_match($STEMMING_RU_VERB2, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[0]));
        } else {
            $rv = preg_replace($STEMMING_RU_NOUN, "", $rv);
        }
    }

    // Step 2: if the word ends with "И", remove it.
    $letterI = "И";
    if (substr($rv, -strlen($letterI)) === $letterI) {
        $rv = substr($rv, 0, -strlen($letterI));
    }

    // Step 3: remove a DERIVATIONAL ending ("ОСТЬ" / "ОСТ") if it lies entirely in R2.
    if (preg_match("/(ОСТЬ|ОСТ)\$/" . BX_UTF_PCRE_MODIFIER, $rv)) {
        // Work on whole characters so the R1/R2 offsets count letters, not bytes.
        $chars = preg_split("//" . BX_UTF_PCRE_MODIFIER, $rv, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            $chars = array();
        }
        $rv_len = count($chars);

        // R1: skip the leading run of vowels, then one more character.
        $R1 = 0;
        while ($R1 < $rv_len && strpos($STEMMING_RU_VOWELS, $chars[$R1]) !== false) {
            $R1++;
        }
        if ($R1 < $rv_len) {
            $R1++;
        }

        // R2: from R1 skip non-vowels, then vowels, then one more character.
        $R2 = $R1;
        while ($R2 < $rv_len && strpos($STEMMING_RU_VOWELS, $chars[$R2]) === false) {
            $R2++;
        }
        while ($R2 < $rv_len && strpos($STEMMING_RU_VOWELS, $chars[$R2]) !== false) {
            $R2++;
        }
        if ($R2 < $rv_len) {
            $R2++;
        }

        // The ending (4 or 3 letters) must start at or after R2.
        $longEnding = "ОСТЬ";
        $shortEnding = "ОСТ";
        if (substr($rv, -strlen($longEnding)) === $longEnding && $rv_len >= $R2 + 4) {
            $rv = substr($rv, 0, -strlen($longEnding));
        } elseif (substr($rv, -strlen($shortEnding)) === $shortEnding && $rv_len >= $R2 + 3) {
            $rv = substr($rv, 0, -strlen($shortEnding));
        }
    }

    // Step 4: remove a SUPERLATIVE ending, then undouble a trailing "НН";
    // only if nothing was undoubled, remove a trailing soft sign "Ь".
    $rv = preg_replace("/(ЕЙШЕ|ЕЙШ)\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
    $r = preg_replace("/НН\$/" . BX_UTF_PCRE_MODIFIER, "Н", $rv);
    if ($r == $rv) {
        $rv = preg_replace("/Ь\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
    } else {
        $rv = $r;
    }

    return $word . $rv;
}