Exemple #1
0
/**
 * Reduces an upper-case Russian word to its stem.
 *
 * The word is split at its first vowel; the part after that vowel (the "RV"
 * region) is run through four suffix-stripping steps and then glued back onto
 * the untouched head. Words found in the exception table bypass the algorithm
 * and yield the stem stored for them.
 *
 * Depends on the global $STEMMING_RU_* variables and on the
 * BX_UTF_PCRE_MODIFIER constant, all of which are defined outside this function.
 * NOTE(review): $STEMMING_RU_VOWELS is presumably a plain list of vowel letters,
 * since it is spliced into a regex character class - confirm at its definition.
 *
 * @param string $word  Word to stem; every literal it is compared with is upper-case Cyrillic.
 * @param int    $flags When bit 0x01 is set, words ending in ОВ/ЕВ (optionally followed by
 *                      А, У, ЫМ or Е) are treated as surnames and two candidate stems are returned.
 *
 * @return string|array The stem, or - in surname mode, for matching words only - an array
 *                      holding two stems.
 */
function stemming_ru($word, $flags = 0)
{
    global $STEMMING_RU_VOWELS;
    global $STEMMING_RU_PERFECTIVE_GERUND;
    global $STEMMING_RU_ADJECTIVAL1;
    global $STEMMING_RU_ADJECTIVAL2;
    global $STEMMING_RU_VERB1;
    global $STEMMING_RU_VERB2;
    global $STEMMING_RU_NOUN;

    // There is a 33rd letter, Ё, but it is rarely used, so it is mapped into Е.
    $word = str_replace("Ё", "Е", $word);

    // Exceptions: word => stem to return for it.
    static $STEMMING_RU_EX = array("БЕЗЕ" => "БЕЗЕ", "БЫЛЬ" => "БЫЛЬ", "МЕНЮ" => "МЕНЮ", "ГРАНАТ" => "ГРАНАТ", "ГРАНИТ" => "ГРАНИТ", "ТЕРМИНАЛ" => "ТЕРМИНАЛ", "ИЛИ" => "ИЛИ", "РУКАВ" => "РУКАВ", "ПРИЕМ" => "ПРИЕМ", "ОХРАНА" => "ОХРАН");
    if (isset($STEMMING_RU_EX[$word])) {
        // Return the mapped stem; returning $word here would ignore entries
        // such as "ОХРАНА" => "ОХРАН" whose stem differs from the word.
        return $STEMMING_RU_EX[$word];
    }

    // Attempt to stem Russian surnames (http://www.gramma.ru/SPR/?id=2.8).
    if ($flags & 1) {
        if (preg_match("/(ОВ|ЕВ)\$/", $word)) {
            // Bare ОВ/ЕВ form: also offer the stem of the form with А appended.
            return array(stemming_ru($word . "А"), stemming_ru($word));
        }
        $found = array();
        if (preg_match("/(ОВ|ЕВ)(А|У|ЫМ|Е)\$/", $word, $found)) {
            // Form with a case ending: also offer the stem of the word without that ending.
            return array(stemming_ru($word), stemming_ru(substr($word, 0, -strlen($found[2]))));
        }
    }

    // In any word, RV is the region after the first vowel. A word without a
    // vowel, or with nothing after its first vowel, is returned unchanged.
    // All tests below take place in the RV part of the word.
    $found = array();
    if (!preg_match("/^(.*?[{$STEMMING_RU_VOWELS}])(.+)\$/" . BX_UTF_PCRE_MODIFIER, $word, $found)) {
        return $word;
    }
    $rv = $found[2];
    $word = $found[1];

    // strlen()/substr() may count bytes or characters depending on the runtime
    // and source encoding, so every offset below is derived from strlen() of a
    // literal instead of being hard-coded as 1, 3 or 4.
    $letter_len = strlen("И");

    // Step 1: search for a PERFECTIVE GERUND ending. If one is found remove it,
    // and that is then the end of step 1.
    if (preg_match($STEMMING_RU_PERFECTIVE_GERUND, $rv, $found)) {
        switch ($found[0]) {
            case "АВ":
            case "АВШИ":
            case "АВШИСЬ":
            case "ЯВ":
            case "ЯВШИ":
            case "ЯВШИСЬ":
                // Keep the leading А/Я of the ending, drop the rest.
                $rv = substr($rv, 0, $letter_len - strlen($found[0]));
                break;
            default:
                $rv = substr($rv, 0, -strlen($found[0]));
        }
    } else {
        // Otherwise drop a reflexive ending, then try ADJECTIVAL, VERB and NOUN
        // endings in that order; the first group that matches wins.
        $rv = preg_replace("/(СЯ|СЬ)\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
        if (preg_match($STEMMING_RU_ADJECTIVAL1, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[2]));
        } elseif (preg_match($STEMMING_RU_ADJECTIVAL2, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[0]));
        } elseif (preg_match($STEMMING_RU_VERB1, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[2]));
        } elseif (preg_match($STEMMING_RU_VERB2, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[0]));
        } else {
            $rv = preg_replace($STEMMING_RU_NOUN, "", $rv);
        }
    }

    // Step 2: if the word ends with И, remove it.
    if (substr($rv, -$letter_len) === "И") {
        $rv = substr($rv, 0, -$letter_len);
    }

    // Step 3: remove a DERIVATIONAL ending (ОСТЬ/ОСТ), but only when the whole
    // ending lies in R2.
    if (preg_match("/(ОСТЬ|ОСТ)\$/" . BX_UTF_PCRE_MODIFIER, $rv, $found)) {
        // The pattern consumes everything in front of R2, mirroring the scan
        // over RV: leading vowels, one more letter (end of R1), then
        // non-vowels, vowels and one more letter (end of R2).
        $before_r2 = array();
        $r2_pattern = "/^[{$STEMMING_RU_VOWELS}]*.?[^{$STEMMING_RU_VOWELS}]*[{$STEMMING_RU_VOWELS}]*.?/s" . BX_UTF_PCRE_MODIFIER;
        if (
            preg_match($r2_pattern, $rv, $before_r2)
            && strlen($rv) >= strlen($before_r2[0]) + strlen($found[0])
        ) {
            $rv = substr($rv, 0, strlen($rv) - strlen($found[0]));
        }
    }

    // Step 4: remove a SUPERLATIVE ending if present, then either undouble a
    // trailing НН or, when there was nothing to undouble, remove a trailing Ь.
    $rv = preg_replace("/(ЕЙШЕ|ЕЙШ)\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
    $undoubled = preg_replace("/НН\$/" . BX_UTF_PCRE_MODIFIER, "Н", $rv);
    if ($undoubled !== $rv) {
        $rv = $undoubled;
    } else {
        $rv = preg_replace("/Ь\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
    }

    return $word . $rv;
}
/**
 * Reduces an upper-case Russian word to its stem.
 *
 * The word is split at its first vowel; the part after that vowel (the "RV"
 * region) is run through four suffix-stripping steps and then glued back onto
 * the untouched head. Words found in the exception table are returned as they are.
 *
 * Depends on the global $STEMMING_RU_* variables and on the
 * BX_UTF_PCRE_MODIFIER constant, all of which are defined outside this function.
 * NOTE(review): $STEMMING_RU_VOWELS is presumably a plain list of vowel letters,
 * since it is spliced into a regex character class - confirm at its definition.
 *
 * @param string $word  Word to stem; every literal it is compared with is upper-case Cyrillic.
 * @param int    $flags When bit 0x01 is set, words ending in ОВ/ЕВ (optionally followed by
 *                      А, У, ЫМ or Е) are treated as surnames and two candidate stems are returned.
 *
 * @return string|array The stem, or - in surname mode, for matching words only - an array
 *                      holding two stems.
 */
function stemming_ru($word, $flags = 0)
{
    global $STEMMING_RU_VOWELS;
    global $STEMMING_RU_PERFECTIVE_GERUND;
    global $STEMMING_RU_ADJECTIVAL1;
    global $STEMMING_RU_ADJECTIVAL2;
    global $STEMMING_RU_VERB1;
    global $STEMMING_RU_VERB2;
    global $STEMMING_RU_NOUN;

    // There is a 33rd letter, Ё, but it is rarely used, so it is mapped into Е.
    $word = str_replace("Ё", "Е", $word);

    // Exceptions: these words are returned without stemming.
    static $STEMMING_RU_EX = array("БЕЗЕ" => true, "БЫЛЬ" => true, "МЕНЮ" => true, "ГРАНАТ" => true, "ГРАНИТ" => true, "ТЕРМИНАЛ" => true, "ИЛИ" => true, "РУКАВ" => true, "ПРИЕМ" => true);
    if (isset($STEMMING_RU_EX[$word])) {
        return $word;
    }

    // Attempt to stem Russian surnames (http://www.gramma.ru/SPR/?id=2.8).
    if ($flags & 1) {
        if (preg_match("/(ОВ|ЕВ)\$/", $word)) {
            // Bare ОВ/ЕВ form: also offer the stem of the form with А appended.
            return array(stemming_ru($word . "А"), stemming_ru($word));
        }
        if (preg_match("/(ОВ|ЕВ)(А|У|ЫМ|Е)\$/", $word, $found)) {
            // Form with a case ending: also offer the stem of the word without that ending.
            return array(stemming_ru($word), stemming_ru(substr($word, 0, -strlen($found[2]))));
        }
    }

    // In any word, RV is the region after the first vowel. A word without a
    // vowel, or with nothing after its first vowel, is returned unchanged.
    // All tests below take place in the RV part of the word.
    $found = array();
    if (!preg_match("/^(.*?[{$STEMMING_RU_VOWELS}])(.+)\$/" . BX_UTF_PCRE_MODIFIER, $word, $found)) {
        return $word;
    }
    $rv = $found[2];
    $word = $found[1];

    // strlen()/substr() may count bytes or characters depending on the runtime
    // and source encoding, so every offset below is derived from strlen() of a
    // literal instead of being hard-coded as 1, 3 or 4.
    $letter_len = strlen("И");

    // Step 1: search for a PERFECTIVE GERUND ending. If one is found remove it,
    // and that is then the end of step 1.
    if (preg_match($STEMMING_RU_PERFECTIVE_GERUND, $rv, $found)) {
        switch ($found[0]) {
            case "АВ":
            case "АВШИ":
            case "АВШИСЬ":
            case "ЯВ":
            case "ЯВШИ":
            case "ЯВШИСЬ":
                // Keep the leading А/Я of the ending, drop the rest.
                $rv = substr($rv, 0, $letter_len - strlen($found[0]));
                break;
            default:
                $rv = substr($rv, 0, -strlen($found[0]));
        }
    } else {
        // Otherwise drop a reflexive ending, then try ADJECTIVAL, VERB and NOUN
        // endings in that order; the first group that matches wins.
        $rv = preg_replace("/(СЯ|СЬ)\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
        if (preg_match($STEMMING_RU_ADJECTIVAL1, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[2]));
        } elseif (preg_match($STEMMING_RU_ADJECTIVAL2, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[0]));
        } elseif (preg_match($STEMMING_RU_VERB1, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[2]));
        } elseif (preg_match($STEMMING_RU_VERB2, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[0]));
        } else {
            $rv = preg_replace($STEMMING_RU_NOUN, "", $rv);
        }
    }

    // Step 2: if the word ends with И, remove it.
    if (substr($rv, -$letter_len) === "И") {
        $rv = substr($rv, 0, -$letter_len);
    }

    // Step 3: remove a DERIVATIONAL ending (ОСТЬ/ОСТ), but only when the whole
    // ending lies in R2.
    if (preg_match("/(ОСТЬ|ОСТ)\$/" . BX_UTF_PCRE_MODIFIER, $rv, $found)) {
        // The pattern consumes everything in front of R2, mirroring the scan
        // over RV: leading vowels, one more letter (end of R1), then
        // non-vowels, vowels and one more letter (end of R2).
        $before_r2 = array();
        $r2_pattern = "/^[{$STEMMING_RU_VOWELS}]*.?[^{$STEMMING_RU_VOWELS}]*[{$STEMMING_RU_VOWELS}]*.?/s" . BX_UTF_PCRE_MODIFIER;
        if (
            preg_match($r2_pattern, $rv, $before_r2)
            && strlen($rv) >= strlen($before_r2[0]) + strlen($found[0])
        ) {
            $rv = substr($rv, 0, strlen($rv) - strlen($found[0]));
        }
    }

    // Step 4: remove a SUPERLATIVE ending if present, then either undouble a
    // trailing НН or, when there was nothing to undouble, remove a trailing Ь.
    $rv = preg_replace("/(ЕЙШЕ|ЕЙШ)\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
    $undoubled = preg_replace("/НН\$/" . BX_UTF_PCRE_MODIFIER, "Н", $rv);
    if ($undoubled !== $rv) {
        $rv = $undoubled;
    } else {
        $rv = preg_replace("/Ь\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
    }

    return $word . $rv;
}
Exemple #3
0
/**
 * Reduces an upper-case Russian word to its stem.
 *
 * The word is split at its first vowel; the part after that vowel (the "RV"
 * region) is run through four suffix-stripping steps and then glued back onto
 * the untouched head. Words found in the exception table are returned as they are.
 *
 * Depends on the global $STEMMING_RU_* variables and on the
 * BX_UTF_PCRE_MODIFIER constant, all of which are defined outside this function.
 * NOTE(review): $STEMMING_RU_VOWELS is presumably a plain list of vowel letters,
 * since it is spliced into a regex character class - confirm at its definition.
 *
 * @param string $word  Word to stem; every literal it is compared with is upper-case Cyrillic.
 * @param int    $flags When bit 0x01 is set, words ending in ОВ/ЕВ (optionally followed by
 *                      А, У, ЫМ or Е) are treated as surnames and two candidate stems are returned.
 *
 * @return string|array The stem, or - in surname mode, for matching words only - an array
 *                      holding two stems.
 */
function stemming_ru($word, $flags = 0)
{
    global $STEMMING_RU_VOWELS;
    global $STEMMING_RU_PERFECTIVE_GERUND;
    global $STEMMING_RU_ADJECTIVAL1;
    global $STEMMING_RU_ADJECTIVAL2;
    global $STEMMING_RU_VERB1;
    global $STEMMING_RU_VERB2;
    global $STEMMING_RU_NOUN;

    // There is a 33rd letter, Ё, but it is rarely used, so it is mapped into Е.
    $word = str_replace("Ё", "Е", $word);

    // Exceptions: these words are returned without stemming.
    static $STEMMING_RU_EX = array("БЕЗЕ" => true, "БЫЛЬ" => true, "МЕНЮ" => true, "ГРАНАТ" => true, "ГРАНИТ" => true, "ТЕРМИНАЛ" => true, "ИЛИ" => true, "РУКАВ" => true, "ПРИЕМ" => true);
    if (isset($STEMMING_RU_EX[$word])) {
        return $word;
    }

    // Attempt to stem Russian surnames (http://www.gramma.ru/SPR/?id=2.8).
    if ($flags & 1) {
        if (preg_match("/(ОВ|ЕВ)\$/", $word)) {
            // Bare ОВ/ЕВ form: also offer the stem of the form with А appended.
            return array(stemming_ru($word . "А"), stemming_ru($word));
        }
        if (preg_match("/(ОВ|ЕВ)(А|У|ЫМ|Е)\$/", $word, $found)) {
            // Form with a case ending: also offer the stem of the word without that ending.
            return array(stemming_ru($word), stemming_ru(substr($word, 0, -strlen($found[2]))));
        }
    }

    // In any word, RV is the region after the first vowel. A word without a
    // vowel, or with nothing after its first vowel, is returned unchanged.
    // All tests below take place in the RV part of the word.
    $found = array();
    if (!preg_match("/^(.*?[{$STEMMING_RU_VOWELS}])(.+)\$/" . BX_UTF_PCRE_MODIFIER, $word, $found)) {
        return $word;
    }
    $rv = $found[2];
    $word = $found[1];

    // strlen()/substr() may count bytes or characters depending on the runtime
    // and source encoding, so every offset below is derived from strlen() of a
    // literal instead of being hard-coded as 1, 3 or 4.
    $letter_len = strlen("И");

    // Step 1: search for a PERFECTIVE GERUND ending. If one is found remove it,
    // and that is then the end of step 1.
    if (preg_match($STEMMING_RU_PERFECTIVE_GERUND, $rv, $found)) {
        switch ($found[0]) {
            case "АВ":
            case "АВШИ":
            case "АВШИСЬ":
            case "ЯВ":
            case "ЯВШИ":
            case "ЯВШИСЬ":
                // Keep the leading А/Я of the ending, drop the rest.
                $rv = substr($rv, 0, $letter_len - strlen($found[0]));
                break;
            default:
                $rv = substr($rv, 0, -strlen($found[0]));
        }
    } else {
        // Otherwise drop a reflexive ending, then try ADJECTIVAL, VERB and NOUN
        // endings in that order; the first group that matches wins.
        $rv = preg_replace("/(СЯ|СЬ)\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
        if (preg_match($STEMMING_RU_ADJECTIVAL1, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[2]));
        } elseif (preg_match($STEMMING_RU_ADJECTIVAL2, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[0]));
        } elseif (preg_match($STEMMING_RU_VERB1, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[2]));
        } elseif (preg_match($STEMMING_RU_VERB2, $rv, $found)) {
            $rv = substr($rv, 0, -strlen($found[0]));
        } else {
            $rv = preg_replace($STEMMING_RU_NOUN, "", $rv);
        }
    }

    // Step 2: if the word ends with И, remove it.
    if (substr($rv, -$letter_len) === "И") {
        $rv = substr($rv, 0, -$letter_len);
    }

    // Step 3: remove a DERIVATIONAL ending (ОСТЬ/ОСТ), but only when the whole
    // ending lies in R2.
    if (preg_match("/(ОСТЬ|ОСТ)\$/" . BX_UTF_PCRE_MODIFIER, $rv, $found)) {
        // The pattern consumes everything in front of R2, mirroring the scan
        // over RV: leading vowels, one more letter (end of R1), then
        // non-vowels, vowels and one more letter (end of R2).
        $before_r2 = array();
        $r2_pattern = "/^[{$STEMMING_RU_VOWELS}]*.?[^{$STEMMING_RU_VOWELS}]*[{$STEMMING_RU_VOWELS}]*.?/s" . BX_UTF_PCRE_MODIFIER;
        if (
            preg_match($r2_pattern, $rv, $before_r2)
            && strlen($rv) >= strlen($before_r2[0]) + strlen($found[0])
        ) {
            $rv = substr($rv, 0, strlen($rv) - strlen($found[0]));
        }
    }

    // Step 4: remove a SUPERLATIVE ending if present, then either undouble a
    // trailing НН or, when there was nothing to undouble, remove a trailing Ь.
    $rv = preg_replace("/(ЕЙШЕ|ЕЙШ)\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
    $undoubled = preg_replace("/НН\$/" . BX_UTF_PCRE_MODIFIER, "Н", $rv);
    if ($undoubled !== $rv) {
        $rv = $undoubled;
    } else {
        $rv = preg_replace("/Ь\$/" . BX_UTF_PCRE_MODIFIER, "", $rv);
    }

    return $word . $rv;
}