/**
 * Splits text into lowercase tokens and counts how often each token occurs.
 *
 * When given an array, every element is tokenized recursively and the
 * per-element results are combined with array_merge(); for a token that
 * appears in several elements the count from the last element wins.
 *
 * Before splitting, the text is passed through relevanssi_thousandsep()
 * (only if that function exists) and the 'relevanssi_remove_punctuation'
 * filter, then lowercased. Tokens are separated by newlines, tabs and spaces.
 *
 * @param string|array $str             Text to tokenize, or an array of texts.
 * @param bool         $remove_stops    When true, tokens present in the list returned
 *                                      by relevanssi_fetch_stopwords() are dropped.
 * @param int          $min_word_length Tokens shorter than this many characters are
 *                                      dropped; the default of -1 never drops anything.
 *
 * @return array Map of token => occurrence count. Numeric tokens are keyed with a
 *               leading space (" 42") so PHP does not cast them to integer keys.
 */
function relevanssi_tokenize($str, $remove_stops = true, $min_word_length = -1)
{
    $tokens = array();
    if (is_array($str)) {
        foreach ($str as $part) {
            $tokens = array_merge($tokens, relevanssi_tokenize($part, $remove_stops, $min_word_length));
        }
        return $tokens;
    }

    if (function_exists('mb_internal_encoding')) {
        mb_internal_encoding("UTF-8");
    }

    $stopword_list = array();
    if ($remove_stops) {
        $stopword_list = relevanssi_fetch_stopwords();
        if (!is_array($stopword_list)) {
            // Guard against a non-array result so count()/in_array() never see it.
            $stopword_list = array();
        }
    }
    $check_stopwords = $remove_stops && count($stopword_list) > 0;

    if (function_exists('relevanssi_thousandsep')) {
        $str = relevanssi_thousandsep($str);
    }
    $str = apply_filters('relevanssi_remove_punctuation', $str);
    if (function_exists('mb_strtolower')) {
        $str = mb_strtolower($str);
    } else {
        $str = strtolower($str);
    }

    // Split up front instead of iterating with strtok(): strtok() keeps global
    // state, and the filter callbacks invoked inside the loop could reset it.
    // PREG_SPLIT_NO_EMPTY mirrors strtok() skipping empty tokens.
    $words = preg_split('/[\n\t ]+/', $str, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false) {
        $words = array();
    }

    $use_mb_strlen = function_exists('mb_strlen');
    $is_premium    = defined('RELEVANSSI_PREMIUM') && RELEVANSSI_PREMIUM;

    foreach ($words as $t) {
        // Measure in characters, not bytes, so multibyte words are not
        // counted as longer than they are.
        $length = $use_mb_strlen ? mb_strlen($t) : strlen($t);
        if ($length < $min_word_length) {
            continue;
        }

        // Strict comparison: a loose in_array() would treat numeric-looking
        // strings such as "1e3" and "1000" as equal.
        $accept = !($check_stopwords && in_array($t, $stopword_list, true));

        // The premium filter runs for every token that passed the length
        // check, including ones rejected as stopwords (as before).
        if ($is_premium) {
            $t = apply_filters('relevanssi_premium_tokenizer', $t);
        }

        if (!$accept) {
            continue;
        }

        $t = relevanssi_mb_trim($t);
        if (is_numeric($t)) {
            // $t ends up as an array index, and numbers just don't work there
            $t = " {$t}";
        }
        if (!isset($tokens[$t])) {
            $tokens[$t] = 1;
        } else {
            $tokens[$t]++;
        }
    }

    return $tokens;
}
Example #2
0
/**
 * Splits text into lowercase tokens and counts how often each token occurs.
 *
 * When given an array, every element is tokenized recursively and the
 * per-element results are combined; for a token that appears in several
 * elements the count from the last element wins.
 *
 * Before splitting, the text is passed through the
 * 'relevanssi_remove_punctuation' filter and lowercased. Tokens are
 * separated by newlines, tabs and spaces.
 *
 * @param string|array $str          Text to tokenize, or an array of texts.
 * @param bool         $remove_stops When true, tokens present in the list returned
 *                                   by relevanssi_fetch_stopwords() are dropped.
 *
 * @return array Map of token => occurrence count. Note that PHP stores
 *               integer-like tokens such as "2020" under integer keys.
 */
function relevanssi_tokenize($str, $remove_stops = true)
{
    $tokens = array();
    if (is_array($str)) {
        foreach ($str as $part) {
            // array_replace() rather than array_merge(): integer-like tokens are
            // stored under integer keys, and array_merge() would renumber those
            // keys (turning the token 2020 into 0). For string keys both
            // functions behave the same: the later value wins.
            $tokens = array_replace($tokens, relevanssi_tokenize($part, $remove_stops));
        }
        return $tokens;
    }

    if (function_exists('mb_internal_encoding')) {
        mb_internal_encoding("UTF-8");
    }

    $stopword_list = array();
    if ($remove_stops) {
        $stopword_list = relevanssi_fetch_stopwords();
        if (!is_array($stopword_list)) {
            // Guard against a non-array result so count()/in_array() never see it.
            $stopword_list = array();
        }
    }
    $check_stopwords = $remove_stops && count($stopword_list) > 0;

    $str = apply_filters('relevanssi_remove_punctuation', $str);
    // Fall back to strtolower() when mbstring is unavailable instead of
    // failing with an undefined-function error.
    if (function_exists('mb_strtolower')) {
        $str = mb_strtolower($str);
    } else {
        $str = strtolower($str);
    }

    // Split up front instead of iterating with strtok(), whose global state
    // can be disturbed by other code. PREG_SPLIT_NO_EMPTY mirrors strtok()
    // skipping empty tokens.
    $words = preg_split('/[\n\t ]+/', $str, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false) {
        $words = array();
    }

    foreach ($words as $t) {
        // Strict comparison: a loose in_array() would treat numeric-looking
        // strings such as "1e3" and "1000" as equal.
        if ($check_stopwords && in_array($t, $stopword_list, true)) {
            continue;
        }

        $t = relevanssi_mb_trim($t);
        if (!isset($tokens[$t])) {
            $tokens[$t] = 1;
        } else {
            $tokens[$t]++;
        }
    }

    return $tokens;
}