/**
 * Normalizes a search phrase into the string handed to the configured search backend.
 *
 * When $bb_cfg['search_engine_type'] is 'sphinx', every character other than
 * latin/cyrillic letters, digits, "_", "-", "*" and "|" is replaced by a space,
 * "-" is kept only as a word prefix, "*" only as a word suffix, and runs of "|"
 * collapse into a single " | ". A phrase that was entirely wrapped in double
 * quotes gets its quotes back after cleaning. For any other engine the trimmed
 * phrase is passed through DB()->escape().
 *
 * @param string $text             Raw search phrase.
 * @param bool   $ltrim_star       When true, leading "*", "-" and "!" are stripped together with spaces;
 *                                 otherwise only leading spaces are stripped.
 * @param bool   $remove_stopwords When true, the phrase is passed through remove_stopwords().
 * @param bool   $die_if_empty     When true, bb_die() is called with $lang['NO_SEARCH_MATCH']
 *                                 if the cleaned phrase is empty.
 *
 * @return string The cleaned phrase (for non-sphinx engines: whatever DB()->escape() returns).
 */
function clean_text_match($text, $ltrim_star = true, $remove_stopwords = false, $die_if_empty = false)
{
    global $bb_cfg, $lang;

    $text = str_compact($text);
    $ltrim_chars = $ltrim_star ? ' *-!' : ' ';

    // Must be detected before the filtering below, which removes the quotes themselves.
    $wrap_with_quotes = preg_match('#^"[^"]+"$#', $text);

    // Pad with spaces so the whitespace look-behind patterns below also apply to the first and last word.
    $text = ' ' . str_compact(ltrim($text, $ltrim_chars)) . ' ';

    if ($remove_stopwords) {
        $text = remove_stopwords($text);
    }

    if ($bb_cfg['search_engine_type'] == 'sphinx') {
        $text = preg_replace('#(?<=\\S)\\-#u', ' ', $text);                      // "1-2-3" -> "1 2 3"
        $text = preg_replace('#[^0-9a-zA-Zа-яА-ЯёЁ\\-_*|]#u', ' ', $text);       // allowed characters (except ", which is handled separately)
        $text = str_replace('-', ' -', $text);                                   // "-" only at the start of a word
        $text = str_replace('*', '* ', $text);                                   // "*" only at the end of a word
        $text = preg_replace('#\\s*\\|\\s*#u', '|', $text);                      // "| " -> "|"
        $text = preg_replace('#\\|+#u', ' | ', $text);                           // "||" -> "|"
        $text = preg_replace('#(?<=\\s)[\\-*]+\\s#u', ' ', $text);               // standalone " - ", " * "
        $text = trim($text, ' -|');
        $text = str_compact($text);
        $text_match_sql = ($wrap_with_quotes && $text != '') ? '"' . $text . '"' : $text;
    } else {
        $text_match_sql = DB()->escape(trim($text));
    }

    // Compare against the empty string explicitly: a plain truthiness test would
    // treat the legitimate phrase "0" as empty and reject it.
    $is_empty = ((string) $text_match_sql === '');

    if ($is_empty && $die_if_empty) {
        bb_die($lang['NO_SEARCH_MATCH']);
    }

    return $text_match_sql;
}
/**
 * Breaks a text into a list of unique lowercase words, filtered by length and capped in count.
 *
 * Limits are read from $bb_cfg: 'max_search_words_per_post', 'search_min_word_len'
 * and 'search_max_word_len'. A word is kept when its length is strictly greater than
 * max(2, search_min_word_len - 1) and not greater than search_max_word_len.
 *
 * @param string $text Source text; tags, HTML entities, URLs and BBCode are stripped before splitting.
 *
 * @return array Sequentially indexed list of words, at most 'max_search_words_per_post' entries.
 */
function extract_search_words($text)
{
    global $bb_cfg;

    $word_limit  = $bb_cfg['max_search_words_per_post'];
    $len_floor   = max(2, $bb_cfg['search_min_word_len'] - 1);
    $len_ceiling = $bb_cfg['search_max_word_len'];

    // Lowercase, drop tags, squeeze whitespace and pad with a space on both ends.
    $lowered = mb_strtolower($text);
    $text = ' ' . str_compact(strip_tags($lowered)) . ' ';

    // NOTE(review): as written this replaces '[' and ']' with themselves (a no-op);
    // the search strings may originally have been HTML entities - confirm against upstream.
    $text = str_replace(array('[', ']'), array('[', ']'), $text);

    // Drop named and numeric HTML entities.
    $text = preg_replace('/(\\w*?)&#?[0-9a-z]+;(\\w*?)/iu', '', $text);

    // Replace scheme://... URLs with a space.
    $text = preg_replace('#\\b[a-z0-9]+://[\\w\\#!$%&~/.\\-;:=,?@а-яА-Я\\[\\]+]+(/[0-9a-z\\?\\.%_\\-\\+=&/]+)?#u', ' ', $text);

    // Leftover "[url=" openers, question marks and exclamation marks become spaces (applied in this order).
    $text = str_replace(array('[url=', '?', '!'), ' ', $text);

    $text = strip_bbcode($text);

    // Remaining punctuation (. , : ;) becomes a word separator.
    $text = preg_replace('#[.,:;]#u', ' ', $text);

    $text = remove_stopwords($text);

    // Collapse whitespace, split on single spaces and keep the first occurrence of each word.
    $candidates = array_unique(explode(' ', str_compact($text)));

    // Length filtering is done per word here, after splitting.
    $words = array();
    foreach ($candidates as $candidate) {
        $length = mb_strlen($candidate);
        $fits = ($length > $len_floor) && ($length <= $len_ceiling);
        if ($fits) {
            $words[] = $candidate;
        }
    }

    // Keep only the leading $word_limit words when there are too many.
    if (count($words) > $word_limit) {
        $words = array_slice($words, 0, $word_limit);
    }

    return $words;
}
$count = 0; if (!$result) { echo 'no result available'; die; } else { $stopwords = load_stopwords('stopwords.txt'); $keyword_occur = array(); $previous_keyword_string = 'thequickbrownfoxjumpsoverthelazydog'; $previous_keywords = array(); while ($row = mysql_fetch_array($result)) { $query_string = $row['keywords']; $keyword_string = extract_keywords($query_string); $product = $row['url']; if ($keyword_string != $previous_keyword_string) { $keywords = keywords_array($keyword_string); $keywords = remove_stopwords($keywords, $stopwords); foreach ($keywords as $keyword) { if (!isset($keyword_occur[$keyword])) { $keyword_occur[$keyword] = 0; } } $previous_keyword_string = $keyword_string; $previous_keywords = $keywords; } if (count($previous_keywords) == 0) { continue; } //avoid unnecessary insertion //TODO hey! you forget to check whether the pair already exist in db! $insert_query = 'INSERT INTO m2_keyword_product (keyword, product) VALUES '; foreach ($previous_keywords as $keyword) {