/**
 * Adds a term to this expression, first overwriting the term's boost with
 * the boost value stored on this expression.
 *
 * @param nc_search_query_expression_term $item term to append
 * @return nc_search_query_expression_phrase the value returned by parent::add_item()
 */
public function add_item(nc_search_query_expression_term $item) {
    // the item inherits the boost of the expression it is added to
    $item->set_boost($this->boost);

    $result = parent::add_item($item);

    return $result;
}
/**
 * Resolves a query term to the codes of all its forms.
 *
 * 1) Runs the term through the text filters and converts the resulting
 *    forms to term codes.
 * 2) Adds a ranking clause for the term to the query builder (if necessary).
 * 3) Returns the codes matching all forms of the term (base forms, synonyms).
 *
 * The returned array is a zero-based list and always contains at least one
 * element:
 *  - if the term is a stop-word (the filters produce no forms), the array
 *    holds a single empty string "";
 *  - if none of the term forms is present in the index, the array holds the
 *    dummy code "____".
 *
 * Side effect: when the term has no codes and is required for every document,
 * its value is appended to $this->unknown_required_terms.
 *
 * @param nc_search_query_expression_term $expression
 * @return array
 */
protected function process_term(nc_search_query_expression_term $expression) {
    $string = $expression->get_value();

    // convert term to base forms
    $base_forms = $this->text_filters->apply('filter', array($string));
    if (!count($base_forms)) {
        // it's a stop-word obviously
        return array("");
    }

    // get codes; array_unique() keeps the keys of the surviving elements, so
    // the result is re-indexed to honour the documented zero-based contract
    $codes = array_values(array_unique($this->provider->get_term_codes($base_forms, false)));

    // the term takes part in the ranking only if it is not excluded
    $is_excluded = $this->is_inside("not") || $expression->is_excluded();

    // check whether there is at least one code
    if (count($codes) == 0) {
        // is this term required for all documents?
        $is_required = !$is_excluded &&
                       ($this->is_root() ||
                        $expression->is_required() ||
                        $this->parent_is("phrase") ||
                        $this->parent_is("and") && !$this->is_inside("or"));

        if ($is_required) {
            // this query won't produce any results anyway, so we could
            // spare a database request later
            $this->unknown_required_terms[] = $string;
        }

        $codes[] = "____"; // dummy "non-existent term" code
    } elseif (!$is_excluded) {
        $this->query_builder->add_term_ranking($expression->get_field(), $codes, $expression->get_boost());
    }

    return $codes;
}
/**
 * Parses a query string into a tree of query expression objects.
 *
 * Tokens are taken from the string one at a time with remove_next_token()
 * until it returns null. Each token either produces an expression (term,
 * phrase, group, interval, fuzzy term, wildcard), sets a modifier for the
 * upcoming expression (field name, "+", "-", NOT), selects the joining
 * operator (AND / OR), adjusts the previous expression (boost, phrase
 * distance), or is discarded. Produced expressions are merged into $root
 * according to the current operator.
 *
 * @param string $query_string query text; unless $is_recursive_call is true
 *      it is converted to UTF-8 from nc_Core's NC_CHARSET
 * @param boolean $is_recursive_call when true the encoding conversion is
 *      skipped (presumably passed by the group parser for nested groups;
 *      verify against remove_group())
 * @return nc_search_query_expression the root of the parsed tree, or an
 *      nc_search_query_expression_empty instance if no expression was produced
 */
public function parse($query_string, $is_recursive_call = false) {
    if (!$is_recursive_call) {
        // change string encoding to UTF-8 or ensure it's not broken if it is
        // already UTF-8
        $query_string = mb_convert_encoding($query_string, 'UTF-8', nc_Core::get_object()->NC_CHARSET);
    }

    /*
     * LEXEMES
     *
     * simple/terminal:
     *   term
     *   wildcard*
     *   wildcard?
     *
     * group (inside):
     *   (a b)  -- essentially "a AND b" or "a OR b"
     *   "a b"
     *
     * group (left and right)
     *   AND  &&
     *   OR   ||
     *   [a TO b]
     *   {a TO b}
     *
     *   (implicit AND or OR)
     *
     * wrap following expression:
     *   NOT  !
     *
     * modify next expression:
     *   field_name:
     *   +
     *   -   (must be preceded with a whitespace if not at the beginning of the string)
     *
     * modify previous expression:
     *   ^2
     *   ~0.5  (for term: fuzzy search) --- extracted with the preceding term
     *   ~2    (for phrase: proximity search)
     *
     * special rules:
     *   - terms with both letters and numbers are considered a phrase:
     *       x123y567z → phrase("x 123 y 567 z")
     *     inside quotes: "price usd50" → phrase("price usd 50")
     *   - decimal fractions are considered a phrase:
     *       0.123 → phrase("0 123")
     *       "price 0.12" → phrase("price 0 12")
     */

    $query_remainder = $query_string; // part of the query string that is not parsed yet
    $root = null; // result of the parsing
    $previous = null; // previous expression
    $operator = $this->default_operator; // joining operator ("AND", "OR")
    // NOTE(review): this flag is set when the *current* token yields a group and
    // is cleared only after a non-first expression has been stored, so it is
    // still true when the expression following a leading group is attached
    $previous_was_group = false;
    $next_not = $next_required = $next_excluded = false; // modifiers for the upcoming token
    $next_field_name = null; // field name modifier

    while (true) {
        $expression = null;
        // $query_remainder is handed to the remove_*() helpers, which are expected
        // to consume the parsed part from it (presumably by reference; verify)
        $token = $this->remove_next_token($query_remainder);
        if ($token === null) {
            break;
        }

        // ----- make sense of the received token:
        if ($token == "(") { // start of the group?
            $expression = $this->remove_group($query_remainder); //may return null if parentheses are not balanced
            if ($expression) {
                $previous_was_group = true;
            }
        } elseif ($token == '"') { // phrase?
            $expression = $this->remove_phrase($query_remainder); // may return null if not a phrase
        } elseif (($token == "[" || $token == "{") && nc_search::should('AllowRangeSearch')) { // can be an interval
            $expression = $this->remove_interval($query_remainder, $token); // may return null if not an interval
        } elseif (substr($token, -1) == ":" && nc_search::should('AllowFieldSearch')) { // field name!
            // strip the trailing colon; applied to the next produced expression
            $next_field_name = substr($token, 0, -1);
        } elseif ($token == "+") { // "required" sign (not same as AND if default operator is OR)
            $next_required = true;
        } elseif ($token == "-" && !$previous || strlen($token) > 1 && trim($token) == "-") {
            // (a) "excluded" sign at the beginning of the query (not same as NOT if default operator is OR)
            // (b) "excluded" sign elsewhere (separated by the space)
            $next_excluded = true;
        } elseif ($token == "!" || $token == "NOT") { // boolean operators are case-sensitive
            $next_not = true; // wrap next item inside NOT
        } elseif ($token == "&&" || $token == "AND") {
            $operator = "AND";
        } elseif ($token == "||" || $token == "OR") {
            $operator = "OR";
        } elseif (strpos($token, "~") > 0 && preg_match("/^[{$this->term_chars}]+~/u", $token)) { // fuzzy search
            // "> 0": the tilde must not be the first character of the token
            list($term, $similarity) = explode("~", $token); // decimal value ("0.5")
            if (nc_search::should('AllowFuzzySearch')) {
                $expression = new nc_search_query_expression_fuzzy($term, $similarity);
            } else {
                // fuzzy search is disabled: fall back to a plain term
                $expression = new nc_search_query_expression_term($term);
            }
        } elseif ($token[0] == "~" && nc_search::should('AllowProximitySearch')) { // phrase word distance option
            $value = substr($token, 1); // integer value
            if ($previous instanceof nc_search_query_expression_phrase) {
                $previous->set_distance($value);
            }
            // no fallback, throw the token out
        } elseif ($token[0] == "^" && nc_search::should('AllowTermBoost')) { // term and phrase boost
            $value = substr($token, 1); // integer or decimal value
            if ($previous instanceof nc_search_query_expression_term || $previous instanceof nc_search_query_expression_phrase) {
                $previous->set_boost($value);
            }
            // no fallback, just discard (complicated: decimal value can result in two terms)
        } elseif ((strpos($token, "*") || strpos($token, "?")) && nc_search::should('AllowWildcardSearch')) {
            // wildcard; can't be the first symbol
            // (strpos() returns 0 for a leading wildcard, which is falsy here)
            $expression = new nc_search_query_expression_wildcard($token);
        } elseif ($this->ignore_numbers && preg_match("/\\d/", $token)) {
            // tokens containing digits are dropped when numbers are ignored
            // reset field flag (e.g.: <price:50 term>)
            $next_field_name = null;
        } elseif (ctype_digit($token) && preg_match("/^\\.(\\d+)\\b/", $query_remainder, $match)) { // special case: decimal fractions
            $fraction = $match[1];
            // consume the dot and the fraction digits from the unparsed remainder
            $query_remainder = substr($query_remainder, strlen($fraction) + 1);
            $expression = new nc_search_query_expression_phrase(array($token, $fraction));
            // TODO? such phrases could be flagged so that they are translated into
            // FTS phrases rather than REGEXP expressions
        } elseif (preg_match("/^[{$this->term_chars}]+\$/u", $token)) {
            // special case: treat terms with both letters and numbers as a phrase
            if (preg_match("/\\d/", $token)) {
                // split into alternating digit / non-digit runs, keeping the digits
                $parts = preg_split("/(\\d+)/", $token, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
                $expression = sizeof($parts) == 1 ? new nc_search_query_expression_term($parts[0]) : new nc_search_query_expression_phrase($parts);
            } else {
                $expression = new nc_search_query_expression_term($token);
            }
        } else { // discard unknown tokens
            continue;
        }
        // -----

        // process next token if current token didn't produce an expression
        // (pending modifiers and the selected operator stay in effect)
        if (!$expression) {
            continue;
        }
        // -----

        // set expression flags / options
        $expression->set_field($next_field_name)->set_required($next_required)->set_excluded($next_excluded);
        // reset flags
        $next_field_name = null;
        $next_required = $next_excluded = false;

        if ($next_not) { // wrap inside NOT()
            $expression = new nc_search_query_expression_not($expression);
            $next_not = false;
        }

        // store expression in the $root tree
        if ($root == null) { // first item
            $root = $expression;
        } else { // not a first item
            if ($root instanceof nc_search_query_expression_or) {
                if ($operator == "OR") { // OR+OR=OR
                    $root->add_item($expression);
                } elseif ($previous_was_group) { // (one OR two) AND three
                    $root = $this->create_boolean($operator, $root, $expression);
                } else {
                    // replace last item in OR with an AND expression
                    // (t1 OR t2 AND t3) → OR(t1, AND(t2, t3))
                    // (t1 OR t2 AND t3 AND t4) → OR(t1, AND(t2, t3, t4))
                    $root->conjunct_last($expression);
                }
            } elseif ($root instanceof nc_search_query_expression_and && $operator == "AND") {
                $root->add_item($expression); // AND+AND=AND
            } else {
                // (root=AND && operator=OR) --or-- (root is not boolean)
                // (t1 AND t2 OR t3) → OR(AND(t1, t2), t3)
                $root = $this->create_boolean($operator, $root, $expression);
            }
            // reset flag
            $previous_was_group = false;
        }

        // reset $operator:
        $operator = $this->default_operator;
        // remember previous expression:
        // (this is the NOT wrapper when the expression was negated)
        $previous = $expression;
    } // of "while tokens are coming"

    return $root ? $root : new nc_search_query_expression_empty();
}
/**
 * Prints a single line describing a term expression: the word TERM, the
 * term value in double quotes, then the string returned by get_modifiers().
 *
 * @param nc_search_query_expression_term $expression term to print
 * @return void
 */
protected function translate_term(nc_search_query_expression_term $expression) {
    // the value is read before the modifiers, as in the original expression
    $quoted_value = 'TERM "' . $expression->get_value() . '"';
    $modifiers = $this->get_modifiers($expression);

    $this->print_line($quoted_value . $modifiers);
}