Пример #1
0
 /**
  * Appends a term to this expression, giving it this expression's boost first.
  *
  * The boost of the passed item is overwritten with the value of $this->boost,
  * and the item is then handed over to the parent implementation.
  *
  * @param nc_search_query_expression_term $item term to append
  * @return nc_search_query_expression_phrase result of parent::add_item()
  */
 public function add_item(nc_search_query_expression_term $item)
 {
     // every item shares the boost of its container
     $boost = $this->boost;
     $item->set_boost($boost);

     $result = parent::add_item($item);
     return $result;
 }
Пример #2
0
 /**
  * Converts a term expression into the list of term codes that match it.
  *
  * 1) Runs the term through the text filters and converts the resulting forms
  *    into term codes.
  * 2) Adds a ranking request for the term (when the term is not excluded and
  *    at least one code was found).
  * 3) Returns the codes that correspond to all forms of the term (base forms,
  *    synonyms).
  *
  * Special return values:
  *  - if the filters leave nothing (the term is a stop-word), the result is an
  *    array with a single "" (empty string) element;
  *  - if no code is known for any form of the term, the result is an array
  *    with the dummy element "____".
  *
  * Hence the returned array always contains at least one element. Note that
  * array_unique() keeps the original keys, so the keys of the returned list
  * may have gaps when the provider returned duplicate codes.
  *
  * @param nc_search_query_expression_term $expression
  * @return array
  */
 protected function process_term(nc_search_query_expression_term $expression)
 {
     $term = $expression->get_value();

     // reduce the term to its base forms
     $forms = $this->text_filters->apply('filter', array($term));
     if (count($forms) == 0) {
         // nothing is left after filtering: the term is a stop-word
         return array("");
     }

     // look up the codes of the base forms, dropping duplicates
     $codes = array_unique($this->provider->get_term_codes($forms, false));

     // excluded terms are neither ranked nor treated as required
     $excluded = $this->is_inside("not") || $expression->is_excluded();

     if (count($codes) != 0) {
         // known term: it takes part in the ranking unless it is excluded
         if (!$excluded) {
             $this->query_builder->add_term_ranking($expression->get_field(), $codes, $expression->get_boost());
         }
         return $codes;
     }

     // unknown term: find out whether every document had to contain it
     if (!$excluded) {
         $required = $this->is_root()
             || $expression->is_required()
             || $this->parent_is("phrase")
             || ($this->parent_is("and") && !$this->is_inside("or"));
         if ($required) {
             // the query cannot produce any results, which lets the caller
             // spare a database request later
             $this->unknown_required_terms[] = $term;
         }
     }

     // dummy "non-existent term" code
     $codes[] = "____";
     return $codes;
 }
Пример #3
0
 /**
  * Parses a query string into a tree of query expressions.
  *
  * Tokens are taken from the string one by one with remove_next_token(); each
  * token either produces an expression (term, phrase, group, interval, fuzzy
  * or wildcard term), modifies the upcoming expression (field name, "+", "-",
  * NOT), modifies the previous expression (boost, phrase distance), selects
  * the joining operator (AND/OR) or is discarded.
  *
  * Expressions are joined so that AND binds tighter than OR:
  *   t1 OR t2 AND t3   → OR(t1, AND(t2, t3))
  *   t1 AND t2 OR t3   → OR(AND(t1, t2), t3)
  *   (t1 OR t2) AND t3 → AND(OR(t1, t2), t3)
  * Only an OR root that consists of a single parenthesised group is joined
  * as a whole; a group that appears later in the query is treated like any
  * other operand (t1 OR t2 AND (t3) → OR(t1, AND(t2, t3))).
  *
  * @param string $query_string
  * @param boolean $is_recursive_call when false, the string is first converted
  *     to UTF-8 from the charset reported by nc_Core (NC_CHARSET)
  * @return nc_search_query_expression root of the parsed tree, or an
  *     nc_search_query_expression_empty instance if nothing was parsed
  */
 public function parse($query_string, $is_recursive_call = false)
 {
     if (!$is_recursive_call) {
         // change string encoding to UTF-8 or ensure it's not broken if it is
         // already UTF-8
         $query_string = mb_convert_encoding($query_string, 'UTF-8', nc_Core::get_object()->NC_CHARSET);
     }
     /*
      * LEXEMES
      *
      * simple/terminal:
      *   term
      *   wildcard*
      *   wildcard?
      *
      * group (inside):
      *   (a b)   -- essentially "a AND b" or "a OR b"
      *   "a b"
      *
      * group (left and right)
      *   AND  &&
      *   OR   ||
      *   [a TO b]
      *   {a TO b}
      *
      * (implicit AND or OR)
      *
      * wrap following expression:
      *   NOT  !
      *
      * modify next expression:
      *   field_name:
      *   +
      *   -    (must be preceded with a whitespace if not at the beginning of the string)
      *
      * modify previous expression:
      *   ^2
      *   ~0.5  (for term: fuzzy search)     --- extracted with the preceding term
      *   ~2    (for phrase: proximity search)
      *
      * special rules:
      *   - terms with both letters and numbers are considered a phrase:
      *       x123y567z → phrase("x 123 y 567 z")
      *       inside quotes: "price usd50" → phrase("price usd 50")
      *   - decimal fractions are considered a phrase:
      *       0.123 → phrase("0 123")
      *       "price 0.12" → phrase("price 0 12")
      */
     // part of the query string that is not parsed yet
     $query_remainder = $query_string;
     // result of the parsing
     $root = null;
     // previous expression
     $previous = null;
     // joining operator ("AND", "OR")
     $operator = $this->default_operator;
     // true while $root consists of a single parenthesised group to which
     // nothing has been joined yet
     $root_is_group = false;
     // modifiers for the upcoming token
     $next_not = $next_required = $next_excluded = false;
     // field name modifier
     $next_field_name = null;
     while (true) {
         $expression = null;
         // whether the expression produced by the current token is a group
         $is_group = false;
         $token = $this->remove_next_token($query_remainder);
         if ($token === null) {
             break;
         }
         // ----- make sense of the received token:
         if ($token == "(") {
             // start of the group?
             $expression = $this->remove_group($query_remainder);
             // may return null if parentheses are not balanced
             if ($expression) {
                 $is_group = true;
             }
         } elseif ($token == '"') {
             // phrase?
             $expression = $this->remove_phrase($query_remainder);
             // may return null if not a phrase
         } elseif (($token == "[" || $token == "{") && nc_search::should('AllowRangeSearch')) {
             // can be an interval
             $expression = $this->remove_interval($query_remainder, $token);
             // may return null if not an interval
         } elseif (substr($token, -1) == ":" && nc_search::should('AllowFieldSearch')) {
             // field name!
             $next_field_name = substr($token, 0, -1);
         } elseif ($token == "+") {
             // "required" sign (not same as AND if default operator is OR)
             $next_required = true;
         } elseif ($token == "-" && !$previous || strlen($token) > 1 && trim($token) == "-") {
             // (a) "excluded" sign at the beginning of the query (not same as NOT if default operator is OR)
             // (b) "excluded" sign elsewhere (separated by the space)
             $next_excluded = true;
         } elseif ($token == "!" || $token == "NOT") {
             // boolean operators are case-sensitive
             // wrap next item inside NOT
             $next_not = true;
         } elseif ($token == "&&" || $token == "AND") {
             $operator = "AND";
         } elseif ($token == "||" || $token == "OR") {
             $operator = "OR";
         } elseif (strpos($token, "~") > 0 && preg_match("/^[{$this->term_chars}]+~/u", $token)) {
             // fuzzy search; the part after "~" is a decimal value ("0.5")
             list($term, $similarity) = explode("~", $token);
             if (nc_search::should('AllowFuzzySearch')) {
                 $expression = new nc_search_query_expression_fuzzy($term, $similarity);
             } else {
                 // fuzzy search is off: fall back to a plain term
                 $expression = new nc_search_query_expression_term($term);
             }
         } elseif ($token[0] == "~" && nc_search::should('AllowProximitySearch')) {
             // phrase word distance option (integer value)
             $value = substr($token, 1);
             if ($previous instanceof nc_search_query_expression_phrase) {
                 $previous->set_distance($value);
             }
             // no fallback, throw the token out
         } elseif ($token[0] == "^" && nc_search::should('AllowTermBoost')) {
             // term and phrase boost (integer or decimal value)
             $value = substr($token, 1);
             if ($previous instanceof nc_search_query_expression_term || $previous instanceof nc_search_query_expression_phrase) {
                 $previous->set_boost($value);
             }
             // no fallback, just discard (complicated: decimal value can result in two terms)
         } elseif ((strpos($token, "*") || strpos($token, "?")) && nc_search::should('AllowWildcardSearch')) {
             // wildcard; can't be the first symbol (strpos() === 0 is falsy on purpose)
             $expression = new nc_search_query_expression_wildcard($token);
         } elseif ($this->ignore_numbers && preg_match("/\\d/", $token)) {
             // reset field flag (e.g.: <price:50 term>)
             $next_field_name = null;
         } elseif (ctype_digit($token) && preg_match("/^\\.(\\d+)\\b/", $query_remainder, $match)) {
             // special case: decimal fractions
             $fraction = $match[1];
             // consume the dot and the fraction digits from the remainder
             $query_remainder = substr($query_remainder, strlen($fraction) + 1);
             $expression = new nc_search_query_expression_phrase(array($token, $fraction));
             // TODO? such phrases could be marked so that they are translated into FTS phrases rather than REGEXP expressions
         } elseif (preg_match("/^[{$this->term_chars}]+\$/u", $token)) {
             // special case: treat terms with both letters and numbers as a phrase
             if (preg_match("/\\d/", $token)) {
                 $parts = preg_split("/(\\d+)/", $token, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
                 $expression = sizeof($parts) == 1 ? new nc_search_query_expression_term($parts[0]) : new nc_search_query_expression_phrase($parts);
             } else {
                 $expression = new nc_search_query_expression_term($token);
             }
         } else {
             // discard unknown tokens
             continue;
         }
         // -----
         // process next token if current token didn't produce an expression
         if (!$expression) {
             continue;
         }
         // -----
         // set expression flags / options
         $expression->set_field($next_field_name)->set_required($next_required)->set_excluded($next_excluded);
         // reset flags
         $next_field_name = null;
         $next_required = $next_excluded = false;
         if ($next_not) {
             // wrap inside NOT()
             $expression = new nc_search_query_expression_not($expression);
             $next_not = false;
         }
         // store expression in the $root tree
         if ($root == null) {
             // first item; remember whether the whole root is a group so that
             // the next operator is applied to the group as a whole
             $root = $expression;
             $root_is_group = $is_group;
         } else {
             // not a first item
             if ($root instanceof nc_search_query_expression_or) {
                 if ($operator == "OR") {
                     // OR+OR=OR
                     $root->add_item($expression);
                 } elseif ($root_is_group) {
                     // (one OR two) AND three
                     $root = $this->create_boolean($operator, $root, $expression);
                 } else {
                     // replace last item in OR with an AND expression
                     // (t1 OR t2 AND t3) → OR(t1, AND(t2, t3))
                     // (t1 OR t2 AND t3 AND t4) → OR(t1, AND(t2, t3, t4))
                     // (t1 OR t2 AND (t3)) → OR(t1, AND(t2, t3))
                     $root->conjunct_last($expression);
                 }
             } elseif ($root instanceof nc_search_query_expression_and && $operator == "AND") {
                 // AND+AND=AND
                 $root->add_item($expression);
             } else {
                 // (root=AND && operator=OR) --or-- (root is not boolean)
                 // (t1 AND t2 OR t3) → OR(AND(t1, t2), t3)
                 $root = $this->create_boolean($operator, $root, $expression);
             }
             // something has been joined to the root: it is not a bare group any more
             $root_is_group = false;
         }
         // reset $operator:
         $operator = $this->default_operator;
         // remember previous expression:
         $previous = $expression;
     }
     // of "while tokens are coming"
     return $root ? $root : new nc_search_query_expression_empty();
 }
Пример #4
0
 /**
  * Prints one line that describes a term expression.
  *
  * The line has the form TERM "<value>"<modifiers>, where <value> comes from
  * the expression's get_value() and <modifiers> is whatever get_modifiers()
  * returns for the expression.
  *
  * @param nc_search_query_expression_term $expression
  * @return void
  */
 protected function translate_term(nc_search_query_expression_term $expression)
 {
     // term value in double quotes first, modifiers appended right after it
     $line = 'TERM "' . $expression->get_value() . '"';
     $line .= $this->get_modifiers($expression);

     $this->print_line($line);
 }