/**
 * Tests that search_simplify() does the right thing with punctuation.
 *
 * Every case is passed through search_simplify(), trimmed, and compared with
 * the expected text using a strict (identical) comparison. A loose comparison
 * is not sufficient here: PHP compares two numeric strings by numeric value,
 * so an output such as '020039428876' would wrongly be accepted as equal to
 * the expected '20039428876'.
 */
function testSearchSimplifyPunctuation() {
  // Each case is: input text, expected simplified text, assertion message.
  $cases = array(
    array('20.03/94-28,876', '20039428876', 'Punctuation removed from numbers'),
    array('great...drupal--module', 'great drupal module', 'Multiple dot and dashes are word boundaries'),
    array('very_great-drupal.module', 'verygreatdrupalmodule', 'Single dot, dash, underscore are removed'),
    array('regular,punctuation;word', 'regular punctuation word', 'Punctuation is a word boundary'),
  );

  foreach ($cases as $case) {
    // trim() always yields a string, so the strict check only tightens the
    // comparison of the text itself.
    $out = trim(search_simplify($case[0]));
    $this->assertIdentical($out, $case[1], $case[2]);
  }
}
/**
 * Checks that plain alphabetic text is left alone by the CJK tokenizer.
 *
 * Sanity check only: with CJK tokenizing switched on and the minimum word
 * size at 1, a run of Latin letters must come back from search_simplify()
 * unchanged.
 */
function testNoTokenizer() {
  // A minimum word size of 1 would split every CJK character; CJK overlap
  // handling is enabled as well so the tokenizer is definitely active.
  $settings = $this->config('search.settings');
  $settings
    ->set('index.minimum_word_size', 1)
    ->set('index.overlap_cjk', TRUE)
    ->save();
  $this->refreshVariables();

  $alphabet = 'abcdefghijklmnopqrstuvwxyz';
  $simplified = trim(search_simplify($alphabet));

  $this->assertEqual($alphabet, $simplified, 'Letters are not CJK tokenized');
}
/**
 * Splits the search expression into keywords and builds SQL conditions.
 *
 * Documented to set up the following variables (some of them possibly via
 * parseWord(), which is called for every positive keyword):
 * - $this->keys
 * - $this->words
 * - $this->conditions
 * - $this->simple
 * - $this->matches
 *
 * $this->status additionally receives the EXPRESSIONS_IGNORED and
 * LOWER_CASE_OR flags when the corresponding situations are detected.
 */
protected function parseSearchExpression() {
  // A token is anything between two spaces, optionally quoted and optionally
  // prefixed by a - sign. A leading space is added so the first word matches.
  preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' ' . $this->searchExpression, $tokens, PREG_SET_ORDER);

  if (!count($tokens)) {
    return;
  }

  // State used while classifying the tokens.
  $in_or_group = FALSE;
  $max_combinations = \Drupal::config('search.settings')->get('and_or_limit');
  // Starts at -1 because the first search expression does not count as AND.
  $and_total = -1;
  $or_total = 0;

  foreach ($tokens as $token) {
    if ($or_total && $and_total + $or_total >= $max_combinations) {
      // Too many AND/OR combinations: drop the remaining expressions to
      // guard against Denial-of-Service attacks.
      $this->status |= SearchQuery::EXPRESSIONS_IGNORED;
      break;
    }

    // Remove the surrounding quotes of a phrase.
    $is_phrase = FALSE;
    if ($token[2][0] == '"') {
      $token[2] = substr($token[2], 1, -1);
      $is_phrase = TRUE;
      $this->simple = FALSE;
    }

    // Run the keyword through the same simplification used at indexing time
    // so that it can match the search index.
    $simplified = search_simplify($token[2]);
    // Simplification may have produced several words; split them again
    // unless the token is a phrase, which stays in one piece.
    if ($is_phrase) {
      $words = array($simplified);
    }
    else {
      $words = preg_split('/ /', $simplified, -1, PREG_SPLIT_NO_EMPTY);
    }

    // Negative match.
    if ($token[1] == '-') {
      $this->keys['negative'] = array_merge($this->keys['negative'], $words);
      $in_or_group = FALSE;
      continue;
    }

    // OR operator following at least one positive keyword.
    if ($token[2] == 'OR' && count($this->keys['positive'])) {
      $previous = array_pop($this->keys['positive']);
      // Turn the previous keyword into a group when a new OR starts.
      if (!is_array($previous)) {
        $previous = array($previous);
      }
      $this->keys['positive'][] = $previous;
      $in_or_group = TRUE;
      $or_total++;
      continue;
    }

    // AND is implicit, so the operator itself is skipped.
    if ($token[2] == 'AND' || $token[2] == 'and') {
      continue;
    }

    // Ordinary positive keyword.
    if ($token[2] == 'or') {
      // Lower-case "or" instead of "OR" is a warning condition.
      $this->status |= SearchQuery::LOWER_CASE_OR;
    }
    if ($in_or_group) {
      // Append to the last element, which is an array of ORed terms.
      $last_index = count($this->keys['positive']) - 1;
      $this->keys['positive'][$last_index] = array_merge($this->keys['positive'][$last_index], $words);
    }
    else {
      $this->keys['positive'] = array_merge($this->keys['positive'], $words);
      $and_total++;
    }
    $in_or_group = FALSE;
  }

  // Translate the collected keywords into SQL conditions.
  $has_and = FALSE;
  $has_or = FALSE;

  // Positive matches.
  foreach ($this->keys['positive'] as $entry) {
    if (!is_array($entry) || !count($entry)) {
      // Single ANDed term.
      $has_and = TRUE;
      list($new_scores, $valid_words) = $this->parseWord($entry);
      $this->conditions->condition('d.data', "% {$entry} %", 'LIKE');
      if (!$valid_words) {
        $this->simple = FALSE;
      }
      // Each AND keyword needs to match at least once.
      $this->matches += $new_scores;
      continue;
    }

    // Group of ORed terms.
    $has_or = TRUE;
    $scored = FALSE;
    $or_conditions = db_or();
    foreach ($entry as $alternative) {
      list($new_scores) = $this->parseWord($alternative);
      $scored |= $new_scores;
      $or_conditions->condition('d.data', "% {$alternative} %", 'LIKE');
    }
    if (count($or_conditions)) {
      $this->conditions->condition($or_conditions);
      // A group of OR keywords only needs to match once.
      $this->matches += $scored > 0;
    }
  }

  // Mixing ANDed terms with OR groups makes the query non-simple.
  if ($has_and && $has_or) {
    $this->simple = FALSE;
  }

  // Negative matches.
  foreach ($this->keys['negative'] as $excluded) {
    $this->conditions->condition('d.data', "% {$excluded} %", 'NOT LIKE');
    $this->simple = FALSE;
  }
}