/** * Method to process the query input string and extract required, optional, * and excluded tokens; taxonomy filters; and date filters. * * @param string $input The query input string. * @param string $lang The query input language. * @param string $mode The query matching mode. * * @return boolean True on success. * * @since 2.5 * @throws Exception on database error. */ protected function processString($input, $lang, $mode) { // Clean up the input string. $input = html_entity_decode($input, ENT_QUOTES, 'UTF-8'); $input = JString::strtolower($input); $input = preg_replace('#\\s+#mi', ' ', $input); $input = JString::trim($input); $debug = JFactory::getConfig()->get('debug_lang'); /* * First, we need to handle string based modifiers. String based * modifiers could potentially include things like "category:blah" or * "before:2009-10-21" or "type:article", etc. */ $patterns = array('before' => JText::_('COM_FINDER_FILTER_WHEN_BEFORE'), 'after' => JText::_('COM_FINDER_FILTER_WHEN_AFTER')); // Add the taxonomy branch titles to the possible patterns. foreach (FinderIndexerTaxonomy::getBranchTitles() as $branch) { // Add the pattern. $patterns[$branch] = JString::strtolower(JText::_(FinderHelperLanguage::branchSingular($branch))); } // Container for search terms and phrases. $terms = array(); $phrases = array(); // Cleared filter branches. $cleared = array(); /* * Compile the suffix pattern. This is used to match the values of the * filter input string. Single words can be input directly, multi-word * values have to be wrapped in double quotes. */ $quotes = html_entity_decode('‘’'', ENT_QUOTES, 'UTF-8'); $suffix = '(([\\w\\d' . $quotes . '-]+)|\\"([\\w\\d\\s' . $quotes . '-]+)\\")'; /* * Iterate through the possible filter patterns and search for matches. * We need to match the key, colon, and a value pattern for the match * to be valid. */ foreach ($patterns as $modifier => $pattern) { $matches = array(); if ($debug) { $pattern = substr($pattern, 2, -2); } // Check if the filter pattern is in the input string. if (preg_match('#' . $pattern . '\\s*:\\s*' . $suffix . '#mi', $input, $matches)) { // Get the value given to the modifier. $value = isset($matches[3]) ? $matches[3] : $matches[1]; // Now we have to handle the filter string. switch ($modifier) { // Handle a before and after date filters. case 'before': case 'after': // Get the time offset. $offset = JFactory::getApplication()->get('offset'); // Array of allowed when values. $whens = array('before', 'after', 'exact'); // The value of 'today' is a special case that we need to handle. if ($value === JString::strtolower(JText::_('COM_FINDER_QUERY_FILTER_TODAY'))) { $today = JFactory::getDate('now', $offset); $value = $today->format('%Y-%m-%d'); } // Try to parse the date string. $date = JFactory::getDate($value, $offset); // Check if the date was parsed successfully. if ($date->toUnix() !== null) { // Set the date filter. $this->date1 = $date->toSQL(); $this->when1 = in_array($modifier, $whens) ? $modifier : 'before'; } break; // Handle a taxonomy branch filter. // Handle a taxonomy branch filter. default: // Try to find the node id. $return = FinderIndexerTaxonomy::getNodeByTitle($modifier, $value); // Check if the node id was found. if ($return) { // Check if the branch has been cleared. if (!in_array($modifier, $cleared)) { // Clear the branch. $this->filters[$modifier] = array(); // Add the branch to the cleared list. $cleared[] = $modifier; } // Add the filter to the list. $this->filters[$modifier][$return->title] = (int) $return->id; } break; } // Clean up the input string again. $input = str_replace($matches[0], '', $input); $input = preg_replace('#\\s+#mi', ' ', $input); $input = JString::trim($input); } } /* * Extract the tokens enclosed in double quotes so that we can handle * them as phrases. */ if (JString::strpos($input, '"') !== false) { $matches = array(); // Extract the tokens enclosed in double quotes. if (preg_match_all('#\\"([^"]+)\\"#mi', $input, $matches)) { /* * One or more phrases were found so we need to iterate through * them, tokenize them as phrases, and remove them from the raw * input string before we move on to the next processing step. */ foreach ($matches[1] as $key => $match) { // Find the complete phrase in the input string. $pos = JString::strpos($input, $matches[0][$key]); $len = JString::strlen($matches[0][$key]); // Add any terms that are before this phrase to the stack. if (JString::trim(JString::substr($input, 0, $pos))) { $terms = array_merge($terms, explode(' ', JString::trim(JString::substr($input, 0, $pos)))); } // Strip out everything up to and including the phrase. $input = JString::substr($input, $pos + $len); // Clean up the input string again. $input = preg_replace('#\\s+#mi', ' ', $input); $input = JString::trim($input); // Get the number of words in the phrase. $parts = explode(' ', $match); // Check if the phrase is longer than three words. if (count($parts) > 3) { /* * If the phrase is longer than three words, we need to * break it down into smaller chunks of phrases that * are less than or equal to three words. We overlap * the chunks so that we can ensure that a match is * found for the complete phrase and not just portions * of it. */ for ($i = 0, $c = count($parts); $i < $c; $i += 2) { // Set up the chunk. $chunk = array(); // The chunk has to be assembled based on how many // pieces are available to use. switch ($c - $i) { /* * If only one word is left, we can break from * the switch and loop because the last word * was already used at the end of the last * chunk. */ case 1: break 2; // If there words are left, we use them both as // the last chunk of the phrase and we're done. // If there words are left, we use them both as // the last chunk of the phrase and we're done. case 2: $chunk[] = $parts[$i]; $chunk[] = $parts[$i + 1]; break; // If there are three or more words left, we // build a three word chunk and continue on. // If there are three or more words left, we // build a three word chunk and continue on. default: $chunk[] = $parts[$i]; $chunk[] = $parts[$i + 1]; $chunk[] = $parts[$i + 2]; break; } // If the chunk is not empty, add it as a phrase. if (count($chunk)) { $phrases[] = implode(' ', $chunk); $terms[] = implode(' ', $chunk); } } } else { // The phrase is <= 3 words so we can use it as is. $phrases[] = $match; $terms[] = $match; } } } } // Add the remaining terms if present. if (!empty($input)) { $terms = array_merge($terms, explode(' ', $input)); } // An array of our boolean operators. $operator => $translation $operators = array('AND' => JString::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_AND')), 'OR' => JString::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_OR')), 'NOT' => JString::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_NOT'))); // If language debugging is enabled you need to ignore the debug strings in matching. if (JDEBUG) { $debugStrings = array('**', '??'); $operators = str_replace($debugStrings, '', $operators); } /* * Iterate through the terms and perform any sorting that needs to be * done based on boolean search operators. Terms that are before an * and/or/not modifier have to be handled in relation to their operator. */ for ($i = 0, $c = count($terms); $i < $c; $i++) { // Check if the term is followed by an operator that we understand. if (isset($terms[$i + 1]) && in_array($terms[$i + 1], $operators)) { // Get the operator mode. $op = array_search($terms[$i + 1], $operators); // Handle the AND operator. if ($op === 'AND' && isset($terms[$i + 2])) { // Tokenize the current term. $token = FinderIndexerHelper::tokenize($terms[$i], $lang, true); $token = $this->getTokenData($token); // Set the required flag. $token->required = true; // Add the current token to the stack. $this->included[] = $token; $this->highlight = array_merge($this->highlight, array_keys($token->matches)); // Skip the next token (the mode operator). $this->operators[] = $terms[$i + 1]; // Tokenize the term after the next term (current plus two). $other = FinderIndexerHelper::tokenize($terms[$i + 2], $lang, true); $other = $this->getTokenData($other); // Set the required flag. $other->required = true; // Add the token after the next token to the stack. $this->included[] = $other; $this->highlight = array_merge($this->highlight, array_keys($other->matches)); // Remove the processed phrases if possible. if (($pk = array_search($terms[$i], $phrases)) !== false) { unset($phrases[$pk]); } if (($pk = array_search($terms[$i + 2], $phrases)) !== false) { unset($phrases[$pk]); } // Remove the processed terms. unset($terms[$i]); unset($terms[$i + 1]); unset($terms[$i + 2]); // Adjust the loop. $i += 2; continue; } elseif ($op === 'OR' && isset($terms[$i + 2])) { // Tokenize the current term. $token = FinderIndexerHelper::tokenize($terms[$i], $lang, true); $token = $this->getTokenData($token); // Set the required flag. $token->required = false; // Add the current token to the stack. if (count($token->matches)) { $this->included[] = $token; $this->highlight = array_merge($this->highlight, array_keys($token->matches)); } else { $this->ignored[] = $token; } // Skip the next token (the mode operator). $this->operators[] = $terms[$i + 1]; // Tokenize the term after the next term (current plus two). $other = FinderIndexerHelper::tokenize($terms[$i + 2], $lang, true); $other = $this->getTokenData($other); // Set the required flag. $other->required = false; // Add the token after the next token to the stack. if (count($other->matches)) { $this->included[] = $other; $this->highlight = array_merge($this->highlight, array_keys($other->matches)); } else { $this->ignored[] = $other; } // Remove the processed phrases if possible. if (($pk = array_search($terms[$i], $phrases)) !== false) { unset($phrases[$pk]); } if (($pk = array_search($terms[$i + 2], $phrases)) !== false) { unset($phrases[$pk]); } // Remove the processed terms. unset($terms[$i]); unset($terms[$i + 1]); unset($terms[$i + 2]); // Adjust the loop. $i += 2; continue; } } elseif (isset($terms[$i + 1]) && array_search($terms[$i], $operators) === 'OR') { // Skip the next token (the mode operator). $this->operators[] = $terms[$i]; // Tokenize the next term (current plus one). $other = FinderIndexerHelper::tokenize($terms[$i + 1], $lang, true); $other = $this->getTokenData($other); // Set the required flag. $other->required = false; // Add the token after the next token to the stack. if (count($other->matches)) { $this->included[] = $other; $this->highlight = array_merge($this->highlight, array_keys($other->matches)); } else { $this->ignored[] = $other; } // Remove the processed phrase if possible. if (($pk = array_search($terms[$i + 1], $phrases)) !== false) { unset($phrases[$pk]); } // Remove the processed terms. unset($terms[$i]); unset($terms[$i + 1]); // Adjust the loop. $i += 1; continue; } elseif (isset($terms[$i + 1]) && array_search($terms[$i], $operators) === 'NOT') { // Skip the next token (the mode operator). $this->operators[] = $terms[$i]; // Tokenize the next term (current plus one). $other = FinderIndexerHelper::tokenize($terms[$i + 1], $lang, true); $other = $this->getTokenData($other); // Set the required flag. $other->required = false; // Add the next token to the stack. if (count($other->matches)) { $this->excluded[] = $other; } else { $this->ignored[] = $other; } // Remove the processed phrase if possible. if (($pk = array_search($terms[$i + 1], $phrases)) !== false) { unset($phrases[$pk]); } // Remove the processed terms. unset($terms[$i]); unset($terms[$i + 1]); // Adjust the loop. $i += 1; continue; } } /* * Iterate through any search phrases and tokenize them. We handle * phrases as autonomous units and do not break them down into two and * three word combinations. */ for ($i = 0, $c = count($phrases); $i < $c; $i++) { // Tokenize the phrase. $token = FinderIndexerHelper::tokenize($phrases[$i], $lang, true); $token = $this->getTokenData($token); // Set the required flag. $token->required = true; // Add the current token to the stack. $this->included[] = $token; $this->highlight = array_merge($this->highlight, array_keys($token->matches)); // Remove the processed term if possible. if (($pk = array_search($phrases[$i], $terms)) !== false) { unset($terms[$pk]); } // Remove the processed phrase. unset($phrases[$i]); } /* * Handle any remaining tokens using the standard processing mechanism. */ if (!empty($terms)) { // Tokenize the terms. $terms = implode(' ', $terms); $tokens = FinderIndexerHelper::tokenize($terms, $lang, false); // Make sure we are working with an array. $tokens = is_array($tokens) ? $tokens : array($tokens); // Get the token data and required state for all the tokens. foreach ($tokens as $token) { // Get the token data. $token = $this->getTokenData($token); // Set the required flag for the token. $token->required = $mode === 'AND' ? $token->phrase ? false : true : false; // Add the token to the appropriate stack. if (count($token->matches) || $token->required) { $this->included[] = $token; $this->highlight = array_merge($this->highlight, array_keys($token->matches)); } else { $this->ignored[] = $token; } } } return true; }
/** * Method to parse input, tokenize it, and then add it to the database. * * @param mixed $input String or resource to use as input. A resource * input will automatically be chunked to conserve * memory. Strings will be chunked if longer than * 2K in size. * @param integer $context The context of the input. See context constants. * @param string $lang The language of the input. * @param string $format The format of the input. * * @return integer The number of tokens extracted from the input. * * @since 2.5 */ protected function tokenizeToDb($input, $context, $lang, $format) { $count = 0; $buffer = null; if (!empty($input)) { // If the input is a resource, batch the process out. if (is_resource($input)) { // Batch the process out to avoid memory limits. while (!feof($input)) { // Read into the buffer. $buffer .= fread($input, 2048); /* * If we haven't reached the end of the file, seek to the last * space character and drop whatever is after that to make sure * we didn't truncate a term while reading the input. */ if (!feof($input)) { // Find the last space character. $ls = strrpos($buffer, ' '); // Adjust string based on the last space character. if ($ls) { // Truncate the string to the last space character. $string = substr($buffer, 0, $ls); // Adjust the buffer based on the last space for the next iteration and trim. $buffer = JString::trim(substr($buffer, $ls)); } else { $string = $buffer; } } else { $string = $buffer; } // Parse the input. $string = FinderIndexerHelper::parse($string, $format); // Check the input. if (empty($string)) { continue; } // Tokenize the input. $tokens = FinderIndexerHelper::tokenize($string, $lang); // Add the tokens to the database. $count += $this->addTokensToDb($tokens, $context); // Check if we're approaching the memory limit of the token table. if ($count > self::$state->options->get('memory_table_limit', 30000)) { $this->toggleTables(false); } unset($string); unset($tokens); } } elseif (strlen($input) > 2048) { $start = 0; $end = strlen($input); $chunk = 2048; /* * As it turns out, the complex regular expressions we use for * sanitizing input are not very efficient when given large * strings. It is much faster to process lots of short strings. */ while ($start < $end) { // Setup the string. $string = substr($input, $start, $chunk); // Find the last space character if we aren't at the end. $ls = $start + $chunk < $end ? strrpos($string, ' ') : false; // Truncate to the last space character. if ($ls !== false) { $string = substr($string, 0, $ls); } // Adjust the start position for the next iteration. $start += $ls !== false ? $ls + 1 - $chunk + $chunk : $chunk; // Parse the input. $string = FinderIndexerHelper::parse($string, $format); // Check the input. if (empty($string)) { continue; } // Tokenize the input. $tokens = FinderIndexerHelper::tokenize($string, $lang); // Add the tokens to the database. $count += $this->addTokensToDb($tokens, $context); // Check if we're approaching the memory limit of the token table. if ($count > self::$state->options->get('memory_table_limit', 30000)) { $this->toggleTables(false); } } } else { // Parse the input. $input = FinderIndexerHelper::parse($input, $format); // Check the input. if (empty($input)) { return $count; } // Tokenize the input. $tokens = FinderIndexerHelper::tokenize($input, $lang); // Add the tokens to the database. $count = $this->addTokensToDb($tokens, $context); } } return $count; }