Ejemplo n.º 1
0
Archivo: query.php Proyecto: 01J/topm
 /**
  * Method to process the query input string and extract required, optional,
  * and excluded tokens; taxonomy filters; and date filters.
  *
  * @param   string  $input  The query input string.
  * @param   string  $lang   The query input language.
  * @param   string  $mode   The query matching mode.
  *
  * @return  boolean  True on success.
  *
  * @since   2.5
  * @throws  Exception on database error.
  */
 protected function processString($input, $lang, $mode)
 {
     // Clean up the input string.
     $input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');
     $input = JString::strtolower($input);
     $input = preg_replace('#\\s+#mi', ' ', $input);
     $input = JString::trim($input);
     $debug = JFactory::getConfig()->get('debug_lang');
     /*
      * First, we need to handle string based modifiers. String based
      * modifiers could potentially include things like "category:blah" or
      * "before:2009-10-21" or "type:article", etc.
      */
     $patterns = array('before' => JText::_('COM_FINDER_FILTER_WHEN_BEFORE'), 'after' => JText::_('COM_FINDER_FILTER_WHEN_AFTER'));
     // Add the taxonomy branch titles to the possible patterns.
     foreach (FinderIndexerTaxonomy::getBranchTitles() as $branch) {
         // Add the pattern.
         $patterns[$branch] = JString::strtolower(JText::_(FinderHelperLanguage::branchSingular($branch)));
     }
     // Container for search terms and phrases.
     $terms = array();
     $phrases = array();
     // Cleared filter branches.
     $cleared = array();
     /*
      * Compile the suffix pattern. This is used to match the values of the
      * filter input string. Single words can be input directly, multi-word
      * values have to be wrapped in double quotes.
      */
     $quotes = html_entity_decode('‘’'', ENT_QUOTES, 'UTF-8');
     $suffix = '(([\\w\\d' . $quotes . '-]+)|\\"([\\w\\d\\s' . $quotes . '-]+)\\")';
     /*
      * Iterate through the possible filter patterns and search for matches.
      * We need to match the key, colon, and a value pattern for the match
      * to be valid.
      */
     foreach ($patterns as $modifier => $pattern) {
         $matches = array();
         if ($debug) {
             $pattern = substr($pattern, 2, -2);
         }
         // Check if the filter pattern is in the input string.
         if (preg_match('#' . $pattern . '\\s*:\\s*' . $suffix . '#mi', $input, $matches)) {
             // Get the value given to the modifier.
             $value = isset($matches[3]) ? $matches[3] : $matches[1];
             // Now we have to handle the filter string.
             switch ($modifier) {
                 // Handle a before and after date filters.
                 case 'before':
                 case 'after':
                     // Get the time offset.
                     $offset = JFactory::getApplication()->get('offset');
                     // Array of allowed when values.
                     $whens = array('before', 'after', 'exact');
                     // The value of 'today' is a special case that we need to handle.
                     if ($value === JString::strtolower(JText::_('COM_FINDER_QUERY_FILTER_TODAY'))) {
                         $today = JFactory::getDate('now', $offset);
                         $value = $today->format('%Y-%m-%d');
                     }
                     // Try to parse the date string.
                     $date = JFactory::getDate($value, $offset);
                     // Check if the date was parsed successfully.
                     if ($date->toUnix() !== null) {
                         // Set the date filter.
                         $this->date1 = $date->toSQL();
                         $this->when1 = in_array($modifier, $whens) ? $modifier : 'before';
                     }
                     break;
                     // Handle a taxonomy branch filter.
                 // Handle a taxonomy branch filter.
                 default:
                     // Try to find the node id.
                     $return = FinderIndexerTaxonomy::getNodeByTitle($modifier, $value);
                     // Check if the node id was found.
                     if ($return) {
                         // Check if the branch has been cleared.
                         if (!in_array($modifier, $cleared)) {
                             // Clear the branch.
                             $this->filters[$modifier] = array();
                             // Add the branch to the cleared list.
                             $cleared[] = $modifier;
                         }
                         // Add the filter to the list.
                         $this->filters[$modifier][$return->title] = (int) $return->id;
                     }
                     break;
             }
             // Clean up the input string again.
             $input = str_replace($matches[0], '', $input);
             $input = preg_replace('#\\s+#mi', ' ', $input);
             $input = JString::trim($input);
         }
     }
     /*
      * Extract the tokens enclosed in double quotes so that we can handle
      * them as phrases.
      */
     if (JString::strpos($input, '"') !== false) {
         $matches = array();
         // Extract the tokens enclosed in double quotes.
         if (preg_match_all('#\\"([^"]+)\\"#mi', $input, $matches)) {
             /*
              * One or more phrases were found so we need to iterate through
              * them, tokenize them as phrases, and remove them from the raw
              * input string before we move on to the next processing step.
              */
             foreach ($matches[1] as $key => $match) {
                 // Find the complete phrase in the input string.
                 $pos = JString::strpos($input, $matches[0][$key]);
                 $len = JString::strlen($matches[0][$key]);
                 // Add any terms that are before this phrase to the stack.
                 if (JString::trim(JString::substr($input, 0, $pos))) {
                     $terms = array_merge($terms, explode(' ', JString::trim(JString::substr($input, 0, $pos))));
                 }
                 // Strip out everything up to and including the phrase.
                 $input = JString::substr($input, $pos + $len);
                 // Clean up the input string again.
                 $input = preg_replace('#\\s+#mi', ' ', $input);
                 $input = JString::trim($input);
                 // Get the number of words in the phrase.
                 $parts = explode(' ', $match);
                 // Check if the phrase is longer than three words.
                 if (count($parts) > 3) {
                     /*
                      * If the phrase is longer than three words, we need to
                      * break it down into smaller chunks of phrases that
                      * are less than or equal to three words. We overlap
                      * the chunks so that we can ensure that a match is
                      * found for the complete phrase and not just portions
                      * of it.
                      */
                     for ($i = 0, $c = count($parts); $i < $c; $i += 2) {
                         // Set up the chunk.
                         $chunk = array();
                         // The chunk has to be assembled based on how many
                         // pieces are available to use.
                         switch ($c - $i) {
                             /*
                              * If only one word is left, we can break from
                              * the switch and loop because the last word
                              * was already used at the end of the last
                              * chunk.
                              */
                             case 1:
                                 break 2;
                                 // If there words are left, we use them both as
                                 // the last chunk of the phrase and we're done.
                             // If there words are left, we use them both as
                             // the last chunk of the phrase and we're done.
                             case 2:
                                 $chunk[] = $parts[$i];
                                 $chunk[] = $parts[$i + 1];
                                 break;
                                 // If there are three or more words left, we
                                 // build a three word chunk and continue on.
                             // If there are three or more words left, we
                             // build a three word chunk and continue on.
                             default:
                                 $chunk[] = $parts[$i];
                                 $chunk[] = $parts[$i + 1];
                                 $chunk[] = $parts[$i + 2];
                                 break;
                         }
                         // If the chunk is not empty, add it as a phrase.
                         if (count($chunk)) {
                             $phrases[] = implode(' ', $chunk);
                             $terms[] = implode(' ', $chunk);
                         }
                     }
                 } else {
                     // The phrase is <= 3 words so we can use it as is.
                     $phrases[] = $match;
                     $terms[] = $match;
                 }
             }
         }
     }
     // Add the remaining terms if present.
     if (!empty($input)) {
         $terms = array_merge($terms, explode(' ', $input));
     }
     // An array of our boolean operators. $operator => $translation
     $operators = array('AND' => JString::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_AND')), 'OR' => JString::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_OR')), 'NOT' => JString::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_NOT')));
     // If language debugging is enabled you need to ignore the debug strings in matching.
     if (JDEBUG) {
         $debugStrings = array('**', '??');
         $operators = str_replace($debugStrings, '', $operators);
     }
     /*
      * Iterate through the terms and perform any sorting that needs to be
      * done based on boolean search operators. Terms that are before an
      * and/or/not modifier have to be handled in relation to their operator.
      */
     for ($i = 0, $c = count($terms); $i < $c; $i++) {
         // Check if the term is followed by an operator that we understand.
         if (isset($terms[$i + 1]) && in_array($terms[$i + 1], $operators)) {
             // Get the operator mode.
             $op = array_search($terms[$i + 1], $operators);
             // Handle the AND operator.
             if ($op === 'AND' && isset($terms[$i + 2])) {
                 // Tokenize the current term.
                 $token = FinderIndexerHelper::tokenize($terms[$i], $lang, true);
                 $token = $this->getTokenData($token);
                 // Set the required flag.
                 $token->required = true;
                 // Add the current token to the stack.
                 $this->included[] = $token;
                 $this->highlight = array_merge($this->highlight, array_keys($token->matches));
                 // Skip the next token (the mode operator).
                 $this->operators[] = $terms[$i + 1];
                 // Tokenize the term after the next term (current plus two).
                 $other = FinderIndexerHelper::tokenize($terms[$i + 2], $lang, true);
                 $other = $this->getTokenData($other);
                 // Set the required flag.
                 $other->required = true;
                 // Add the token after the next token to the stack.
                 $this->included[] = $other;
                 $this->highlight = array_merge($this->highlight, array_keys($other->matches));
                 // Remove the processed phrases if possible.
                 if (($pk = array_search($terms[$i], $phrases)) !== false) {
                     unset($phrases[$pk]);
                 }
                 if (($pk = array_search($terms[$i + 2], $phrases)) !== false) {
                     unset($phrases[$pk]);
                 }
                 // Remove the processed terms.
                 unset($terms[$i]);
                 unset($terms[$i + 1]);
                 unset($terms[$i + 2]);
                 // Adjust the loop.
                 $i += 2;
                 continue;
             } elseif ($op === 'OR' && isset($terms[$i + 2])) {
                 // Tokenize the current term.
                 $token = FinderIndexerHelper::tokenize($terms[$i], $lang, true);
                 $token = $this->getTokenData($token);
                 // Set the required flag.
                 $token->required = false;
                 // Add the current token to the stack.
                 if (count($token->matches)) {
                     $this->included[] = $token;
                     $this->highlight = array_merge($this->highlight, array_keys($token->matches));
                 } else {
                     $this->ignored[] = $token;
                 }
                 // Skip the next token (the mode operator).
                 $this->operators[] = $terms[$i + 1];
                 // Tokenize the term after the next term (current plus two).
                 $other = FinderIndexerHelper::tokenize($terms[$i + 2], $lang, true);
                 $other = $this->getTokenData($other);
                 // Set the required flag.
                 $other->required = false;
                 // Add the token after the next token to the stack.
                 if (count($other->matches)) {
                     $this->included[] = $other;
                     $this->highlight = array_merge($this->highlight, array_keys($other->matches));
                 } else {
                     $this->ignored[] = $other;
                 }
                 // Remove the processed phrases if possible.
                 if (($pk = array_search($terms[$i], $phrases)) !== false) {
                     unset($phrases[$pk]);
                 }
                 if (($pk = array_search($terms[$i + 2], $phrases)) !== false) {
                     unset($phrases[$pk]);
                 }
                 // Remove the processed terms.
                 unset($terms[$i]);
                 unset($terms[$i + 1]);
                 unset($terms[$i + 2]);
                 // Adjust the loop.
                 $i += 2;
                 continue;
             }
         } elseif (isset($terms[$i + 1]) && array_search($terms[$i], $operators) === 'OR') {
             // Skip the next token (the mode operator).
             $this->operators[] = $terms[$i];
             // Tokenize the next term (current plus one).
             $other = FinderIndexerHelper::tokenize($terms[$i + 1], $lang, true);
             $other = $this->getTokenData($other);
             // Set the required flag.
             $other->required = false;
             // Add the token after the next token to the stack.
             if (count($other->matches)) {
                 $this->included[] = $other;
                 $this->highlight = array_merge($this->highlight, array_keys($other->matches));
             } else {
                 $this->ignored[] = $other;
             }
             // Remove the processed phrase if possible.
             if (($pk = array_search($terms[$i + 1], $phrases)) !== false) {
                 unset($phrases[$pk]);
             }
             // Remove the processed terms.
             unset($terms[$i]);
             unset($terms[$i + 1]);
             // Adjust the loop.
             $i += 1;
             continue;
         } elseif (isset($terms[$i + 1]) && array_search($terms[$i], $operators) === 'NOT') {
             // Skip the next token (the mode operator).
             $this->operators[] = $terms[$i];
             // Tokenize the next term (current plus one).
             $other = FinderIndexerHelper::tokenize($terms[$i + 1], $lang, true);
             $other = $this->getTokenData($other);
             // Set the required flag.
             $other->required = false;
             // Add the next token to the stack.
             if (count($other->matches)) {
                 $this->excluded[] = $other;
             } else {
                 $this->ignored[] = $other;
             }
             // Remove the processed phrase if possible.
             if (($pk = array_search($terms[$i + 1], $phrases)) !== false) {
                 unset($phrases[$pk]);
             }
             // Remove the processed terms.
             unset($terms[$i]);
             unset($terms[$i + 1]);
             // Adjust the loop.
             $i += 1;
             continue;
         }
     }
     /*
      * Iterate through any search phrases and tokenize them. We handle
      * phrases as autonomous units and do not break them down into two and
      * three word combinations.
      */
     for ($i = 0, $c = count($phrases); $i < $c; $i++) {
         // Tokenize the phrase.
         $token = FinderIndexerHelper::tokenize($phrases[$i], $lang, true);
         $token = $this->getTokenData($token);
         // Set the required flag.
         $token->required = true;
         // Add the current token to the stack.
         $this->included[] = $token;
         $this->highlight = array_merge($this->highlight, array_keys($token->matches));
         // Remove the processed term if possible.
         if (($pk = array_search($phrases[$i], $terms)) !== false) {
             unset($terms[$pk]);
         }
         // Remove the processed phrase.
         unset($phrases[$i]);
     }
     /*
      * Handle any remaining tokens using the standard processing mechanism.
      */
     if (!empty($terms)) {
         // Tokenize the terms.
         $terms = implode(' ', $terms);
         $tokens = FinderIndexerHelper::tokenize($terms, $lang, false);
         // Make sure we are working with an array.
         $tokens = is_array($tokens) ? $tokens : array($tokens);
         // Get the token data and required state for all the tokens.
         foreach ($tokens as $token) {
             // Get the token data.
             $token = $this->getTokenData($token);
             // Set the required flag for the token.
             $token->required = $mode === 'AND' ? $token->phrase ? false : true : false;
             // Add the token to the appropriate stack.
             if (count($token->matches) || $token->required) {
                 $this->included[] = $token;
                 $this->highlight = array_merge($this->highlight, array_keys($token->matches));
             } else {
                 $this->ignored[] = $token;
             }
         }
     }
     return true;
 }
Ejemplo n.º 2
0
 /**
  * Method to parse input, tokenize it, and then add it to the database.
  *
  * @param   mixed    $input    String or resource to use as input. A resource
  *                             input will automatically be chunked to conserve
  *                             memory. Strings will be chunked if longer than
  *                             2K in size.
  * @param   integer  $context  The context of the input. See context constants.
  * @param   string   $lang     The language of the input.
  * @param   string   $format   The format of the input.
  *
  * @return  integer  The number of tokens extracted from the input.
  *
  * @since   2.5
  */
 protected function tokenizeToDb($input, $context, $lang, $format)
 {
     $count = 0;
     $buffer = null;
     if (!empty($input)) {
         // If the input is a resource, batch the process out.
         if (is_resource($input)) {
             // Batch the process out to avoid memory limits.
             while (!feof($input)) {
                 // Read into the buffer.
                 $buffer .= fread($input, 2048);
                 /*
                  * If we haven't reached the end of the file, seek to the last
                  * space character and drop whatever is after that to make sure
                  * we didn't truncate a term while reading the input.
                  */
                 if (!feof($input)) {
                     // Find the last space character.
                     $ls = strrpos($buffer, ' ');
                     // Adjust string based on the last space character.
                     if ($ls) {
                         // Truncate the string to the last space character.
                         $string = substr($buffer, 0, $ls);
                         // Adjust the buffer based on the last space for the next iteration and trim.
                         $buffer = JString::trim(substr($buffer, $ls));
                     } else {
                         $string = $buffer;
                     }
                 } else {
                     $string = $buffer;
                 }
                 // Parse the input.
                 $string = FinderIndexerHelper::parse($string, $format);
                 // Check the input.
                 if (empty($string)) {
                     continue;
                 }
                 // Tokenize the input.
                 $tokens = FinderIndexerHelper::tokenize($string, $lang);
                 // Add the tokens to the database.
                 $count += $this->addTokensToDb($tokens, $context);
                 // Check if we're approaching the memory limit of the token table.
                 if ($count > self::$state->options->get('memory_table_limit', 30000)) {
                     $this->toggleTables(false);
                 }
                 unset($string);
                 unset($tokens);
             }
         } elseif (strlen($input) > 2048) {
             $start = 0;
             $end = strlen($input);
             $chunk = 2048;
             /*
              * As it turns out, the complex regular expressions we use for
              * sanitizing input are not very efficient when given large
              * strings. It is much faster to process lots of short strings.
              */
             while ($start < $end) {
                 // Setup the string.
                 $string = substr($input, $start, $chunk);
                 // Find the last space character if we aren't at the end.
                 $ls = $start + $chunk < $end ? strrpos($string, ' ') : false;
                 // Truncate to the last space character.
                 if ($ls !== false) {
                     $string = substr($string, 0, $ls);
                 }
                 // Adjust the start position for the next iteration.
                 $start += $ls !== false ? $ls + 1 - $chunk + $chunk : $chunk;
                 // Parse the input.
                 $string = FinderIndexerHelper::parse($string, $format);
                 // Check the input.
                 if (empty($string)) {
                     continue;
                 }
                 // Tokenize the input.
                 $tokens = FinderIndexerHelper::tokenize($string, $lang);
                 // Add the tokens to the database.
                 $count += $this->addTokensToDb($tokens, $context);
                 // Check if we're approaching the memory limit of the token table.
                 if ($count > self::$state->options->get('memory_table_limit', 30000)) {
                     $this->toggleTables(false);
                 }
             }
         } else {
             // Parse the input.
             $input = FinderIndexerHelper::parse($input, $format);
             // Check the input.
             if (empty($input)) {
                 return $count;
             }
             // Tokenize the input.
             $tokens = FinderIndexerHelper::tokenize($input, $lang);
             // Add the tokens to the database.
             $count = $this->addTokensToDb($tokens, $context);
         }
     }
     return $count;
 }