PHP FinderIndexerHelper::tokenize Exemples

Langage de programmation: PHP

Méthode/Fonction: tokenize

Exemples au hotexamples.com: 2

PHP FinderIndexerHelper::tokenize - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de FinderIndexerHelper::tokenize extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

getContentExtras(21)

getContentPath(21)

prepareContent(11)

stemmer(6)

getDefaultLanguage(5)

getPrimaryLanguage(5)

parse(3)

addContentType(2)

isCommon(2)

stem(2)

tokenize(2)

getCommonWords(1)

Méthodes fréquemment utilisées

getContentExtras (21)

getContentPath (21)

prepareContent (11)

stemmer (6)

getDefaultLanguage (5)

getPrimaryLanguage (5)

parse (3)

addContentType (2)

isCommon (2)

stem (2)

Méthodes fréquemment utilisées

tokenize (2)

getCommonWords (1)

Associées

triple

FusinvDB

fn_hw_action

Connection

unifyattributionline

HINT_BOL_Service

fla_clientes

die_query

Helper

print_filter_plugin_field

Related in langs

DiscoveryClientProtocol (C#)

HtmlReader (C#)

IsScripted (C++)

SDL_zerop (C++)

CleanupDatastore (Go)

NewInbox (Go)

InputKeyboardUtil (Java)

AvailableTableItemAction (Java)

update (Python)

solve_quals (Python)

Exemple #1

0

Afficher le fichier

Fichier : query.php Projet : 01J/topm

/** * Method to process the query input string and extract required, optional, * and excluded tokens; taxonomy filters; and date filters. * * @param string $input The query input string. * @param string $lang The query input language. * @param string $mode The query matching mode. * * @return boolean True on success. * * @since 2.5 * @throws Exception on database error. */ protected function processString($input, $lang, $mode) { // Clean up the input string. $input = html_entity_decode($input, ENT_QUOTES, 'UTF-8'); $input = JString::strtolower($input); $input = preg_replace('#\\s+#mi', ' ', $input); $input = JString::trim($input); $debug = JFactory::getConfig()->get('debug_lang'); /* * First, we need to handle string based modifiers. String based * modifiers could potentially include things like "category:blah" or * "before:2009-10-21" or "type:article", etc. */ $patterns = array('before' => JText::_('COM_FINDER_FILTER_WHEN_BEFORE'), 'after' => JText::_('COM_FINDER_FILTER_WHEN_AFTER')); // Add the taxonomy branch titles to the possible patterns. foreach (FinderIndexerTaxonomy::getBranchTitles() as $branch) { // Add the pattern. $patterns[$branch] = JString::strtolower(JText::_(FinderHelperLanguage::branchSingular($branch))); } // Container for search terms and phrases. $terms = array(); $phrases = array(); // Cleared filter branches. $cleared = array(); /* * Compile the suffix pattern. This is used to match the values of the * filter input string. Single words can be input directly, multi-word * values have to be wrapped in double quotes. */ $quotes = html_entity_decode('‘’'', ENT_QUOTES, 'UTF-8'); $suffix = '(([\\w\\d' . $quotes . '-]+)|\\"([\\w\\d\\s' . $quotes . '-]+)\\")'; /* * Iterate through the possible filter patterns and search for matches. * We need to match the key, colon, and a value pattern for the match * to be valid. */ foreach ($patterns as $modifier => $pattern) { $matches = array(); if ($debug) { $pattern = substr($pattern, 2, -2); } // Check if the filter pattern is in the input string. if (preg_match('#' . $pattern . '\\s*:\\s*' . $suffix . '#mi', $input, $matches)) { // Get the value given to the modifier. $value = isset($matches[3]) ? $matches[3] : $matches[1]; // Now we have to handle the filter string. switch ($modifier) { // Handle a before and after date filters. case 'before': case 'after': // Get the time offset. $offset = JFactory::getApplication()->get('offset'); // Array of allowed when values. $whens = array('before', 'after', 'exact'); // The value of 'today' is a special case that we need to handle. if ($value === JString::strtolower(JText::_('COM_FINDER_QUERY_FILTER_TODAY'))) { $today = JFactory::getDate('now', $offset); $value = $today->format('%Y-%m-%d'); } // Try to parse the date string. $date = JFactory::getDate($value, $offset); // Check if the date was parsed successfully. if ($date->toUnix() !== null) { // Set the date filter. $this->date1 = $date->toSQL(); $this->when1 = in_array($modifier, $whens) ? $modifier : 'before'; } break; // Handle a taxonomy branch filter. // Handle a taxonomy branch filter. default: // Try to find the node id. $return = FinderIndexerTaxonomy::getNodeByTitle($modifier, $value); // Check if the node id was found. if ($return) { // Check if the branch has been cleared. if (!in_array($modifier, $cleared)) { // Clear the branch. $this->filters[$modifier] = array(); // Add the branch to the cleared list. $cleared[] = $modifier; } // Add the filter to the list. $this->filters[$modifier][$return->title] = (int) $return->id; } break; } // Clean up the input string again. $input = str_replace($matches[0], '', $input); $input = preg_replace('#\\s+#mi', ' ', $input); $input = JString::trim($input); } } /* * Extract the tokens enclosed in double quotes so that we can handle * them as phrases. */ if (JString::strpos($input, '"') !== false) { $matches = array(); // Extract the tokens enclosed in double quotes. if (preg_match_all('#\\"([^"]+)\\"#mi', $input, $matches)) { /* * One or more phrases were found so we need to iterate through * them, tokenize them as phrases, and remove them from the raw * input string before we move on to the next processing step. */ foreach ($matches[1] as $key => $match) { // Find the complete phrase in the input string. $pos = JString::strpos($input, $matches[0][$key]); $len = JString::strlen($matches[0][$key]); // Add any terms that are before this phrase to the stack. if (JString::trim(JString::substr($input, 0, $pos))) { $terms = array_merge($terms, explode(' ', JString::trim(JString::substr($input, 0, $pos)))); } // Strip out everything up to and including the phrase. $input = JString::substr($input, $pos + $len); // Clean up the input string again. $input = preg_replace('#\\s+#mi', ' ', $input); $input = JString::trim($input); // Get the number of words in the phrase. $parts = explode(' ', $match); // Check if the phrase is longer than three words. if (count($parts) > 3) { /* * If the phrase is longer than three words, we need to * break it down into smaller chunks of phrases that * are less than or equal to three words. We overlap * the chunks so that we can ensure that a match is * found for the complete phrase and not just portions * of it. */ for ($i = 0, $c = count($parts); $i < $c; $i += 2) { // Set up the chunk. $chunk = array(); // The chunk has to be assembled based on how many // pieces are available to use. switch ($c - $i) { /* * If only one word is left, we can break from * the switch and loop because the last word * was already used at the end of the last * chunk. */ case 1: break 2; // If there words are left, we use them both as // the last chunk of the phrase and we're done. // If there words are left, we use them both as // the last chunk of the phrase and we're done. case 2: $chunk[] = $parts[$i]; $chunk[] = $parts[$i + 1]; break; // If there are three or more words left, we // build a three word chunk and continue on. // If there are three or more words left, we // build a three word chunk and continue on. default: $chunk[] = $parts[$i]; $chunk[] = $parts[$i + 1]; $chunk[] = $parts[$i + 2]; break; } // If the chunk is not empty, add it as a phrase. if (count($chunk)) { $phrases[] = implode(' ', $chunk); $terms[] = implode(' ', $chunk); } } } else { // The phrase is <= 3 words so we can use it as is. $phrases[] = $match; $terms[] = $match; } } } } // Add the remaining terms if present. if (!empty($input)) { $terms = array_merge($terms, explode(' ', $input)); } // An array of our boolean operators. $operator => $translation $operators = array('AND' => JString::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_AND')), 'OR' => JString::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_OR')), 'NOT' => JString::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_NOT'))); // If language debugging is enabled you need to ignore the debug strings in matching. if (JDEBUG) { $debugStrings = array('**', '??'); $operators = str_replace($debugStrings, '', $operators); } /* * Iterate through the terms and perform any sorting that needs to be * done based on boolean search operators. Terms that are before an * and/or/not modifier have to be handled in relation to their operator. */ for ($i = 0, $c = count($terms); $i < $c; $i++) { // Check if the term is followed by an operator that we understand. if (isset($terms[$i + 1]) && in_array($terms[$i + 1], $operators)) { // Get the operator mode. $op = array_search($terms[$i + 1], $operators); // Handle the AND operator. if ($op === 'AND' && isset($terms[$i + 2])) { // Tokenize the current term. $token = FinderIndexerHelper::tokenize($terms[$i], $lang, true); $token = $this->getTokenData($token); // Set the required flag. $token->required = true; // Add the current token to the stack. $this->included[] = $token; $this->highlight = array_merge($this->highlight, array_keys($token->matches)); // Skip the next token (the mode operator). $this->operators[] = $terms[$i + 1]; // Tokenize the term after the next term (current plus two). $other = FinderIndexerHelper::tokenize($terms[$i + 2], $lang, true); $other = $this->getTokenData($other); // Set the required flag. $other->required = true; // Add the token after the next token to the stack. $this->included[] = $other; $this->highlight = array_merge($this->highlight, array_keys($other->matches)); // Remove the processed phrases if possible. if (($pk = array_search($terms[$i], $phrases)) !== false) { unset($phrases[$pk]); } if (($pk = array_search($terms[$i + 2], $phrases)) !== false) { unset($phrases[$pk]); } // Remove the processed terms. unset($terms[$i]); unset($terms[$i + 1]); unset($terms[$i + 2]); // Adjust the loop. $i += 2; continue; } elseif ($op === 'OR' && isset($terms[$i + 2])) { // Tokenize the current term. $token = FinderIndexerHelper::tokenize($terms[$i], $lang, true); $token = $this->getTokenData($token); // Set the required flag. $token->required = false; // Add the current token to the stack. if (count($token->matches)) { $this->included[] = $token; $this->highlight = array_merge($this->highlight, array_keys($token->matches)); } else { $this->ignored[] = $token; } // Skip the next token (the mode operator). $this->operators[] = $terms[$i + 1]; // Tokenize the term after the next term (current plus two). $other = FinderIndexerHelper::tokenize($terms[$i + 2], $lang, true); $other = $this->getTokenData($other); // Set the required flag. $other->required = false; // Add the token after the next token to the stack. if (count($other->matches)) { $this->included[] = $other; $this->highlight = array_merge($this->highlight, array_keys($other->matches)); } else { $this->ignored[] = $other; } // Remove the processed phrases if possible. if (($pk = array_search($terms[$i], $phrases)) !== false) { unset($phrases[$pk]); } if (($pk = array_search($terms[$i + 2], $phrases)) !== false) { unset($phrases[$pk]); } // Remove the processed terms. unset($terms[$i]); unset($terms[$i + 1]); unset($terms[$i + 2]); // Adjust the loop. $i += 2; continue; } } elseif (isset($terms[$i + 1]) && array_search($terms[$i], $operators) === 'OR') { // Skip the next token (the mode operator). $this->operators[] = $terms[$i]; // Tokenize the next term (current plus one). $other = FinderIndexerHelper::tokenize($terms[$i + 1], $lang, true); $other = $this->getTokenData($other); // Set the required flag. $other->required = false; // Add the token after the next token to the stack. if (count($other->matches)) { $this->included[] = $other; $this->highlight = array_merge($this->highlight, array_keys($other->matches)); } else { $this->ignored[] = $other; } // Remove the processed phrase if possible. if (($pk = array_search($terms[$i + 1], $phrases)) !== false) { unset($phrases[$pk]); } // Remove the processed terms. unset($terms[$i]); unset($terms[$i + 1]); // Adjust the loop. $i += 1; continue; } elseif (isset($terms[$i + 1]) && array_search($terms[$i], $operators) === 'NOT') { // Skip the next token (the mode operator). $this->operators[] = $terms[$i]; // Tokenize the next term (current plus one). $other = FinderIndexerHelper::tokenize($terms[$i + 1], $lang, true); $other = $this->getTokenData($other); // Set the required flag. $other->required = false; // Add the next token to the stack. if (count($other->matches)) { $this->excluded[] = $other; } else { $this->ignored[] = $other; } // Remove the processed phrase if possible. if (($pk = array_search($terms[$i + 1], $phrases)) !== false) { unset($phrases[$pk]); } // Remove the processed terms. unset($terms[$i]); unset($terms[$i + 1]); // Adjust the loop. $i += 1; continue; } } /* * Iterate through any search phrases and tokenize them. We handle * phrases as autonomous units and do not break them down into two and * three word combinations. */ for ($i = 0, $c = count($phrases); $i < $c; $i++) { // Tokenize the phrase. $token = FinderIndexerHelper::tokenize($phrases[$i], $lang, true); $token = $this->getTokenData($token); // Set the required flag. $token->required = true; // Add the current token to the stack. $this->included[] = $token; $this->highlight = array_merge($this->highlight, array_keys($token->matches)); // Remove the processed term if possible. if (($pk = array_search($phrases[$i], $terms)) !== false) { unset($terms[$pk]); } // Remove the processed phrase. unset($phrases[$i]); } /* * Handle any remaining tokens using the standard processing mechanism. */ if (!empty($terms)) { // Tokenize the terms. $terms = implode(' ', $terms); $tokens = FinderIndexerHelper::tokenize($terms, $lang, false); // Make sure we are working with an array. $tokens = is_array($tokens) ? $tokens : array($tokens); // Get the token data and required state for all the tokens. foreach ($tokens as $token) { // Get the token data. $token = $this->getTokenData($token); // Set the required flag for the token. $token->required = $mode === 'AND' ? $token->phrase ? false : true : false; // Add the token to the appropriate stack. if (count($token->matches) || $token->required) { $this->included[] = $token; $this->highlight = array_merge($this->highlight, array_keys($token->matches)); } else { $this->ignored[] = $token; } } } return true; }

Exemple #2

0

Afficher le fichier

Fichier : indexer.php Projet : brenot/forumdesenvolvimento

/** * Method to parse input, tokenize it, and then add it to the database. * * @param mixed $input String or resource to use as input. A resource * input will automatically be chunked to conserve * memory. Strings will be chunked if longer than * 2K in size. * @param integer $context The context of the input. See context constants. * @param string $lang The language of the input. * @param string $format The format of the input. * * @return integer The number of tokens extracted from the input. * * @since 2.5 */ protected function tokenizeToDb($input, $context, $lang, $format) { $count = 0; $buffer = null; if (!empty($input)) { // If the input is a resource, batch the process out. if (is_resource($input)) { // Batch the process out to avoid memory limits. while (!feof($input)) { // Read into the buffer. $buffer .= fread($input, 2048); /* * If we haven't reached the end of the file, seek to the last * space character and drop whatever is after that to make sure * we didn't truncate a term while reading the input. */ if (!feof($input)) { // Find the last space character. $ls = strrpos($buffer, ' '); // Adjust string based on the last space character. if ($ls) { // Truncate the string to the last space character. $string = substr($buffer, 0, $ls); // Adjust the buffer based on the last space for the next iteration and trim. $buffer = JString::trim(substr($buffer, $ls)); } else { $string = $buffer; } } else { $string = $buffer; } // Parse the input. $string = FinderIndexerHelper::parse($string, $format); // Check the input. if (empty($string)) { continue; } // Tokenize the input. $tokens = FinderIndexerHelper::tokenize($string, $lang); // Add the tokens to the database. $count += $this->addTokensToDb($tokens, $context); // Check if we're approaching the memory limit of the token table. if ($count > self::$state->options->get('memory_table_limit', 30000)) { $this->toggleTables(false); } unset($string); unset($tokens); } } elseif (strlen($input) > 2048) { $start = 0; $end = strlen($input); $chunk = 2048; /* * As it turns out, the complex regular expressions we use for * sanitizing input are not very efficient when given large * strings. It is much faster to process lots of short strings. */ while ($start < $end) { // Setup the string. $string = substr($input, $start, $chunk); // Find the last space character if we aren't at the end. $ls = $start + $chunk < $end ? strrpos($string, ' ') : false; // Truncate to the last space character. if ($ls !== false) { $string = substr($string, 0, $ls); } // Adjust the start position for the next iteration. $start += $ls !== false ? $ls + 1 - $chunk + $chunk : $chunk; // Parse the input. $string = FinderIndexerHelper::parse($string, $format); // Check the input. if (empty($string)) { continue; } // Tokenize the input. $tokens = FinderIndexerHelper::tokenize($string, $lang); // Add the tokens to the database. $count += $this->addTokensToDb($tokens, $context); // Check if we're approaching the memory limit of the token table. if ($count > self::$state->options->get('memory_table_limit', 30000)) { $this->toggleTables(false); } } } else { // Parse the input. $input = FinderIndexerHelper::parse($input, $format); // Check the input. if (empty($input)) { return $count; } // Tokenize the input. $tokens = FinderIndexerHelper::tokenize($input, $lang); // Add the tokens to the database. $count = $this->addTokensToDb($tokens, $context); } } return $count; }