/**
 * Method to be called by another php script. Processes for XSS and
 * specified bad code.
 *
 * Every filter except ARRAY, RAW and the unknown-type fallback works element by element:
 * an array input yields a re-indexed array of cleaned values (original keys are not kept),
 * any other input yields a single cleaned value.
 *
 * @param   mixed   $source  Input string/array-of-string to be 'cleaned'
 * @param   string  $type    The return type for the variable (not case sensitive):
 *                           INT/INTEGER:  An integer, or an array of integers,
 *                           UINT:         An unsigned integer, or an array of unsigned integers,
 *                           FLOAT/DOUBLE: A floating point number, or an array of floating point numbers,
 *                           BOOL/BOOLEAN: A boolean value, or an array of boolean values,
 *                           WORD:         A string containing A-Z or underscores only (not case sensitive),
 *                           ALNUM:        A string containing A-Z or 0-9 only (not case sensitive),
 *                           CMD:          A string containing A-Z, 0-9, underscores, periods or hyphens (not case sensitive),
 *                                         with leading periods removed,
 *                           BASE64:       A string containing A-Z, 0-9, forward slashes, plus or equals (not case sensitive),
 *                           STRING:       A fully decoded and sanitised string (default),
 *                           HTML:         A sanitised string,
 *                           ARRAY:        An array,
 *                           PATH:         A sanitised file path, or an array of sanitised file paths,
 *                           TRIM:         A string trimmed from normal, non-breaking and multibyte spaces
 *                           USERNAME:     Do not use (use an application specific filter),
 *                           RAW:          The raw string is returned with no filtering,
 *                           unknown:      An unknown filter will act like STRING. If the input is an array it will return an
 *                                         array of fully decoded and sanitised strings (keys kept, non-string elements untouched).
 *
 * @return  mixed  'Cleaned' version of input parameter
 *
 * @since   11.1
 */
public function clean($source, $type = 'string')
{
    // Strip Unicode Supplementary Characters when requested to do so
    if ($this->stripUSC)
    {
        // Alternatively: preg_replace('/[\x{10000}-\x{10FFFF}]/u', "\xE2\xAF\x91", $source) but it'd be slower.
        $source = $this->stripUSC($source);
    }

    $type = strtoupper($type);

    // Element-wise filters run over a list either way; a scalar input is unwrapped again before returning.
    $isArray = is_array($source);
    $items   = $isArray ? $source : array($source);
    $result  = array();

    // Handle the type constraint cases
    switch ($type)
    {
        case 'INT':
        case 'INTEGER':
            foreach ($items as $item)
            {
                preg_match('/[-+]?[0-9]+/', (string) $item, $matches);
                $result[] = isset($matches[0]) ? (int) $matches[0] : 0;
            }
            break;

        case 'UINT':
            foreach ($items as $item)
            {
                preg_match('/[-+]?[0-9]+/', (string) $item, $matches);
                $result[] = isset($matches[0]) ? abs((int) $matches[0]) : 0;
            }
            break;

        case 'FLOAT':
        case 'DOUBLE':
            foreach ($items as $item)
            {
                preg_match('/[-+]?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?/', (string) $item, $matches);
                $result[] = isset($matches[0]) ? (float) $matches[0] : 0;
            }
            break;

        case 'BOOL':
        case 'BOOLEAN':
            foreach ($items as $item)
            {
                $result[] = (bool) $item;
            }
            break;

        case 'WORD':
        case 'ALNUM':
        case 'BASE64':
        case 'USERNAME':
            // These filters only differ in the set of characters they strip.
            $stripPatterns = array(
                'WORD'     => '/[^A-Z_]/i',
                'ALNUM'    => '/[^A-Z0-9]/i',
                'BASE64'   => '/[^A-Z0-9\\/+=]/i',
                'USERNAME' => '/[\\x00-\\x1F\\x7F<>"\'%&]/',
            );

            foreach ($items as $item)
            {
                $result[] = (string) preg_replace($stripPatterns[$type], '', $item);
            }
            break;

        case 'CMD':
            foreach ($items as $item)
            {
                $cleaned  = (string) preg_replace('/[^A-Z0-9_\\.-]/i', '', $item);
                $result[] = ltrim($cleaned, '.');
            }
            break;

        case 'STRING':
            foreach ($items as $item)
            {
                $result[] = (string) $this->remove($this->decode((string) $item));
            }
            break;

        case 'HTML':
            foreach ($items as $item)
            {
                $result[] = (string) $this->remove((string) $item);
            }
            break;

        case 'ARRAY':
            return (array) $source;

        case 'PATH':
            $pattern = '/^[A-Za-z0-9_\\/-]+[A-Za-z0-9_\\.-]*([\\\\\\/][A-Za-z0-9_-]+[A-Za-z0-9_\\.-]*)*$/';

            foreach ($items as $item)
            {
                // The cast is applied to scalar input too, matching the array handling.
                preg_match($pattern, (string) $item, $matches);
                $result[] = isset($matches[0]) ? (string) $matches[0] : '';
            }
            break;

        case 'TRIM':
            foreach ($items as $item)
            {
                $cleaned = (string) trim($item);

                // Bytes E3 80 80 are the UTF-8 encoding of U+3000 (ideographic space).
                $cleaned = StringHelper::trim($cleaned, chr(0xe3) . chr(0x80) . chr(0x80));

                // Bytes C2 A0 are the UTF-8 encoding of U+00A0 (non-breaking space).
                $result[] = StringHelper::trim($cleaned, chr(0xc2) . chr(0xa0));
            }
            break;

        case 'RAW':
            return $source;

        default:
            // Unknown filter: sanitise like STRING, using the same helpers the STRING case uses.
            if (is_array($source))
            {
                foreach ($source as $key => $value)
                {
                    // Filter element for XSS and other 'bad' code etc. Non-string elements are left alone.
                    if (is_string($value))
                    {
                        $source[$key] = $this->remove($this->decode($value));
                    }
                }

                return $source;
            }

            if (is_string($source) && !empty($source))
            {
                // Filter source for XSS and other 'bad' code etc.
                return $this->remove($this->decode($source));
            }

            // Not an array or a non-empty string... return the passed parameter
            return $source;
    }

    return $isArray ? $result : $result[0];
}
/**
 * Reduce a token to its base word.
 *
 * Apostrophes are trimmed from both ends of the token and anything from the
 * first remaining apostrophe onwards is dropped. The result is then handed to
 * {@link FinderIndexerHelper::$stemmer} when that property holds a
 * FinderIndexerStemmer instance; otherwise it is returned as is.
 *
 * @param   string  $token  The token to stem.
 * @param   string  $lang   The language of the token.
 *
 * @return  string  The root token.
 *
 * @since   2.5
 */
public static function stem($token, $lang)
{
    $apostrophe = '\'';

    // Drop apostrophes surrounding the token.
    $word = StringHelper::trim($token, $apostrophe);

    // Keep only the part in front of the first inner apostrophe, if there is one.
    $cut = StringHelper::strpos($word, $apostrophe);

    if ($cut !== false)
    {
        $word = StringHelper::substr($word, 0, $cut);
    }

    // Without a usable stemmer the cleaned token is the answer.
    if (!(static::$stemmer instanceof FinderIndexerStemmer))
    {
        return $word;
    }

    return static::$stemmer->stem($word, $lang);
}
/**
 * Method to process the query input string and extract required, optional,
 * and excluded tokens; taxonomy filters; and date filters.
 *
 * Processing order: normalise the input, consume "label:value" filter modifiers,
 * pull out double-quoted phrases, apply the and/or/not operators to neighbouring
 * terms, tokenize the remaining phrases, and finally tokenize whatever terms are left.
 * Results are written to $this->filters, $this->date1/$this->when1, $this->included,
 * $this->excluded, $this->ignored, $this->operators and $this->highlight.
 *
 * @param   string  $input  The query input string.
 * @param   string  $lang   The query input language.
 * @param   string  $mode   The query matching mode. 'AND' makes the leftover non-phrase tokens required.
 *
 * @return  boolean  True on success.
 *
 * @since   2.5
 * @throws  Exception on database error.
 */
protected function processString($input, $lang, $mode)
{
    // Clean up the input string.
    $input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');
    $input = StringHelper::strtolower($input);
    $input = preg_replace('#\\s+#mi', ' ', $input);
    $input = StringHelper::trim($input);
    $debug = JFactory::getConfig()->get('debug_lang');

    /*
     * First, we need to handle string based modifiers. String based
     * modifiers could potentially include things like "category:blah" or
     * "before:2009-10-21" or "type:article", etc.
     */
    $patterns = array(
        'before' => JText::_('COM_FINDER_FILTER_WHEN_BEFORE'),
        'after'  => JText::_('COM_FINDER_FILTER_WHEN_AFTER'),
    );

    // Add the taxonomy branch titles to the possible patterns.
    foreach (FinderIndexerTaxonomy::getBranchTitles() as $branch)
    {
        $patterns[$branch] = StringHelper::strtolower(JText::_(FinderHelperLanguage::branchSingular($branch)));
    }

    // Container for search terms and phrases.
    $terms   = array();
    $phrases = array();

    // Cleared filter branches.
    $cleared = array();

    /*
     * Compile the suffix pattern. This is used to match the values of the
     * filter input string. Single words can be input directly, multi-word
     * values have to be wrapped in double quotes.
     *
     * The quote characters are written as numeric entities (left/right single
     * quotation mark and apostrophe) so the literal stays valid PHP.
     */
    $quotes = html_entity_decode('&#8216;&#8217;&#39;', ENT_QUOTES, 'UTF-8');
    $suffix = '(([\\w\\d' . $quotes . '-]+)|\\"([\\w\\d\\s' . $quotes . '-]+)\\")';

    /*
     * Iterate through the possible filter patterns and search for matches.
     * We need to match the key, colon, and a value pattern for the match
     * to be valid.
     */
    foreach ($patterns as $modifier => $pattern)
    {
        $matches = array();

        // With language debugging on, drop the two marker characters wrapped around each translated label.
        if ($debug)
        {
            $pattern = substr($pattern, 2, -2);
        }

        // The label is translated text, not a regular expression: quote it before embedding it.
        $pattern = preg_quote($pattern, '#');

        // Check if the filter pattern is in the input string.
        if (!preg_match('#' . $pattern . '\\s*:\\s*' . $suffix . '#mi', $input, $matches))
        {
            continue;
        }

        // Get the value given to the modifier: group 3 is the quoted form, group 1 the whole value.
        $value = isset($matches[3]) ? $matches[3] : $matches[1];

        // Now we have to handle the filter string.
        switch ($modifier)
        {
            // Handle a before and after date filters.
            case 'before':
            case 'after':
                // Get the time offset.
                $offset = JFactory::getApplication()->get('offset');

                // The value of 'today' is a special case that we need to handle.
                if ($value === StringHelper::strtolower(JText::_('COM_FINDER_QUERY_FILTER_TODAY')))
                {
                    // NOTE(review): '%Y-%m-%d' looks like a strftime-style pattern; if format() on the
                    // returned date object takes date()-style characters this should probably be 'Y-m-d' - confirm.
                    $value = JFactory::getDate('now', $offset)->format('%Y-%m-%d');
                }

                // Try to parse the date string.
                $date = JFactory::getDate($value, $offset);

                // Check if the date was parsed successfully.
                if ($date->toUnix() !== null)
                {
                    // Set the date filter. The modifier is 'before' or 'after' in this branch.
                    $this->date1 = $date->toSql();
                    $this->when1 = $modifier;
                }
                break;

            // Handle a taxonomy branch filter.
            default:
                // Try to find the node id.
                $return = FinderIndexerTaxonomy::getNodeByTitle($modifier, $value);

                // Check if the node id was found.
                if ($return)
                {
                    // Reset the branch the first time a filter for it is seen in this query.
                    if (!in_array($modifier, $cleared, true))
                    {
                        $this->filters[$modifier] = array();
                        $cleared[] = $modifier;
                    }

                    // Add the filter to the list.
                    $this->filters[$modifier][$return->title] = (int) $return->id;
                }
                break;
        }

        // Remove the consumed modifier and clean up the input string again.
        $input = str_replace($matches[0], '', $input);
        $input = preg_replace('#\\s+#mi', ' ', $input);
        $input = StringHelper::trim($input);
    }

    /*
     * Extract the tokens enclosed in double quotes so that we can handle
     * them as phrases.
     */
    if (StringHelper::strpos($input, '"') !== false)
    {
        $matches = array();

        // Extract the tokens enclosed in double quotes.
        if (preg_match_all('#\\"([^"]+)\\"#mi', $input, $matches))
        {
            /*
             * One or more phrases were found so we need to iterate through
             * them, tokenize them as phrases, and remove them from the raw
             * input string before we move on to the next processing step.
             */
            foreach ($matches[1] as $key => $match)
            {
                // Find the complete phrase in the input string.
                $pos = StringHelper::strpos($input, $matches[0][$key]);
                $len = StringHelper::strlen($matches[0][$key]);

                // Add any terms that are before this phrase to the stack.
                $leading = StringHelper::trim(StringHelper::substr($input, 0, $pos));

                if ($leading)
                {
                    $terms = array_merge($terms, explode(' ', $leading));
                }

                // Strip out everything up to and including the phrase.
                $input = StringHelper::substr($input, $pos + $len);

                // Clean up the input string again.
                $input = preg_replace('#\\s+#mi', ' ', $input);
                $input = StringHelper::trim($input);

                // Get the words in the phrase.
                $parts = explode(' ', $match);
                $c     = count($parts);

                if ($c <= 3)
                {
                    // The phrase is <= 3 words so we can use it as is.
                    $phrases[] = $match;
                    $terms[]   = $match;

                    continue;
                }

                /*
                 * If the phrase is longer than three words, we need to
                 * break it down into smaller chunks of phrases that
                 * are less than or equal to three words. We overlap
                 * the chunks (step of two, width of three) so that we
                 * can ensure that a match is found for the complete
                 * phrase and not just portions of it.
                 *
                 * A single leftover word is skipped because it was
                 * already the last word of the previous chunk; two
                 * leftover words form the final chunk.
                 */
                for ($i = 0; $i + 1 < $c; $i += 2)
                {
                    $chunk = implode(' ', array_slice($parts, $i, 3));

                    $phrases[] = $chunk;
                    $terms[]   = $chunk;
                }
            }
        }
    }

    // Add the remaining terms if present.
    if (!empty($input))
    {
        $terms = array_merge($terms, explode(' ', $input));
    }

    // An array of our boolean operators. $operator => $translation
    $operators = array(
        'AND' => StringHelper::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_AND')),
        'OR'  => StringHelper::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_OR')),
        'NOT' => StringHelper::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_NOT')),
    );

    // If language debugging is enabled you need to ignore the debug strings in matching.
    // The language-debug setting read above is honoured as well as the JDEBUG constant.
    if ($debug || JDEBUG)
    {
        $debugStrings = array('**', '??');
        $operators    = str_replace($debugStrings, '', $operators);
    }

    /*
     * Iterate through the terms and perform any sorting that needs to be
     * done based on boolean search operators. Terms that are before an
     * and/or/not modifier have to be handled in relation to their operator.
     */
    for ($i = 0, $c = count($terms); $i < $c; $i++)
    {
        // Check if the term is followed by an operator that we understand.
        if (isset($terms[$i + 1]) && in_array($terms[$i + 1], $operators, true))
        {
            // Get the operator mode.
            $op = array_search($terms[$i + 1], $operators, true);

            // Handle the AND operator.
            if ($op === 'AND' && isset($terms[$i + 2]))
            {
                // Tokenize the current term.
                $token = FinderIndexerHelper::tokenize($terms[$i], $lang, true);
                $token = $this->getTokenData($token);

                // Set the required flag.
                $token->required = true;

                // Add the current token to the stack.
                $this->included[] = $token;
                $this->highlight  = array_merge($this->highlight, array_keys($token->matches));

                // Record the operator (the term in between the two operands).
                $this->operators[] = $terms[$i + 1];

                // Tokenize the term after the next term (current plus two).
                $other = FinderIndexerHelper::tokenize($terms[$i + 2], $lang, true);
                $other = $this->getTokenData($other);

                // Set the required flag.
                $other->required = true;

                // Add the token after the next token to the stack.
                $this->included[] = $other;
                $this->highlight  = array_merge($this->highlight, array_keys($other->matches));

                // Remove the processed phrases if possible.
                if (($pk = array_search($terms[$i], $phrases, true)) !== false)
                {
                    unset($phrases[$pk]);
                }

                if (($pk = array_search($terms[$i + 2], $phrases, true)) !== false)
                {
                    unset($phrases[$pk]);
                }

                // Remove the processed terms.
                unset($terms[$i], $terms[$i + 1], $terms[$i + 2]);

                // Adjust the loop: both the operator and the second operand are consumed.
                $i += 2;

                continue;
            }
            elseif ($op === 'OR' && isset($terms[$i + 2]))
            {
                // Tokenize the current term.
                $token = FinderIndexerHelper::tokenize($terms[$i], $lang, true);
                $token = $this->getTokenData($token);

                // Set the required flag.
                $token->required = false;

                // Add the current token to the stack.
                if (count($token->matches))
                {
                    $this->included[] = $token;
                    $this->highlight  = array_merge($this->highlight, array_keys($token->matches));
                }
                else
                {
                    $this->ignored[] = $token;
                }

                // Record the operator (the term in between the two operands).
                $this->operators[] = $terms[$i + 1];

                // Tokenize the term after the next term (current plus two).
                $other = FinderIndexerHelper::tokenize($terms[$i + 2], $lang, true);
                $other = $this->getTokenData($other);

                // Set the required flag.
                $other->required = false;

                // Add the token after the next token to the stack.
                if (count($other->matches))
                {
                    $this->included[] = $other;
                    $this->highlight  = array_merge($this->highlight, array_keys($other->matches));
                }
                else
                {
                    $this->ignored[] = $other;
                }

                // Remove the processed phrases if possible.
                if (($pk = array_search($terms[$i], $phrases, true)) !== false)
                {
                    unset($phrases[$pk]);
                }

                if (($pk = array_search($terms[$i + 2], $phrases, true)) !== false)
                {
                    unset($phrases[$pk]);
                }

                // Remove the processed terms.
                unset($terms[$i], $terms[$i + 1], $terms[$i + 2]);

                // Adjust the loop: both the operator and the second operand are consumed.
                $i += 2;

                continue;
            }
        }
        elseif (isset($terms[$i + 1]) && array_search($terms[$i], $operators, true) === 'OR')
        {
            // The current term is a leading OR operator: record it.
            $this->operators[] = $terms[$i];

            // Tokenize the next term (current plus one).
            $other = FinderIndexerHelper::tokenize($terms[$i + 1], $lang, true);
            $other = $this->getTokenData($other);

            // Set the required flag.
            $other->required = false;

            // Add the next token to the stack.
            if (count($other->matches))
            {
                $this->included[] = $other;
                $this->highlight  = array_merge($this->highlight, array_keys($other->matches));
            }
            else
            {
                $this->ignored[] = $other;
            }

            // Remove the processed phrase if possible.
            if (($pk = array_search($terms[$i + 1], $phrases, true)) !== false)
            {
                unset($phrases[$pk]);
            }

            // Remove the processed terms.
            unset($terms[$i], $terms[$i + 1]);

            // Adjust the loop.
            $i++;

            continue;
        }
        elseif (isset($terms[$i + 1]) && array_search($terms[$i], $operators, true) === 'NOT')
        {
            // The current term is a NOT operator: record it.
            $this->operators[] = $terms[$i];

            // Tokenize the next term (current plus one).
            $other = FinderIndexerHelper::tokenize($terms[$i + 1], $lang, true);
            $other = $this->getTokenData($other);

            // Set the required flag.
            $other->required = false;

            // Add the next token to the stack.
            if (count($other->matches))
            {
                $this->excluded[] = $other;
            }
            else
            {
                $this->ignored[] = $other;
            }

            // Remove the processed phrase if possible.
            if (($pk = array_search($terms[$i + 1], $phrases, true)) !== false)
            {
                unset($phrases[$pk]);
            }

            // Remove the processed terms.
            unset($terms[$i], $terms[$i + 1]);

            // Adjust the loop.
            $i++;

            continue;
        }
    }

    /*
     * Iterate through any search phrases and tokenize them. We handle
     * phrases as autonomous units and do not break them down into two and
     * three word combinations.
     *
     * The operator pass above may have unset entries, so the list can have
     * gaps in its keys; iterate over what is actually left instead of
     * counting from zero.
     */
    foreach ($phrases as $phrase)
    {
        // Tokenize the phrase.
        $token = FinderIndexerHelper::tokenize($phrase, $lang, true);
        $token = $this->getTokenData($token);

        // Set the required flag.
        $token->required = true;

        // Add the current token to the stack.
        $this->included[] = $token;
        $this->highlight  = array_merge($this->highlight, array_keys($token->matches));

        // Remove the processed term if possible.
        if (($pk = array_search($phrase, $terms, true)) !== false)
        {
            unset($terms[$pk]);
        }
    }

    // Every phrase has been processed.
    $phrases = array();

    /*
     * Handle any remaining tokens using the standard processing mechanism.
     */
    if (!empty($terms))
    {
        // Tokenize the terms.
        $terms  = implode(' ', $terms);
        $tokens = FinderIndexerHelper::tokenize($terms, $lang, false);

        // Make sure we are working with an array.
        $tokens = is_array($tokens) ? $tokens : array($tokens);

        // Get the token data and required state for all the tokens.
        foreach ($tokens as $token)
        {
            // Get the token data.
            $token = $this->getTokenData($token);

            // In AND mode every token that is not a phrase is required; otherwise nothing is.
            $token->required = ($mode === 'AND' && !$token->phrase);

            // Add the token to the appropriate stack.
            if (count($token->matches) || $token->required)
            {
                $this->included[] = $token;
                $this->highlight  = array_merge($this->highlight, array_keys($token->matches));
            }
            else
            {
                $this->ignored[] = $token;
            }
        }
    }

    return true;
}
/**
 * Truncates text blocks over the specified character limit and closes
 * all open HTML tags. The method will optionally not truncate an individual
 * word, it will find the first space that is within the limit and
 * truncate at that point. This method is UTF-8 safe.
 *
 * Truncated output ends in '...'. When nothing sensible can be kept (a lone
 * or unterminated opening tag, or a first word longer than the limit while
 * $noSplit is on) only '...' is returned.
 *
 * @param   string   $text       The text to truncate.
 * @param   integer  $length     The maximum length of the text. 0 (or less) disables truncation.
 * @param   boolean  $noSplit    Don't split a word if that is where the cutoff occurs (default: true).
 * @param   boolean  $allowHtml  Allow HTML tags in the output, and close any open tags (default: true).
 *                               When false, tags are stripped and entities decoded before truncating.
 *
 * @return  string  The truncated text.
 *
 * @since   1.6
 */
public static function truncate($text, $length = 0, $noSplit = true, $allowHtml = true)
{
    // Assume a lone open tag is invalid HTML.
    if ($length == 1 && substr($text, 0, 1) == '<')
    {
        return '...';
    }

    // Check if HTML tags are allowed.
    if (!$allowHtml)
    {
        // UTF-8 bytes of U+00A0 (non-breaking space).
        $nbsp = "\xC2\xA0";

        // Deal with spacing issues in the input: make sure a tag is followed by a space and
        // turn non-breaking spaces (entity forms and the raw character) into plain spaces.
        $text = str_replace('>', '> ', $text);
        $text = str_replace(array('&nbsp;', '&#160;', $nbsp), ' ', $text);
        $text = StringHelper::trim(preg_replace('#\\s+#mui', ' ', $text));

        // Strip the tags from the input and decode entities.
        $text = strip_tags($text);
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        // Remove remaining extra spaces, including non-breaking spaces exposed by the decoding.
        $text = str_replace(array('&nbsp;', $nbsp), ' ', $text);
        $text = StringHelper::trim(preg_replace('#\\s+#mui', ' ', $text));
    }

    // Whether or not allowing HTML, truncate the item text if it is too long.
    if ($length > 0 && StringHelper::strlen($text) > $length)
    {
        $tmp = trim(StringHelper::substr($text, 0, $length));

        // The cut landed inside the very first tag: nothing usable is left.
        if (substr($tmp, 0, 1) == '<' && strpos($tmp, '>') === false)
        {
            return '...';
        }

        // $noSplit true means that we do not allow splitting of words.
        if ($noSplit)
        {
            // Find the position of the last space within the allowed length.
            $offset = StringHelper::strrpos($tmp, ' ');
            $tmp    = StringHelper::substr($tmp, 0, $offset + 1);

            // If there are no spaces and the string is longer than the maximum
            // we need to just use the ellipsis. In that case we are done.
            if ($offset === false && strlen($text) > $length)
            {
                return '...';
            }

            // Leave room for the three characters of the ellipsis.
            if (StringHelper::strlen($tmp) > $length - 3)
            {
                $tmp = trim(StringHelper::substr($tmp, 0, StringHelper::strrpos($tmp, ' ')));
            }
        }

        if ($allowHtml)
        {
            // Put all opened tags into an array
            preg_match_all("#<([a-z][a-z0-9]*)\\b.*?(?!/)>#i", $tmp, $result);
            $openedTags = $result[1];

            // Some tags self close so they do not need a separate close tag.
            $openedTags = array_diff($openedTags, array("img", "hr", "br"));
            $openedTags = array_values($openedTags);

            // Put all closed tags into an array
            preg_match_all("#</([a-z][a-z0-9]*)\\b(?:[^>]*?)>#iU", $tmp, $result);
            $closedTags = $result[1];

            $numOpened = count($openedTags);

            // All tags are closed so trim the text and finish.
            if (count($closedTags) == $numOpened)
            {
                return trim($tmp) . '...';
            }

            // Closing tags need to be in the reverse order of opening tags.
            $openedTags = array_reverse($openedTags);

            // Close tags
            for ($i = 0; $i < $numOpened; $i++)
            {
                if (!in_array($openedTags[$i], $closedTags))
                {
                    $tmp .= "</" . $openedTags[$i] . ">";
                }
                else
                {
                    // This opening tag is paired with an existing closing tag; use that closing tag up.
                    unset($closedTags[array_search($openedTags[$i], $closedTags)]);
                }
            }
        }

        if ($tmp === false || strlen($text) > strlen($tmp))
        {
            $text = trim($tmp) . '...';
        }
    }

    // Clean up any internal spaces created by the processing.
    $text = str_replace(' </', '</', $text);
    $text = str_replace(' ...', '...', $text);

    return $text;
}
/**
 * Method to parse input, tokenize it, and then add it to the database.
 *
 * Input is processed in pieces of roughly 2048 bytes, preferably split at a
 * space so that terms are not cut in half. After each piece the running token
 * count is compared with the 'memory_table_limit' option (default 30000) and
 * $this->toggleTables(false) is called once the count exceeds it.
 *
 * @param   mixed    $input    String or resource to use as input. A resource input will automatically be chunked to conserve
 *                             memory. Strings will be chunked if longer than 2K in size.
 * @param   integer  $context  The context of the input. See context constants.
 * @param   string   $lang     The language of the input.
 * @param   string   $format   The format of the input.
 *
 * @return  integer  The number of tokens extracted from the input.
 *
 * @since   2.5
 */
protected function tokenizeToDb($input, $context, $lang, $format)
{
    $count  = 0;
    $buffer = null;

    if (empty($input))
    {
        return $count;
    }

    // If the input is a resource, batch the process out.
    if (is_resource($input))
    {
        // Batch the process out to avoid memory limits.
        while (!feof($input))
        {
            // Read into the buffer.
            $buffer .= fread($input, 2048);

            /*
             * If we haven't reached the end of the file, seek to the last
             * space character and drop whatever is after that to make sure
             * we didn't truncate a term while reading the input.
             */
            if (!feof($input))
            {
                // Find the last space character.
                $ls = strrpos($buffer, ' ');

                // Adjust string based on the last space character.
                if ($ls)
                {
                    // Truncate the string to the last space character.
                    $string = substr($buffer, 0, $ls);

                    // Carry the remainder over to the next iteration.
                    $buffer = StringHelper::trim(substr($buffer, $ls));
                }
                else
                {
                    // No usable break point: process the whole buffer now and empty it,
                    // otherwise the same bytes would be tokenized again on the next pass.
                    $string = $buffer;
                    $buffer = '';
                }
            }
            else
            {
                $string = $buffer;
            }

            // Parse the input.
            $string = FinderIndexerHelper::parse($string, $format);

            // Check the input.
            if (empty($string))
            {
                continue;
            }

            // Tokenize the input.
            $tokens = FinderIndexerHelper::tokenize($string, $lang);

            // Add the tokens to the database.
            $count += $this->addTokensToDb($tokens, $context);

            // Check if we're approaching the memory limit of the token table.
            if ($count > static::$state->options->get('memory_table_limit', 30000))
            {
                $this->toggleTables(false);
            }

            unset($string, $tokens);
        }
    }
    elseif (strlen($input) > 2048)
    {
        $start = 0;
        $end   = strlen($input);
        $chunk = 2048;

        /*
         * As it turns out, the complex regular expressions we use for
         * sanitizing input are not very efficient when given large
         * strings. It is much faster to process lots of short strings.
         */
        while ($start < $end)
        {
            // Setup the string.
            $string = substr($input, $start, $chunk);

            // Find the last space character if we aren't at the end.
            $ls = $start + $chunk < $end ? strrpos($string, ' ') : false;

            // Truncate to the last space character.
            if ($ls !== false)
            {
                $string = substr($string, 0, $ls);
            }

            // Adjust the start position for the next iteration: continue right after
            // the space we cut at, or after the whole chunk when there was none.
            $start += $ls !== false ? $ls + 1 : $chunk;

            // Parse the input.
            $string = FinderIndexerHelper::parse($string, $format);

            // Check the input.
            if (empty($string))
            {
                continue;
            }

            // Tokenize the input.
            $tokens = FinderIndexerHelper::tokenize($string, $lang);

            // Add the tokens to the database.
            $count += $this->addTokensToDb($tokens, $context);

            // Check if we're approaching the memory limit of the token table.
            if ($count > static::$state->options->get('memory_table_limit', 30000))
            {
                $this->toggleTables(false);
            }
        }
    }
    else
    {
        // Short input: handle it in one go.
        $input = FinderIndexerHelper::parse($input, $format);

        // Check the input.
        if (empty($input))
        {
            return $count;
        }

        // Tokenize the input.
        $tokens = FinderIndexerHelper::tokenize($input, $lang);

        // Add the tokens to the database.
        $count = $this->addTokensToDb($tokens, $context);
    }

    return $count;
}
/**
 * Import states that are based on not official states data,
 * and which are not connected to locations IDs.
 *
 * For every <city> element a "name = ... AND country_code = 'US'" condition is
 * built and grouped by the element's state code; one UPDATE per state code then
 * writes that code to all matching rows of #__crowdf_locations.
 *
 * @param   SimpleXMLElement  $content  XML whose <city> children provide <name> and <state_code>.
 *
 * @return  void
 *
 * @todo    remove this in next major version.
 */
protected function importUnofficialStates($content)
{
    // Nothing to import.
    if (empty($content))
    {
        return;
    }

    $conditionsByState = array();
    $db                = $this->getDbo();

    foreach ($content->city as $city)
    {
        // Skip entries without a usable (ASCII) name.
        $cityName = StringHelper::trim($city->name);

        if (!$cityName)
        {
            continue;
        }

        $stateCode = StringHelper::trim($city->state_code);

        $nameMatch    = $db->quoteName('name') . '=' . $db->quote($cityName);
        $countryMatch = $db->quoteName('country_code') . '=' . $db->quote('US');

        $conditionsByState[$stateCode][] = '(' . $nameMatch . ' AND ' . $countryMatch . ')';
    }

    // One UPDATE statement per state code, matching any of its cities.
    foreach ($conditionsByState as $stateCode => $conditions)
    {
        $assignment = $db->quoteName('state_code') . ' = ' . $db->quote($stateCode);
        $anyCity    = implode(' OR ', $conditions);

        $query = $db->getQuery(true);
        $query
            ->update('#__crowdf_locations')
            ->set($assignment)
            ->where($anyCity);

        $db->setQuery($query);
        $db->execute();
    }
}