示例#1
0
 /**
  * Method to be called by another php script. Processes for XSS and
  * specified bad code.
  *
  * @param   mixed   $source  Input string/array-of-string to be 'cleaned'
  * @param   string  $type    The return type for the variable:
  *                           INT:       An integer, or an array of integers,
  *                           UINT:      An unsigned integer, or an array of unsigned integers,
  *                           FLOAT:     A floating point number, or an array of floating point numbers,
  *                           BOOLEAN:   A boolean value,
  *                           WORD:      A string containing A-Z or underscores only (not case sensitive),
  *                           ALNUM:     A string containing A-Z or 0-9 only (not case sensitive),
  *                           CMD:       A string containing A-Z, 0-9, underscores, periods or hyphens (not case sensitive),
  *                           BASE64:    A string containing A-Z, 0-9, forward slashes, plus or equals (not case sensitive),
  *                           STRING:    A fully decoded and sanitised string (default),
  *                           HTML:      A sanitised string,
  *                           ARRAY:     An array,
  *                           PATH:      A sanitised file path, or an array of sanitised file paths,
  *                           TRIM:      A string trimmed from normal, non-breaking and multibyte spaces,
  *                           USERNAME:  Do not use (use an application specific filter),
  *                           RAW:       The raw string is returned with no filtering,
  *                           unknown:   An unknown filter will act like STRING. If the input is an array it will return an
  *                                      array of fully decoded and sanitised strings.
  *
  * @return  mixed  'Cleaned' version of input parameter
  *
  * @since   11.1
  */
 public function clean($source, $type = 'string')
 {
     // Strip Unicode Supplementary Characters when requested to do so
     if ($this->stripUSC) {
         // Alternatively: preg_replace('/[\x{10000}-\x{10FFFF}]/u', "\xE2\xAF\x91", $source) but it'd be slower.
         $source = $this->stripUSC($source);
     }
     /*
      * Handle the type constraint cases. The filter name is matched case-insensitively.
      * Most filters accept either a single value or an array of values; for array input
      * each element is filtered individually and the result is re-indexed from zero.
      * Values are cast to string before being handed to the string/regex functions so
      * that null and numeric input is handled uniformly across all filters.
      */
     switch (strtoupper($type)) {
         case 'INT':
         case 'INTEGER':
             // Use the first (optionally signed) run of digits found in the value; 0 when there is none.
             $pattern = '/[-+]?[0-9]+/';
             if (is_array($source)) {
                 $result = array();
                 // Iterate through the array
                 foreach ($source as $eachString) {
                     preg_match($pattern, (string) $eachString, $matches);
                     $result[] = isset($matches[0]) ? (int) $matches[0] : 0;
                 }
             } else {
                 preg_match($pattern, (string) $source, $matches);
                 $result = isset($matches[0]) ? (int) $matches[0] : 0;
             }
             break;
         case 'UINT':
             // Same extraction as INT, but the sign is discarded by taking the absolute value.
             $pattern = '/[-+]?[0-9]+/';
             if (is_array($source)) {
                 $result = array();
                 // Iterate through the array
                 foreach ($source as $eachString) {
                     preg_match($pattern, (string) $eachString, $matches);
                     $result[] = isset($matches[0]) ? abs((int) $matches[0]) : 0;
                 }
             } else {
                 preg_match($pattern, (string) $source, $matches);
                 $result = isset($matches[0]) ? abs((int) $matches[0]) : 0;
             }
             break;
         case 'FLOAT':
         case 'DOUBLE':
             // First number found, with optional fraction and exponent. The no-match fallback
             // stays the integer 0 so existing callers see the same value as before.
             $pattern = '/[-+]?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)?/';
             if (is_array($source)) {
                 $result = array();
                 // Iterate through the array
                 foreach ($source as $eachString) {
                     preg_match($pattern, (string) $eachString, $matches);
                     $result[] = isset($matches[0]) ? (float) $matches[0] : 0;
                 }
             } else {
                 preg_match($pattern, (string) $source, $matches);
                 $result = isset($matches[0]) ? (float) $matches[0] : 0;
             }
             break;
         case 'BOOL':
         case 'BOOLEAN':
             if (is_array($source)) {
                 $result = array();
                 // Iterate through the array
                 foreach ($source as $eachString) {
                     $result[] = (bool) $eachString;
                 }
             } else {
                 $result = (bool) $source;
             }
             break;
         case 'WORD':
             // Remove everything except letters and underscores.
             $pattern = '/[^A-Z_]/i';
             if (is_array($source)) {
                 $result = array();
                 // Iterate through the array
                 foreach ($source as $eachString) {
                     $result[] = (string) preg_replace($pattern, '', (string) $eachString);
                 }
             } else {
                 $result = (string) preg_replace($pattern, '', (string) $source);
             }
             break;
         case 'ALNUM':
             // Remove everything except letters and digits.
             $pattern = '/[^A-Z0-9]/i';
             if (is_array($source)) {
                 $result = array();
                 // Iterate through the array
                 foreach ($source as $eachString) {
                     $result[] = (string) preg_replace($pattern, '', (string) $eachString);
                 }
             } else {
                 $result = (string) preg_replace($pattern, '', (string) $source);
             }
             break;
         case 'CMD':
             // Keep letters, digits, underscores, periods and hyphens, then drop any leading periods.
             $pattern = '/[^A-Z0-9_\\.-]/i';
             if (is_array($source)) {
                 $result = array();
                 // Iterate through the array
                 foreach ($source as $eachString) {
                     $cleaned = (string) preg_replace($pattern, '', (string) $eachString);
                     $result[] = ltrim($cleaned, '.');
                 }
             } else {
                 $result = (string) preg_replace($pattern, '', (string) $source);
                 $result = ltrim($result, '.');
             }
             break;
         case 'BASE64':
             // Keep only letters, digits, forward slashes, plus and equals signs.
             $pattern = '/[^A-Z0-9\\/+=]/i';
             if (is_array($source)) {
                 $result = array();
                 // Iterate through the array
                 foreach ($source as $eachString) {
                     $result[] = (string) preg_replace($pattern, '', (string) $eachString);
                 }
             } else {
                 $result = (string) preg_replace($pattern, '', (string) $source);
             }
             break;
         case 'STRING':
             // Decode first, then run the result through remove().
             if (is_array($source)) {
                 $result = array();
                 // Iterate through the array
                 foreach ($source as $eachString) {
                     $result[] = (string) $this->remove($this->decode((string) $eachString));
                 }
             } else {
                 $result = (string) $this->remove($this->decode((string) $source));
             }
             break;
         case 'HTML':
             // Like STRING, but without the decode() step.
             if (is_array($source)) {
                 $result = array();
                 // Iterate through the array
                 foreach ($source as $eachString) {
                     $result[] = (string) $this->remove((string) $eachString);
                 }
             } else {
                 $result = (string) $this->remove((string) $source);
             }
             break;
         case 'ARRAY':
             $result = (array) $source;
             break;
         case 'PATH':
             // The whole value must match the path pattern; anything else becomes an empty string.
             $pattern = '/^[A-Za-z0-9_\\/-]+[A-Za-z0-9_\\.-]*([\\\\\\/][A-Za-z0-9_-]+[A-Za-z0-9_\\.-]*)*$/';
             if (is_array($source)) {
                 $result = array();
                 // Iterate through the array
                 foreach ($source as $eachString) {
                     preg_match($pattern, (string) $eachString, $matches);
                     $result[] = isset($matches[0]) ? (string) $matches[0] : '';
                 }
             } else {
                 preg_match($pattern, (string) $source, $matches);
                 $result = isset($matches[0]) ? (string) $matches[0] : '';
             }
             break;
         case 'TRIM':
             // Trim regular whitespace first, then the byte sequences E3 80 80 and C2 A0
             // (the multibyte and non-breaking spaces mentioned in the method documentation).
             if (is_array($source)) {
                 $result = array();
                 // Iterate through the array
                 foreach ($source as $eachString) {
                     $cleaned = trim((string) $eachString);
                     $cleaned = StringHelper::trim($cleaned, chr(0xe3) . chr(0x80) . chr(0x80));
                     $result[] = StringHelper::trim($cleaned, chr(0xc2) . chr(0xa0));
                 }
             } else {
                 $result = trim((string) $source);
                 $result = StringHelper::trim($result, chr(0xe3) . chr(0x80) . chr(0x80));
                 $result = StringHelper::trim($result, chr(0xc2) . chr(0xa0));
             }
             break;
         case 'USERNAME':
             // Remove control characters and the characters < > " ' % &
             $pattern = '/[\\x00-\\x1F\\x7F<>"\'%&]/';
             if (is_array($source)) {
                 $result = array();
                 // Iterate through the array
                 foreach ($source as $eachString) {
                     $result[] = (string) preg_replace($pattern, '', (string) $eachString);
                 }
             } else {
                 $result = (string) preg_replace($pattern, '', (string) $source);
             }
             break;
         case 'RAW':
             $result = $source;
             break;
         default:
             // NOTE(review): this branch calls _remove()/_decode() while the STRING branch calls
             // remove()/decode(). Presumably the underscore variants are legacy aliases - confirm.
             // Are we dealing with an array?
             if (is_array($source)) {
                 // Unlike the typed filters above, keys are preserved and non-string elements are left untouched.
                 foreach ($source as $key => $value) {
                     // Filter element for XSS and other 'bad' code etc.
                     if (is_string($value)) {
                         $source[$key] = $this->_remove($this->_decode($value));
                     }
                 }
                 $result = $source;
             } else {
                 // Or a string?
                 if (is_string($source) && !empty($source)) {
                     // Filter source for XSS and other 'bad' code etc.
                     $result = $this->_remove($this->_decode($source));
                 } else {
                     // Not an array or string... return the passed parameter
                     $result = $source;
                 }
             }
             break;
     }
     return $result;
 }
示例#2
0
 /**
  * Method to get the base word of a token. This method uses the public
  * {@link FinderIndexerHelper::$stemmer} object if it is set. If no stemmer is set,
  * the original token is returned.
  *
  * @param   string  $token  The token to stem.
  * @param   string  $lang   The language of the token.
  *
  * @return  string  The root token.
  *
  * @since   2.5
  */
 public static function stem($token, $lang)
 {
     $apostrophe = '\'';
     // Drop apostrophes from both ends of the token.
     $word = StringHelper::trim($token, $apostrophe);
     // An apostrophe still inside the token cuts it: keep only the part before it.
     $cut = StringHelper::strpos($word, $apostrophe);
     if ($cut !== false) {
         $word = StringHelper::substr($word, 0, $cut);
     }
     // Without a usable stemmer the cleaned token is returned as-is.
     if (!(static::$stemmer instanceof FinderIndexerStemmer)) {
         return $word;
     }
     // Hand the cleaned token to the configured stemmer.
     return static::$stemmer->stem($word, $lang);
 }
示例#3
0
 /**
  * Method to process the query input string and extract required, optional,
  * and excluded tokens; taxonomy filters; and date filters.
  *
  * @param   string  $input  The query input string.
  * @param   string  $lang   The query input language.
  * @param   string  $mode   The query matching mode.
  *
  * @return  boolean  True on success.
  *
  * @since   2.5
  * @throws  Exception on database error.
  */
 protected function processString($input, $lang, $mode)
 {
     // Clean up the input string: decode entities, lowercase, collapse whitespace and trim.
     $input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');
     $input = StringHelper::strtolower($input);
     $input = preg_replace('#\\s+#mi', ' ', $input);
     $input = StringHelper::trim($input);
     $debug = JFactory::getConfig()->get('debug_lang');
     /*
      * First, we need to handle string based modifiers. String based
      * modifiers could potentially include things like "category:blah" or
      * "before:2009-10-21" or "type:article", etc.
      */
     $patterns = array('before' => JText::_('COM_FINDER_FILTER_WHEN_BEFORE'), 'after' => JText::_('COM_FINDER_FILTER_WHEN_AFTER'));
     // Add the taxonomy branch titles to the possible patterns.
     foreach (FinderIndexerTaxonomy::getBranchTitles() as $branch) {
         // Add the pattern.
         $patterns[$branch] = StringHelper::strtolower(JText::_(FinderHelperLanguage::branchSingular($branch)));
     }
     // Container for search terms and phrases.
     $terms = array();
     $phrases = array();
     // Cleared filter branches.
     $cleared = array();
     /*
      * Compile the suffix pattern. This is used to match the values of the
      * filter input string. Single words can be input directly, multi-word
      * values have to be wrapped in double quotes.
      */
     $quotes = html_entity_decode('&#8216;&#8217;&#39;', ENT_QUOTES, 'UTF-8');
     $suffix = '(([\\w\\d' . $quotes . '-]+)|\\"([\\w\\d\\s' . $quotes . '-]+)\\")';
     /*
      * Iterate through the possible filter patterns and search for matches.
      * We need to match the key, colon, and a value pattern for the match
      * to be valid.
      */
     foreach ($patterns as $modifier => $pattern) {
         $matches = array();
         if ($debug) {
             // Drop the first and last two characters of the label when language debugging is on.
             $pattern = substr($pattern, 2, -2);
         }
         // The label is literal text: quote it so that regex metacharacters (or the '#'
         // delimiter) in a translation cannot break the expression or shift the capture
         // group numbers relied upon below.
         $label = preg_quote($pattern, '#');
         // Check if the filter pattern is in the input string.
         if (preg_match('#' . $label . '\\s*:\\s*' . $suffix . '#mi', $input, $matches)) {
             // Get the value given to the modifier: group 3 holds a quoted value, group 1 an unquoted one.
             $value = isset($matches[3]) ? $matches[3] : $matches[1];
             // Now we have to handle the filter string.
             switch ($modifier) {
                 // Handle a before and after date filters.
                 case 'before':
                 case 'after':
                     // Get the time offset.
                     $offset = JFactory::getApplication()->get('offset');
                     // Array of allowed when values.
                     $whens = array('before', 'after', 'exact');
                     // The value of 'today' is a special case that we need to handle.
                     if ($value === StringHelper::strtolower(JText::_('COM_FINDER_QUERY_FILTER_TODAY'))) {
                         // NOTE(review): '%Y-%m-%d' looks like a strftime-style format - confirm that
                         // the date object's format() accepts it rather than date()-style 'Y-m-d'.
                         $value = JFactory::getDate('now', $offset)->format('%Y-%m-%d');
                     }
                     // Try to parse the date string.
                     $date = JFactory::getDate($value, $offset);
                     // Check if the date was parsed successfully.
                     if ($date->toUnix() !== null) {
                         // Set the date filter.
                         $this->date1 = $date->toSql();
                         $this->when1 = in_array($modifier, $whens) ? $modifier : 'before';
                     }
                     break;
                 // Handle a taxonomy branch filter.
                 default:
                     // Try to find the node id.
                     $return = FinderIndexerTaxonomy::getNodeByTitle($modifier, $value);
                     // Check if the node id was found.
                     if ($return) {
                         // Check if the branch has been cleared.
                         if (!in_array($modifier, $cleared)) {
                             // Clear the branch.
                             $this->filters[$modifier] = array();
                             // Add the branch to the cleared list.
                             $cleared[] = $modifier;
                         }
                         // Add the filter to the list.
                         $this->filters[$modifier][$return->title] = (int) $return->id;
                     }
                     break;
             }
             // Clean up the input string again.
             $input = str_replace($matches[0], '', $input);
             $input = preg_replace('#\\s+#mi', ' ', $input);
             $input = StringHelper::trim($input);
         }
     }
     /*
      * Extract the tokens enclosed in double quotes so that we can handle
      * them as phrases.
      */
     if (StringHelper::strpos($input, '"') !== false) {
         $matches = array();
         // Extract the tokens enclosed in double quotes.
         if (preg_match_all('#\\"([^"]+)\\"#mi', $input, $matches)) {
             /*
              * One or more phrases were found so we need to iterate through
              * them, tokenize them as phrases, and remove them from the raw
              * input string before we move on to the next processing step.
              */
             foreach ($matches[1] as $key => $match) {
                 // Find the complete phrase in the input string.
                 $pos = StringHelper::strpos($input, $matches[0][$key]);
                 $len = StringHelper::strlen($matches[0][$key]);
                 // Add any terms that are before this phrase to the stack.
                 if (StringHelper::trim(StringHelper::substr($input, 0, $pos))) {
                     $terms = array_merge($terms, explode(' ', StringHelper::trim(StringHelper::substr($input, 0, $pos))));
                 }
                 // Strip out everything up to and including the phrase.
                 $input = StringHelper::substr($input, $pos + $len);
                 // Clean up the input string again.
                 $input = preg_replace('#\\s+#mi', ' ', $input);
                 $input = StringHelper::trim($input);
                 // Get the number of words in the phrase.
                 $parts = explode(' ', $match);
                 // Check if the phrase is longer than three words.
                 if (count($parts) > 3) {
                     /*
                      * If the phrase is longer than three words, we need to
                      * break it down into smaller chunks of phrases that
                      * are less than or equal to three words. We overlap
                      * the chunks so that we can ensure that a match is
                      * found for the complete phrase and not just portions
                      * of it.
                      */
                     for ($i = 0, $c = count($parts); $i < $c; $i += 2) {
                         // Set up the chunk.
                         $chunk = array();
                         // The chunk has to be assembled based on how many
                         // pieces are available to use.
                         switch ($c - $i) {
                             /*
                              * If only one word is left, we can break from
                              * the switch and loop because the last word
                              * was already used at the end of the last
                              * chunk.
                              */
                             case 1:
                                 break 2;
                             // If two words are left, we use them both as
                             // the last chunk of the phrase and we're done.
                             case 2:
                                 $chunk[] = $parts[$i];
                                 $chunk[] = $parts[$i + 1];
                                 break;
                             // If there are three or more words left, we
                             // build a three word chunk and continue on.
                             default:
                                 $chunk[] = $parts[$i];
                                 $chunk[] = $parts[$i + 1];
                                 $chunk[] = $parts[$i + 2];
                                 break;
                         }
                         // If the chunk is not empty, add it as a phrase.
                         if (count($chunk)) {
                             $phrases[] = implode(' ', $chunk);
                             $terms[] = implode(' ', $chunk);
                         }
                     }
                 } else {
                     // The phrase is <= 3 words so we can use it as is.
                     $phrases[] = $match;
                     $terms[] = $match;
                 }
             }
         }
     }
     // Add the remaining terms if present.
     if (!empty($input)) {
         $terms = array_merge($terms, explode(' ', $input));
     }
     // An array of our boolean operators. $operator => $translation
     $operators = array('AND' => StringHelper::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_AND')), 'OR' => StringHelper::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_OR')), 'NOT' => StringHelper::strtolower(JText::_('COM_FINDER_QUERY_OPERATOR_NOT')));
     // If language debugging is enabled you need to ignore the debug strings in matching.
     // NOTE(review): this tests JDEBUG, whereas the modifier loop above tests the
     // 'debug_lang' configuration value - confirm which flag is intended here.
     if (JDEBUG) {
         $debugStrings = array('**', '??');
         $operators = str_replace($debugStrings, '', $operators);
     }
     /*
      * Iterate through the terms and perform any sorting that needs to be
      * done based on boolean search operators. Terms that are before an
      * and/or/not modifier have to be handled in relation to their operator.
      * Terms, phrases and operators are all strings, so the lookups below
      * compare strictly to stop numeric-looking strings (e.g. "1e2" and
      * "100") from being treated as equal.
      */
     for ($i = 0, $c = count($terms); $i < $c; $i++) {
         // Check if the term is followed by an operator that we understand.
         if (isset($terms[$i + 1]) && in_array($terms[$i + 1], $operators, true)) {
             // Get the operator mode.
             $op = array_search($terms[$i + 1], $operators, true);
             // Handle the AND operator.
             if ($op === 'AND' && isset($terms[$i + 2])) {
                 // Tokenize the current term.
                 $token = FinderIndexerHelper::tokenize($terms[$i], $lang, true);
                 $token = $this->getTokenData($token);
                 // Set the required flag.
                 $token->required = true;
                 // Add the current token to the stack.
                 $this->included[] = $token;
                 $this->highlight = array_merge($this->highlight, array_keys($token->matches));
                 // Skip the next token (the mode operator).
                 $this->operators[] = $terms[$i + 1];
                 // Tokenize the term after the next term (current plus two).
                 $other = FinderIndexerHelper::tokenize($terms[$i + 2], $lang, true);
                 $other = $this->getTokenData($other);
                 // Set the required flag.
                 $other->required = true;
                 // Add the token after the next token to the stack.
                 $this->included[] = $other;
                 $this->highlight = array_merge($this->highlight, array_keys($other->matches));
                 // Remove the processed phrases if possible.
                 if (($pk = array_search($terms[$i], $phrases, true)) !== false) {
                     unset($phrases[$pk]);
                 }
                 if (($pk = array_search($terms[$i + 2], $phrases, true)) !== false) {
                     unset($phrases[$pk]);
                 }
                 // Remove the processed terms.
                 unset($terms[$i]);
                 unset($terms[$i + 1]);
                 unset($terms[$i + 2]);
                 // Adjust the loop.
                 $i += 2;
                 continue;
             } elseif ($op === 'OR' && isset($terms[$i + 2])) {
                 // Tokenize the current term.
                 $token = FinderIndexerHelper::tokenize($terms[$i], $lang, true);
                 $token = $this->getTokenData($token);
                 // Set the required flag.
                 $token->required = false;
                 // Add the current token to the stack.
                 if (count($token->matches)) {
                     $this->included[] = $token;
                     $this->highlight = array_merge($this->highlight, array_keys($token->matches));
                 } else {
                     $this->ignored[] = $token;
                 }
                 // Skip the next token (the mode operator).
                 $this->operators[] = $terms[$i + 1];
                 // Tokenize the term after the next term (current plus two).
                 $other = FinderIndexerHelper::tokenize($terms[$i + 2], $lang, true);
                 $other = $this->getTokenData($other);
                 // Set the required flag.
                 $other->required = false;
                 // Add the token after the next token to the stack.
                 if (count($other->matches)) {
                     $this->included[] = $other;
                     $this->highlight = array_merge($this->highlight, array_keys($other->matches));
                 } else {
                     $this->ignored[] = $other;
                 }
                 // Remove the processed phrases if possible.
                 if (($pk = array_search($terms[$i], $phrases, true)) !== false) {
                     unset($phrases[$pk]);
                 }
                 if (($pk = array_search($terms[$i + 2], $phrases, true)) !== false) {
                     unset($phrases[$pk]);
                 }
                 // Remove the processed terms.
                 unset($terms[$i]);
                 unset($terms[$i + 1]);
                 unset($terms[$i + 2]);
                 // Adjust the loop.
                 $i += 2;
                 continue;
             }
         } elseif (isset($terms[$i + 1]) && array_search($terms[$i], $operators, true) === 'OR') {
             // Skip the next token (the mode operator).
             $this->operators[] = $terms[$i];
             // Tokenize the next term (current plus one).
             $other = FinderIndexerHelper::tokenize($terms[$i + 1], $lang, true);
             $other = $this->getTokenData($other);
             // Set the required flag.
             $other->required = false;
             // Add the token after the next token to the stack.
             if (count($other->matches)) {
                 $this->included[] = $other;
                 $this->highlight = array_merge($this->highlight, array_keys($other->matches));
             } else {
                 $this->ignored[] = $other;
             }
             // Remove the processed phrase if possible.
             if (($pk = array_search($terms[$i + 1], $phrases, true)) !== false) {
                 unset($phrases[$pk]);
             }
             // Remove the processed terms.
             unset($terms[$i]);
             unset($terms[$i + 1]);
             // Adjust the loop.
             $i++;
             continue;
         } elseif (isset($terms[$i + 1]) && array_search($terms[$i], $operators, true) === 'NOT') {
             // Skip the next token (the mode operator).
             $this->operators[] = $terms[$i];
             // Tokenize the next term (current plus one).
             $other = FinderIndexerHelper::tokenize($terms[$i + 1], $lang, true);
             $other = $this->getTokenData($other);
             // Set the required flag.
             $other->required = false;
             // Add the next token to the stack.
             if (count($other->matches)) {
                 $this->excluded[] = $other;
             } else {
                 $this->ignored[] = $other;
             }
             // Remove the processed phrase if possible.
             if (($pk = array_search($terms[$i + 1], $phrases, true)) !== false) {
                 unset($phrases[$pk]);
             }
             // Remove the processed terms.
             unset($terms[$i]);
             unset($terms[$i + 1]);
             // Adjust the loop.
             $i++;
             continue;
         }
     }
     /*
      * Iterate through any search phrases and tokenize them. We handle
      * phrases as autonomous units and do not break them down into two and
      * three word combinations.
      *
      * The operator handling above may have unset arbitrary entries of
      * $phrases, so its keys are not guaranteed to be contiguous. Iterate
      * over the entries that actually remain instead of counting from zero.
      */
     foreach ($phrases as $phrase) {
         // Tokenize the phrase.
         $token = FinderIndexerHelper::tokenize($phrase, $lang, true);
         $token = $this->getTokenData($token);
         // Set the required flag.
         $token->required = true;
         // Add the current token to the stack.
         $this->included[] = $token;
         $this->highlight = array_merge($this->highlight, array_keys($token->matches));
         // Remove the processed term if possible.
         if (($pk = array_search($phrase, $terms, true)) !== false) {
             unset($terms[$pk]);
         }
     }
     /*
      * Handle any remaining tokens using the standard processing mechanism.
      */
     if (!empty($terms)) {
         // Tokenize the terms.
         $terms = implode(' ', $terms);
         $tokens = FinderIndexerHelper::tokenize($terms, $lang, false);
         // Make sure we are working with an array.
         $tokens = is_array($tokens) ? $tokens : array($tokens);
         // Get the token data and required state for all the tokens.
         foreach ($tokens as $token) {
             // Get the token data.
             $token = $this->getTokenData($token);
             // A token is required only in AND mode, and only when it is not a phrase.
             $token->required = $mode === 'AND' && !$token->phrase;
             // Add the token to the appropriate stack.
             if (count($token->matches) || $token->required) {
                 $this->included[] = $token;
                 $this->highlight = array_merge($this->highlight, array_keys($token->matches));
             } else {
                 $this->ignored[] = $token;
             }
         }
     }
     return true;
 }
示例#4
0
 /**
  * Truncates text blocks over the specified character limit and closes
  * all open HTML tags. The method will optionally not truncate an individual
  * word, it will find the first space that is within the limit and
  * truncate at that point. This method is UTF-8 safe.
  *
  * HTML tag names are matched case-insensitively, and void elements
  * (br, img, input, ...) are never given a closing tag.
  *
  * @param   string   $text       The text to truncate.
  * @param   integer  $length     The maximum length of the text. A value of 0 disables truncation.
  * @param   boolean  $noSplit    Don't split a word if that is where the cutoff occurs (default: true).
  * @param   boolean  $allowHtml  Allow HTML tags in the output, and close any open tags (default: true).
  *
  * @return  string   The truncated text.
  *
  * @since   1.6
  */
 public static function truncate($text, $length = 0, $noSplit = true, $allowHtml = true)
 {
     // Assume a lone open tag is invalid HTML.
     if ($length == 1 && substr($text, 0, 1) == '<') {
         return '...';
     }
     // Check if HTML tags are allowed.
     if (!$allowHtml) {
         // Deal with spacing issues in the input.
         $text = str_replace('>', '> ', $text);
         $text = str_replace(array('&nbsp;', '&#160;'), ' ', $text);
         $text = StringHelper::trim(preg_replace('#\\s+#mui', ' ', $text));
         // Strip the tags from the input and decode entities.
         $text = strip_tags($text);
         $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
         // Remove remaining extra spaces.
         $text = str_replace('&nbsp;', ' ', $text);
         $text = StringHelper::trim(preg_replace('#\\s+#mui', ' ', $text));
     }
     // Whether or not allowing HTML, truncate the item text if it is too long.
     if ($length > 0 && StringHelper::strlen($text) > $length) {
         $tmp = trim(StringHelper::substr($text, 0, $length));
         // The cut landed inside the very first tag: nothing usable is left.
         if (substr($tmp, 0, 1) == '<' && strpos($tmp, '>') === false) {
             return '...';
         }
         // $noSplit true means that we do not allow splitting of words.
         if ($noSplit) {
             // Find the position of the last space within the allowed length.
             $offset = StringHelper::strrpos($tmp, ' ');
             $tmp = StringHelper::substr($tmp, 0, $offset + 1);
             // If there are no spaces and the string is longer than the maximum
             // we need to just use the ellipsis. In that case we are done.
             if ($offset === false && strlen($text) > $length) {
                 return '...';
             }
             // Leave room for the three-character ellipsis by dropping one more word.
             if (StringHelper::strlen($tmp) > $length - 3) {
                 $tmp = trim(StringHelper::substr($tmp, 0, StringHelper::strrpos($tmp, ' ')));
             }
         }
         if ($allowHtml) {
             // Void elements never take a closing tag, so they must not be counted as "open".
             $voidTags = array('area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr');
             // Put all opened tags into an array.
             // NOTE(review): the (?!/) lookahead sits directly before '>' and therefore does not
             // filter out self-closing syntax such as <span />; only the void list below does.
             preg_match_all("#<([a-z][a-z0-9]*)\\b.*?(?!/)>#i", $tmp, $result);
             $openedTags = array();
             foreach ($result[1] as $tagName) {
                 // HTML tag names are case-insensitive, so compare in lower case (<BR> is still void).
                 if (!in_array(strtolower($tagName), $voidTags, true)) {
                     $openedTags[] = $tagName;
                 }
             }
             // Put all closed tags into an array, normalised to lower case for matching.
             preg_match_all("#</([a-z][a-z0-9]*)\\b(?:[^>]*?)>#iU", $tmp, $result);
             $closedTags = array_map('strtolower', $result[1]);
             $numOpened = count($openedTags);
             // All tags are closed so trim the text and finish.
             if (count($closedTags) == $numOpened) {
                 return trim($tmp) . '...';
             }
             // Closing tags need to be in the reverse order of opening tags.
             $openedTags = array_reverse($openedTags);
             // Close tags
             for ($i = 0; $i < $numOpened; $i++) {
                 $closedKey = array_search(strtolower($openedTags[$i]), $closedTags, true);
                 if ($closedKey === false) {
                     // No matching close tag survived the cut: append one, keeping the opener's spelling.
                     $tmp .= "</" . $openedTags[$i] . ">";
                 } else {
                     // Consume the matching close tag so it cannot satisfy another opener.
                     unset($closedTags[$closedKey]);
                 }
             }
         }
         // Only use the truncated version when it is actually shorter than the input.
         if ($tmp === false || strlen($text) > strlen($tmp)) {
             $text = trim($tmp) . '...';
         }
     }
     // Clean up any internal spaces created by the processing.
     $text = str_replace(' </', '</', $text);
     $text = str_replace(' ...', '...', $text);
     return $text;
 }
示例#5
0
 /**
  * Method to parse input, tokenize it, and then add it to the database.
  *
  * Resource input is read in 2048-byte reads and cut at the last space so a
  * term is not split across reads; text that has been handed to the tokenizer
  * is removed from the read buffer so it is never tokenized twice.
  *
  * @param   mixed    $input    String or resource to use as input. A resource input will automatically be chunked to conserve
  *                             memory. Strings will be chunked if longer than 2K in size.
  * @param   integer  $context  The context of the input. See context constants.
  * @param   string   $lang     The language of the input.
  * @param   string   $format   The format of the input.
  *
  * @return  integer  The number of tokens extracted from the input.
  *
  * @since   2.5
  */
 protected function tokenizeToDb($input, $context, $lang, $format)
 {
     $count = 0;
     $buffer = null;
     if (!empty($input)) {
         // If the input is a resource, batch the process out.
         if (is_resource($input)) {
             // Batch the process out to avoid memory limits.
             while (!feof($input)) {
                 // Read into the buffer.
                 $buffer .= fread($input, 2048);
                 /*
                  * If we haven't reached the end of the file, seek to the last
                  * space character and drop whatever is after that to make sure
                  * we didn't truncate a term while reading the input.
                  */
                 if (!feof($input)) {
                     // Find the last space character.
                     $ls = strrpos($buffer, ' ');
                     // Adjust string based on the last space character.
                     if ($ls) {
                         // Truncate the string to the last space character.
                         $string = substr($buffer, 0, $ls);
                         // Adjust the buffer based on the last space for the next iteration and trim.
                         $buffer = StringHelper::trim(substr($buffer, $ls));
                     } else {
                         // No usable space: consume the whole buffer and clear it, otherwise
                         // the same text would be tokenized again on the next read.
                         $string = $buffer;
                         $buffer = '';
                     }
                 } else {
                     // End of input: whatever is left in the buffer is the final chunk.
                     $string = $buffer;
                     $buffer = '';
                 }
                 // Parse the input.
                 $string = FinderIndexerHelper::parse($string, $format);
                 // Check the input.
                 if (empty($string)) {
                     continue;
                 }
                 // Tokenize the input.
                 $tokens = FinderIndexerHelper::tokenize($string, $lang);
                 // Add the tokens to the database.
                 $count += $this->addTokensToDb($tokens, $context);
                 // Check if we're approaching the memory limit of the token table.
                 if ($count > static::$state->options->get('memory_table_limit', 30000)) {
                     $this->toggleTables(false);
                 }
                 unset($string, $tokens);
             }
         } elseif (strlen($input) > 2048) {
             $start = 0;
             $end = strlen($input);
             $chunk = 2048;
             /*
              * As it turns out, the complex regular expressions we use for
              * sanitizing input are not very efficient when given large
              * strings. It is much faster to process lots of short strings.
              */
             while ($start < $end) {
                 // Setup the string.
                 $string = substr($input, $start, $chunk);
                 // Find the last space character if we aren't at the end.
                 $ls = $start + $chunk < $end ? strrpos($string, ' ') : false;
                 // Truncate to the last space character.
                 if ($ls !== false) {
                     $string = substr($string, 0, $ls);
                 }
                 // Adjust the start position for the next iteration: resume just past
                 // the space we cut at, or a full chunk further when there was none.
                 $start += $ls !== false ? $ls + 1 : $chunk;
                 // Parse the input.
                 $string = FinderIndexerHelper::parse($string, $format);
                 // Check the input.
                 if (empty($string)) {
                     continue;
                 }
                 // Tokenize the input.
                 $tokens = FinderIndexerHelper::tokenize($string, $lang);
                 // Add the tokens to the database.
                 $count += $this->addTokensToDb($tokens, $context);
                 // Check if we're approaching the memory limit of the token table.
                 if ($count > static::$state->options->get('memory_table_limit', 30000)) {
                     $this->toggleTables(false);
                 }
             }
         } else {
             // Short string: process it in a single pass.
             // Parse the input.
             $input = FinderIndexerHelper::parse($input, $format);
             // Check the input.
             if (empty($input)) {
                 return $count;
             }
             // Tokenize the input.
             $tokens = FinderIndexerHelper::tokenize($input, $lang);
             // Add the tokens to the database.
             $count = $this->addTokensToDb($tokens, $context);
         }
     }
     return $count;
 }
 /**
  * Assign state codes to US locations using unofficial state data that is
  * not linked to location IDs.
  *
  * Every <city> entry of the given XML contributes a name-based match
  * condition; one UPDATE on #__crowdf_locations is executed per state code.
  *
  * @param SimpleXMLElement $content  XML whose "city" children provide "name" and "state_code".
  *
  * @todo remove this in next major version.
  */
 protected function importUnofficialStates($content)
 {
     // Nothing to import.
     if (empty($content)) {
         return;
     }
     $db = $this->getDbo();
     // Map of state code => list of SQL conditions matching the cities of that state.
     $conditionsByState = array();
     foreach ($content->city as $city) {
         $cityName = StringHelper::trim($city->name);
         // Skip entries that have no usable name.
         if (!$cityName) {
             continue;
         }
         $code = StringHelper::trim($city->state_code);
         $conditionsByState[$code][] = sprintf(
             '(%s=%s AND %s=%s)',
             $db->quoteName('name'),
             $db->quote($cityName),
             $db->quoteName('country_code'),
             $db->quote('US')
         );
     }
     // Run a single UPDATE per state, matching any of its cities.
     foreach ($conditionsByState as $code => $conditions) {
         $query = $db->getQuery(true);
         $query
             ->update('#__crowdf_locations')
             ->set($db->quoteName('state_code') . ' = ' . $db->quote($code))
             ->where(implode(' OR ', $conditions));
         $db->setQuery($query);
         $db->execute();
     }
 }