コード例 #1
0
ファイル: indexer.php プロジェクト: acculitx/fleetmatrixsite
 /**
  * Method to parse input, tokenize it, and then add it to the database.
  *
  * Resource input is read in 2K blocks and string input longer than 2K is
  * cut into 2K slices. Where possible each piece is ended at its last space
  * character so that a term is not split across two pieces. Every piece is
  * handed to the indexer exactly once.
  *
  * @param   mixed    $input    String or resource to use as input. A resource
  *                             input will automatically be chunked to conserve
  *                             memory. Strings will be chunked if longer than
  *                             2K in size.
  * @param   integer  $context  The context of the input. See context constants.
  * @param   string   $lang     The language of the input.
  * @param   string   $format   The format of the input.
  *
  * @return  integer  The number of tokens added, as reported by
  *                   FinderIndexer::addTokensToDB().
  *
  * @since   2.5
  */
 protected static function tokenizeToDB($input, $context, $lang, $format)
 {
     $count = 0;
     // Size, in bytes, of one unit of work.
     $chunk = 2048;
     // If the input is a resource, batch the process out.
     if (is_resource($input)) {
         $buffer = '';
         $eof = feof($input);
         // Batch the process out to avoid memory limits.
         while (!$eof) {
             // Read into the buffer. A failed read is treated as the end of
             // the input so a broken stream cannot keep this loop spinning.
             $data = fread($input, $chunk);
             $eof = $data === false || feof($input);
             if ($data !== false) {
                 $buffer .= $data;
             }
             if ($eof) {
                 // Nothing more will arrive: flush whatever is buffered.
                 $string = $buffer;
                 $buffer = '';
             } else {
                 // More data follows, so cut at the last space character to
                 // make sure a term is not truncated by the read boundary.
                 // strrpos() returns 0 for a match at the first byte, so the
                 // result must be compared against false explicitly.
                 $ls = strrpos($buffer, ' ');
                 if ($ls !== false) {
                     // Index everything before the last space ...
                     $string = substr($buffer, 0, $ls);
                     // ... and carry the remainder over to the next read.
                     $buffer = JString::trim(substr($buffer, $ls));
                 } else {
                     // No space at all: index the buffer as is and empty it,
                     // otherwise the same text would be indexed again on the
                     // next iteration.
                     $string = $buffer;
                     $buffer = '';
                 }
             }
             // Parse the input.
             $string = FinderIndexerHelper::parse($string, $format);
             // Check the input.
             if (empty($string)) {
                 continue;
             }
             // Tokenize the input.
             $tokens = FinderIndexerHelper::tokenize($string, $lang);
             // Add the tokens to the database.
             $count += FinderIndexer::addTokensToDB($tokens, $context);
             // Check if we're approaching the memory limit of the token table.
             if ($count > self::$state->options->get('memory_table_limit', 30000)) {
                 FinderIndexer::toggleTables(false);
             }
             // Release the working copies before the next read.
             unset($string, $tokens);
         }
     } elseif (strlen($input) > $chunk) {
         $start = 0;
         $end = strlen($input);
         // As it turns out, the complex regular expressions we use for
         // sanitizing input are not very efficient when given large
         // strings. It is much faster to process lots of short strings.
         while ($start < $end) {
             // Setup the string.
             $string = substr($input, $start, $chunk);
             // Find the last space character if we aren't at the end.
             $ls = ($start + $chunk < $end) ? strrpos($string, ' ') : false;
             if ($ls !== false) {
                 // Truncate to the last space character and resume right
                 // after that space on the next iteration.
                 $string = substr($string, 0, $ls);
                 $start += $ls + 1;
             } else {
                 // Final slice, or no space to cut at: consume the whole slice.
                 $start += $chunk;
             }
             // Parse the input.
             $string = FinderIndexerHelper::parse($string, $format);
             // Check the input.
             if (empty($string)) {
                 continue;
             }
             // Tokenize the input.
             $tokens = FinderIndexerHelper::tokenize($string, $lang);
             // Add the tokens to the database.
             $count += FinderIndexer::addTokensToDB($tokens, $context);
             // Check if we're approaching the memory limit of the token table.
             if ($count > self::$state->options->get('memory_table_limit', 30000)) {
                 FinderIndexer::toggleTables(false);
             }
         }
     } else {
         // Short string: process it in a single pass.
         // Parse the input.
         $input = FinderIndexerHelper::parse($input, $format);
         // Check the input.
         if (empty($input)) {
             return $count;
         }
         // Tokenize the input.
         $tokens = FinderIndexerHelper::tokenize($input, $lang);
         // Add the tokens to the database.
         $count = FinderIndexer::addTokensToDB($tokens, $context);
     }
     return $count;
 }