/**
 * Method to parse input, tokenize it, and then add it to the database.
 *
 * Large inputs are processed in pieces of roughly 2048 bytes, split on the
 * last space of each piece so that a term is not cut in half. Each piece is
 * parsed, tokenized and stored independently.
 *
 * @param   mixed    $input    String or resource to use as input. A resource
 *                             input will automatically be chunked to conserve
 *                             memory. Strings will be chunked if longer than
 *                             2K in size.
 * @param   integer  $context  The context of the input. See context constants.
 * @param   string   $lang     The language of the input.
 * @param   string   $format   The format of the input.
 *
 * @return  integer  The number of tokens extracted from the input.
 *
 * @since   2.5
 */
protected static function tokenizeToDB($input, $context, $lang, $format)
{
	$count = 0;
	$buffer = '';

	// If the input is a resource, batch the process out.
	if (is_resource($input))
	{
		// Batch the process out to avoid memory limits.
		while (!feof($input))
		{
			// Read into the buffer. A failed read is handled like the end of
			// the input so that a broken stream cannot keep the loop spinning.
			$data = fread($input, 2048);
			$failed = ($data === false);

			if (!$failed)
			{
				$buffer .= $data;
			}

			// If we haven't reached the end of the file, seek to the last
			// space character and hold back whatever is after that to make
			// sure we didn't truncate a term while reading the input.
			if (!$failed && !feof($input))
			{
				// Find the last space character.
				$ls = strrpos($buffer, ' ');

				// A space at offset 0 leaves nothing in front of it to emit,
				// so it is treated the same as "no space found".
				if ($ls !== false && $ls > 0)
				{
					// Truncate the string to the last space character.
					$string = substr($buffer, 0, $ls);

					// Carry the remainder over to the next iteration, trimmed.
					$buffer = JString::trim(substr($buffer, $ls));
				}
				else
				{
					// No split point: emit the whole buffer and clear it so
					// the same text is not processed again on the next read.
					$string = $buffer;
					$buffer = '';
				}
			}
			else
			{
				// End of input: flush whatever is left.
				$string = $buffer;
				$buffer = '';
			}

			// Parse the input.
			$string = FinderIndexerHelper::parse($string, $format);

			// Only non-empty parse results are tokenized.
			if (!empty($string))
			{
				// Tokenize the input.
				$tokens = FinderIndexerHelper::tokenize($string, $lang);

				// Add the tokens to the database.
				$count += FinderIndexer::addTokensToDB($tokens, $context);

				// Check if we're approaching the memory limit of the token table.
				if ($count > self::$state->options->get('memory_table_limit', 30000))
				{
					FinderIndexer::toggleTables(false);
				}

				unset($tokens);
			}

			unset($string);

			if ($failed)
			{
				break;
			}
		}
	}
	elseif (strlen($input) > 2048)
	{
		$start = 0;
		$end = strlen($input);
		$chunk = 2048;

		// As it turns out, the complex regular expressions we use for
		// sanitizing input are not very efficient when given large
		// strings. It is much faster to process lots of short strings.
		while ($start < $end)
		{
			// Setup the string.
			$string = substr($input, $start, $chunk);

			// Find the last space character if we aren't at the end.
			$ls = $start + $chunk < $end ? strrpos($string, ' ') : false;

			// Truncate to the last space character.
			if ($ls !== false)
			{
				$string = substr($string, 0, $ls);
			}

			// Advance past the space we split on, or by a whole chunk when
			// there was no split point.
			$start += $ls !== false ? $ls + 1 : $chunk;

			// Parse the input.
			$string = FinderIndexerHelper::parse($string, $format);

			// Check the input.
			if (empty($string))
			{
				continue;
			}

			// Tokenize the input.
			$tokens = FinderIndexerHelper::tokenize($string, $lang);

			// Add the tokens to the database.
			$count += FinderIndexer::addTokensToDB($tokens, $context);

			// Check if we're approaching the memory limit of the token table.
			if ($count > self::$state->options->get('memory_table_limit', 30000))
			{
				FinderIndexer::toggleTables(false);
			}
		}
	}
	else
	{
		// Short input: process it in one go.
		$input = FinderIndexerHelper::parse($input, $format);

		// Check the input.
		if (empty($input))
		{
			return $count;
		}

		// Tokenize the input.
		$tokens = FinderIndexerHelper::tokenize($input, $lang);

		// Add the tokens to the database.
		$count = FinderIndexer::addTokensToDB($tokens, $context);
	}

	return $count;
}