/**
 * Method to tokenize a text string.
 *
 * The input is lower-cased, cleaned of punctuation noise, split on single
 * spaces and converted into FinderIndexerToken objects. Unless the input is
 * treated as a phrase, derived two and three word phrase tokens are appended
 * after the single word tokens. Results for inputs shorter than 128
 * characters are kept in a static cache keyed on all three arguments.
 *
 * @param   string   $input   The input to tokenize.
 * @param   string   $lang    The language of the input.
 * @param   boolean  $phrase  Flag to indicate whether input could be a phrase. [optional]
 *
 * @return  mixed  An array of FinderIndexerToken objects when more than one token was
 *                 produced, otherwise the single FinderIndexerToken itself (null if
 *                 the token list is empty).
 *
 * @since   2.5
 */
public static function tokenize($input, $lang, $phrase = false)
{
    static $cache;

    // Only short inputs are cached; longer inputs are unlikely to repeat.
    $store = JString::strlen($input) < 128 ? md5($input . '::' . $lang . '::' . $phrase) : null;

    // Check if the string has been tokenized already.
    if ($store && isset($cache[$store]))
    {
        return $cache[$store];
    }

    $tokens = array();

    // Left single quote, right single quote and the ASCII apostrophe, built from
    // numeric entities so the source file stays pure ASCII.
    $quotes = html_entity_decode('&#8216;&#8217;&#39;', ENT_QUOTES, 'UTF-8');

    // Get the simple language key.
    $lang = FinderIndexerHelper::getPrimaryLanguage($lang);

    /*
     * Parsing the string input into terms is a multi-step process.
     *
     * Regexes:
     *  1. Remove everything except letters, numbers, quotes, apostrophe, plus, dash, period, and comma.
     *  2. Remove plus, dash, period, and comma characters located before letter characters.
     *  3. Remove plus, dash, period, and comma characters located after other characters.
     *  4. Remove plus, period, and comma characters enclosed in alphabetical characters. Ungreedy.
     *  5. Remove orphaned apostrophe, plus, dash, period, and comma characters.
     *  6. Remove orphaned quote characters.
     *  7. Replace the assorted single quotation marks with the ASCII standard single quotation.
     *  8. Remove multiple space characters and replaces with a single space.
     */
    $input = JString::strtolower($input);
    $input = preg_replace('#[^\\pL\\pM\\pN\\p{Pi}\\p{Pf}\'+-.,]+#mui', ' ', $input);

    // Step 2 must keep the letters (group 2); only the leading punctuation is dropped.
    $input = preg_replace('#(^|\\s)[+-.,]+([\\pL\\pM]+)#mui', '$1$2', $input);
    $input = preg_replace('#([\\pL\\pM\\pN]+)[+-.,]+(\\s|$)#mui', '$1 ', $input);
    $input = preg_replace('#([\\pL\\pM]+)[+.,]+([\\pL\\pM]+)#muiU', '$1 $2', $input); // Ungreedy
    $input = preg_replace('#(^|\\s)[\'+-.,]+(\\s|$)#mui', ' ', $input);
    $input = preg_replace('#(^|\\s)[\\p{Pi}\\p{Pf}]+(\\s|$)#mui', ' ', $input);
    $input = preg_replace('#[' . $quotes . ']+#mui', '\'', $input);
    $input = preg_replace('#\\s+#mui', ' ', $input);
    $input = JString::trim($input);

    // Explode the normalized string to get the terms.
    $terms = explode(' ', $input);

    /*
     * If we have Unicode support and are dealing with Chinese text, Chinese
     * has to be handled specially because there are not necessarily any spaces
     * between the "words". So, we have to test if the words belong to the Chinese
     * character set and if so, explode them into single glyphs or "words".
     */
    if ($lang === 'zh')
    {
        // Iterate through the terms and test if they contain Chinese.
        for ($i = 0, $n = count($terms); $i < $n; $i++)
        {
            $charMatches = array();
            $charCount   = preg_match_all('#[\\p{Han}]#mui', $terms[$i], $charMatches);

            // Nothing to split (no Han characters, or the match failed).
            if (!$charCount)
            {
                continue;
            }

            /*
             * Strip the glyphs from a local copy of the term. Reading the term back
             * from $terms after it has been unset (e.g. when a glyph repeats) would
             * hit an undefined index.
             */
            $remainder = $terms[$i];

            for ($j = 0; $j < $charCount; $j++)
            {
                $remainder = JString::str_ireplace($charMatches[0][$j], '', $remainder, false);

                // Every glyph becomes a term of its own, appended after the existing terms.
                $terms[] = $charMatches[0][$j];
            }

            // Keep whatever non-Chinese text is left; an exact comparison keeps a remaining "0".
            if ((string) $remainder !== '')
            {
                $terms[$i] = $remainder;
            }
            else
            {
                unset($terms[$i]);
            }
        }

        // Reset array keys.
        $terms = array_values($terms);
    }

    /*
     * If we have to handle the input as a phrase, that means we don't
     * tokenize the individual terms and we do not create the two and three
     * term combinations. The phrase must contain more than one word!
     */
    if ($phrase === true && count($terms) > 1)
    {
        // Create tokens from the phrase.
        $tokens[] = new FinderIndexerToken($terms, $lang);
    }
    else
    {
        // Create tokens from the terms.
        foreach ($terms as $term)
        {
            $tokens[] = new FinderIndexerToken($term, $lang);
        }

        // Chinese phrases are joined without a separator, everything else with a space.
        $spacer = $lang === 'zh' ? '' : ' ';

        // Create two and three word phrase tokens from the individual words.
        // $n is fixed up front so the derived tokens appended below are not revisited.
        for ($i = 0, $n = count($tokens); $i < $n; $i++)
        {
            // Setup the phrase positions.
            $i2 = $i + 1;
            $i3 = $i + 2;

            // Create the two word phrase.
            if ($i2 < $n && isset($tokens[$i2]))
            {
                $token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $lang, $spacer);
                $token->derived = true;

                // Add the token to the stack.
                $tokens[] = $token;
            }

            // Create the three word phrase.
            if ($i3 < $n && isset($tokens[$i3]))
            {
                $token = new FinderIndexerToken(
                    array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term),
                    $lang,
                    $spacer
                );
                $token->derived = true;

                // Add the token to the stack.
                $tokens[] = $token;
            }
        }
    }

    // A lone token is returned bare rather than wrapped in an array.
    $result = count($tokens) > 1 ? $tokens : array_shift($tokens);

    if ($store)
    {
        $cache[$store] = $result;
    }

    return $result;
}
/**
 * Method to instantiate the query object.
 *
 * Reads the search input, language, date and taxonomy options, runs the
 * corresponding process* methods and finally decides whether the query is
 * able to return a result set.
 *
 * @param   array  $options  An array of query options. The keys read here are: input, empty,
 *                           language, date1, date2, when1, when2, filter and filters.
 *
 * @since   2.5
 * @throws  Exception on database error.
 */
public function __construct($options)
{
    // Get the input string.
    $this->input = isset($options['input']) ? $options['input'] : null;

    // Get the empty query setting.
    $this->empty = isset($options['empty']) ? (bool) $options['empty'] : false;

    // Get the input language, falling back to the default and reducing it to its primary key.
    $this->language = !empty($options['language']) ? $options['language'] : FinderIndexerHelper::getDefaultLanguage();
    $this->language = FinderIndexerHelper::getPrimaryLanguage($this->language);

    // Get the matching mode.
    $this->mode = 'AND';

    // Initialize the temporary date storage.
    $this->dates = new JRegistry();

    // Populate the temporary date storage. Each option is tested against itself, so
    // an end date (or modifier) supplied without a start date is still honoured.
    foreach (array('date1', 'date2', 'when1', 'when2') as $key)
    {
        if (!empty($options[$key]))
        {
            $this->dates->set($key, $options[$key]);
        }
    }

    // Process the static taxonomy filters.
    if (!empty($options['filter']))
    {
        $this->processStaticTaxonomy($options['filter']);
    }

    // Process the dynamic taxonomy filters.
    if (!empty($options['filters']))
    {
        $this->processDynamicTaxonomy($options['filters']);
    }

    // Get the date filters.
    $d1 = $this->dates->get('date1');
    $d2 = $this->dates->get('date2');
    $w1 = $this->dates->get('when1');
    $w2 = $this->dates->get('when2');

    // Process the date filters; at least one of the two dates must be present.
    if (!empty($d1) || !empty($d2))
    {
        $this->processDates($d1, $d2, $w1, $w2);
    }

    // Process the input string.
    $this->processString($this->input, $this->language, $this->mode);

    // Get the number of matching terms.
    foreach ($this->included as $token)
    {
        $this->terms += count($token->matches);
    }

    // Remove the temporary date storage.
    unset($this->dates);

    /*
     * Lastly, determine whether this query can return a result set: either there
     * is a query string, or empty queries are allowed and at least one filter or
     * date restriction is set.
     */
    if (!empty($this->input))
    {
        $this->search = true;
    }
    elseif ($this->empty && (!empty($this->filter) || !empty($this->filters) || !empty($this->date1) || !empty($this->date2)))
    {
        $this->search = true;
    }
    else
    {
        $this->search = false;
    }
}
/**
 * Tests the getPrimaryLanguage method
 *
 * Checks that the tag 'en-GB' is reduced to exactly 'en'.
 *
 * @return  void
 *
 * @since   3.0
 */
public function testGetPrimaryLanguage()
{
    // An exact comparison is needed here: a "string contains 'en'" check is also
    // satisfied by the untouched input 'en-GB' and therefore proves nothing.
    $this->assertEquals(
        'en',
        FinderIndexerHelper::getPrimaryLanguage('en-GB'),
        'The primary language is en'
    );
}
/**
 * Method to auto-populate the model state. Calling getState in this method will result in recursion.
 *
 * Stores the search input, the primary language key, fixed list bounds,
 * the com_finder parameters and the current user id in the state.
 *
 * @param   string  $ordering   An optional ordering field. Not used by this implementation.
 * @param   string  $direction  An optional direction (asc|desc). Not used by this implementation.
 *
 * @return  void
 *
 * @since   2.5
 */
protected function populateState($ordering = null, $direction = null)
{
    // Collect everything the state is built from.
    $application     = JFactory::getApplication();
    $request         = $application->input;
    $componentParams = JComponentHelper::getParams('com_finder');
    $currentUser     = JFactory::getUser();

    // The search terms come from the "q" request variable.
    $this->setState('input', $request->request->get('q', '', 'string'));

    // Use the active language tag when JLanguageMultilang reports it is enabled,
    // otherwise the finder default, and keep only its primary language key.
    $languageTag = JLanguageMultilang::isEnabled()
        ? JFactory::getLanguage()->getTag()
        : FinderIndexerHelper::getDefaultLanguage();

    $this->setState('language', FinderIndexerHelper::getPrimaryLanguage($languageTag));

    // List state: start at the first result with ten results.
    $this->setState('list.start', 0);
    $this->setState('list.limit', 10);

    // Component parameters.
    $this->setState('params', $componentParams);

    // User state.
    $this->setState('user.id', (int) $currentUser->get('id'));
}
/**
 * Tests the getPrimaryLanguage method
 *
 * Asserts that the tag 'en-GB' yields the primary language 'en'.
 *
 * @return  void
 *
 * @since   3.0
 */
public function testGetPrimaryLanguage()
{
    $languageTag = 'en-GB';
    $expected    = 'en';

    $actual = FinderIndexerHelper::getPrimaryLanguage($languageTag);

    $this->assertEquals($expected, $actual, 'The primary language is en');
}