예제 #1
0
 /**
  * Method to tokenize a text string.
  *
  * The input is lower-cased, cleaned of punctuation by a series of regular
  * expressions and split on spaces into terms. Unless the input is handled
  * as a phrase, every term becomes a token and derived two and three word
  * phrase tokens are appended. Results for inputs shorter than 128
  * characters are memoized in a static variable.
  *
  * @param   string   $input   The input to tokenize.
  * @param   string   $lang    The language of the input.
  * @param   boolean  $phrase  Flag to indicate whether input could be a phrase. [optional]
  *
  * @return  mixed  An array of FinderIndexerToken objects when more than one token
  *                 was created, otherwise the single FinderIndexerToken object.
  *
  * @since   2.5
  */
 public static function tokenize($input, $lang, $phrase = false)
 {
     static $cache = array();

     // Only short inputs are cached; the key is built from all three arguments.
     $store = JString::strlen($input) < 128 ? md5($input . '::' . $lang . '::' . $phrase) : null;

     // Check if the string has been tokenized already.
     if ($store && isset($cache[$store])) {
         return $cache[$store];
     }

     $tokens = array();
     $quotes = html_entity_decode('&#8216;&#8217;&#39;', ENT_QUOTES, 'UTF-8');

     // Get the simple language key.
     $lang = FinderIndexerHelper::getPrimaryLanguage($lang);

     /*
      * Parsing the string input into terms is a multi-step process.
      *
      * Regexes:
      *  1. Remove everything except letters, numbers, quotes, apostrophe, plus, dash, period, and comma.
      *  2. Remove plus, dash, period, and comma characters located before letter characters.
      *  3. Remove plus, dash, period, and comma characters located after other characters.
      *  4. Remove plus, period, and comma characters enclosed in alphabetical characters. Ungreedy.
      *  5. Remove orphaned apostrophe, plus, dash, period, and comma characters.
      *  6. Remove orphaned quote characters.
      *  7. Replace the assorted single quotation marks with the ASCII standard single quotation.
      *  8. Remove multiple space characters and replaces with a single space.
      *
      * Note: inside the character classes "+-." is a range that spans plus, comma, dash and period.
      */
     $input = JString::strtolower($input);
     $input = preg_replace('#[^\\pL\\pM\\pN\\p{Pi}\\p{Pf}\'+-.,]+#mui', ' ', $input);

     // Keep the word (group 2) and drop only the leading punctuation. Replacing with
     // group 1 (the leading whitespace) would delete the word itself.
     $input = preg_replace('#(^|\\s)[+-.,]+([\\pL\\pM]+)#mui', ' $2', $input);
     $input = preg_replace('#([\\pL\\pM\\pN]+)[+-.,]+(\\s|$)#mui', '$1 ', $input);

     // Ungreedy
     $input = preg_replace('#([\\pL\\pM]+)[+.,]+([\\pL\\pM]+)#muiU', '$1 $2', $input);
     $input = preg_replace('#(^|\\s)[\'+-.,]+(\\s|$)#mui', ' ', $input);
     $input = preg_replace('#(^|\\s)[\\p{Pi}\\p{Pf}]+(\\s|$)#mui', ' ', $input);
     $input = preg_replace('#[' . $quotes . ']+#mui', '\'', $input);
     $input = preg_replace('#\\s+#mui', ' ', $input);
     $input = JString::trim($input);

     // Explode the normalized string to get the terms.
     $terms = explode(' ', $input);

     /*
      * If we have Unicode support and are dealing with Chinese text, Chinese
      * has to be handled specially because there are not necessarily any spaces
      * between the "words". So, we have to test if the words belong to the Chinese
      * character set and if so, explode them into single glyphs or "words".
      */
     if ($lang === 'zh') {
         // Only the terms present before splitting are inspected; appended glyphs are not revisited.
         for ($i = 0, $n = count($terms); $i < $n; $i++) {
             $charMatches = array();
             $charCount = preg_match_all('#[\\p{Han}]#mui', $terms[$i], $charMatches);

             if (!$charCount) {
                 continue;
             }

             // Strip the glyphs from a local copy so that a repeated glyph never
             // makes us read a term that has already been removed from the list.
             $remainder = $terms[$i];

             // Split apart any groups of Chinese characters.
             for ($j = 0; $j < $charCount; $j++) {
                 $remainder = JString::str_ireplace($charMatches[0][$j], '', $remainder, false);
                 $terms[] = $charMatches[0][$j];
             }

             // Keep whatever non-Chinese text is left (a strict test, so "0" survives).
             if ((string) $remainder !== '') {
                 $terms[$i] = $remainder;
             } else {
                 unset($terms[$i]);
             }
         }

         // Reset array keys.
         $terms = array_values($terms);
     }

     /*
      * If we have to handle the input as a phrase, that means we don't
      * tokenize the individual terms and we do not create the two and three
      * term combinations. The phrase must contain more than one word!
      */
     if ($phrase === true && count($terms) > 1) {
         // Create tokens from the phrase.
         $tokens[] = new FinderIndexerToken($terms, $lang);
     } else {
         // Create tokens from the terms.
         for ($i = 0, $n = count($terms); $i < $n; $i++) {
             $tokens[] = new FinderIndexerToken($terms[$i], $lang);
         }

         // Chinese glyphs are joined without a separator, everything else with a space.
         $spacer = $lang === 'zh' ? '' : ' ';

         // Create two and three word phrase tokens from the individual words.
         // $n is fixed up front so the derived tokens appended below are not combined again.
         for ($i = 0, $n = count($tokens); $i < $n; $i++) {
             // Setup the phrase positions.
             $i2 = $i + 1;
             $i3 = $i + 2;

             // Create the two word phrase.
             if ($i2 < $n && isset($tokens[$i2])) {
                 $token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $lang, $spacer);
                 $token->derived = true;

                 // Add the token to the stack.
                 $tokens[] = $token;
             }

             // Create the three word phrase.
             if ($i3 < $n && isset($tokens[$i3])) {
                 $token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term), $lang, $spacer);
                 $token->derived = true;

                 // Add the token to the stack.
                 $tokens[] = $token;
             }
         }
     }

     // A lone token is returned as the object itself rather than wrapped in an array.
     $result = count($tokens) > 1 ? $tokens : array_shift($tokens);

     if ($store) {
         $cache[$store] = $result;
     }

     return $result;
 }
예제 #2
0
파일: query.php 프로젝트: 01J/topm
 /**
  * Method to instantiate the query object.
  *
  * @param   array  $options  An array of query options. The keys read here are
  *                           input, empty, language, date1, date2, when1, when2,
  *                           filter and filters; all of them are optional.
  *
  * @since   2.5
  * @throws  Exception on database error.
  */
 public function __construct($options)
 {
     // Get the input string.
     $this->input = isset($options['input']) ? $options['input'] : null;

     // Get the empty query setting.
     $this->empty = isset($options['empty']) ? (bool) $options['empty'] : false;

     // Get the input language, falling back to the default language, and reduce it to its primary key.
     $this->language = !empty($options['language']) ? $options['language'] : FinderIndexerHelper::getDefaultLanguage();
     $this->language = FinderIndexerHelper::getPrimaryLanguage($this->language);

     // The matching mode is always AND.
     $this->mode = 'AND';

     // Initialize the temporary date storage.
     $this->dates = new JRegistry();

     // Populate the temporary date storage. Each option is checked against its
     // own value so that, for example, an end date supplied without a start
     // date is not silently discarded.
     foreach (array('date1', 'date2', 'when1', 'when2') as $dateKey) {
         if (!empty($options[$dateKey])) {
             $this->dates->set($dateKey, $options[$dateKey]);
         }
     }

     // Process the static taxonomy filters.
     if (!empty($options['filter'])) {
         $this->processStaticTaxonomy($options['filter']);
     }

     // Process the dynamic taxonomy filters.
     if (!empty($options['filters'])) {
         $this->processDynamicTaxonomy($options['filters']);
     }

     // Get the date filters.
     $d1 = $this->dates->get('date1');
     $d2 = $this->dates->get('date2');
     $w1 = $this->dates->get('when1');
     $w2 = $this->dates->get('when2');

     // Process the date filters.
     if (!empty($d1) || !empty($d2)) {
         $this->processDates($d1, $d2, $w1, $w2);
     }

     // Process the input string.
     $this->processString($this->input, $this->language, $this->mode);

     // Get the number of matching terms.
     foreach ($this->included as $token) {
         $this->terms += count($token->matches);
     }

     // Remove the temporary date storage.
     unset($this->dates);

     /*
      * Lastly, determine whether this query can return a result set: either
      * there is a query string, or empty queries are allowed and at least one
      * taxonomy or date filter is set.
      */
     $hasFilter = !empty($this->filter) || !empty($this->filters) || !empty($this->date1) || !empty($this->date2);

     $this->search = !empty($this->input) || ($this->empty && $hasFilter);
 }
 /**
  * Tests the getPrimaryLanguage method
  *
  * Verifies that the full language tag 'en-GB' is reduced to exactly 'en'.
  *
  * @return  void
  *
  * @since   3.0
  */
 public function testGetPrimaryLanguage()
 {
     // An exact comparison is required: a "contains" check would also pass
     // if the helper returned the tag 'en-GB' unchanged.
     $this->assertEquals('en', FinderIndexerHelper::getPrimaryLanguage('en-GB'), 'The primary language is en');
 }
예제 #4
0
 /**
  * Auto-populates the model state. Do not call getState from inside this
  * method, as that results in recursion.
  *
  * @param   string  $ordering   An optional ordering field (not used here).
  * @param   string  $direction  An optional direction (asc|desc) (not used here).
  *
  * @return  void
  *
  * @since   2.5
  */
 protected function populateState($ordering = null, $direction = null)
 {
     // Collect the application, its input, the component parameters and the current user.
     $application = JFactory::getApplication();
     $request = $application->input;
     $componentParams = JComponentHelper::getParams('com_finder');
     $currentUser = JFactory::getUser();

     // The search query comes from the "q" request variable.
     $this->setState('input', $request->request->get('q', '', 'string'));

     // On multilingual sites use the active language tag, otherwise the default language.
     $languageTag = JLanguageMultilang::isEnabled()
         ? JFactory::getLanguage()->getTag()
         : FinderIndexerHelper::getDefaultLanguage();

     // Only the primary language key is stored in the state.
     $this->setState('language', FinderIndexerHelper::getPrimaryLanguage($languageTag));

     // The list always starts at the first row and is limited to ten rows.
     $this->setState('list.start', 0);
     $this->setState('list.limit', 10);

     // Component parameters.
     $this->setState('params', $componentParams);

     // Identifier of the current user.
     $this->setState('user.id', (int) $currentUser->get('id'));
 }
 /**
  * Tests the getPrimaryLanguage method
  *
  * Checks that the language tag 'en-GB' yields the primary language 'en'.
  *
  * @return  void
  *
  * @since   3.0
  */
 public function testGetPrimaryLanguage()
 {
     $languageTag = 'en-GB';
     $primaryLanguage = FinderIndexerHelper::getPrimaryLanguage($languageTag);

     $this->assertEquals('en', $primaryLanguage, 'The primary language is en');
 }