예제 #1
0
 /**
  * Method to index a content item.
  *
  * @param   FinderIndexerResult  $item    The content item to index.
  * @param   string               $format  The format of the content. [optional]
  *
  * @return  integer  The ID of the record in the links table.
  *
  * @since   2.5
  * @throws  Exception on database error.
  */
 public static function index($item, $format = 'html')
 {
     self::markProfiler('beforeIndexing');

     $db = JFactory::getDBO();
     $nullDate = $db->getNullDate();

     // Look up any existing link record for this item's URL.
     $query = $db->getQuery(true);
     $query->select($db->quoteName('link_id') . ', ' . $db->quoteName('md5sum'));
     $query->from($db->quoteName('#__finder_links'));
     $query->where($db->quoteName('url') . ' = ' . $db->quote($item->url));
     $db->setQuery($query);
     $link = $db->loadObject();
     self::throwOnDatabaseError($db);

     // Get the indexer state; its weights drive the per-context aggregation below.
     $state = FinderIndexer::getState();

     // Current signature of the item versus the one stored with the link (if any).
     $curSig = self::getSignature($item);
     $oldSig = isset($link->md5sum) ? $link->md5sum : null;

     // No usable link_id means the item has never been indexed.
     $isNew = empty($link->link_id);
     $linkId = $isNew ? null : $link->link_id;

     /*
      * If the signatures match, the item is up to date. The comparison is a
      * strict string comparison on purpose: with a loose "==" PHP compares
      * numeric-looking strings (e.g. "0e123..." hashes or all-digit hashes)
      * as numbers, so two different signatures could be treated as equal
      * and a changed item would be skipped.
      */
     if (!$isNew && (string) $curSig === (string) $oldSig) {
         return $linkId;
     }

     /*
      * If the link already exists, flush all the term maps for the item.
      * Maps are stored in 16 tables (suffix 0-f) so we need to iterate
      * through and flush each table one at a time.
      */
     if (!$isNew) {
         for ($i = 0; $i <= 15; $i++) {
             $query->clear();
             $query->delete();
             $query->from($db->quoteName('#__finder_links_terms' . dechex($i)));
             $query->where($db->quoteName('link_id') . ' = ' . (int) $linkId);
             $db->setQuery($query);
             $db->query();
             self::throwOnDatabaseError($db);
         }

         // Remove the taxonomy maps.
         FinderIndexerTaxonomy::removeMaps($linkId);
     }

     self::markProfiler('afterUnmapping');

     // Replace date fields whose integer value is 0 with the database null date.
     foreach (array('publish_start_date', 'publish_end_date', 'start_date', 'end_date') as $dateField) {
         if (intval($item->{$dateField}) === 0) {
             $item->{$dateField} = $nullDate;
         }
     }

     // Prepare the item description from its summary.
     $item->description = FinderIndexerHelper::parse($item->summary);

     /*
      * Now, we need to enter the item into the links table. If the item
      * already exists in the database, we need to use an UPDATE query.
      * Otherwise, we need to use an INSERT to get the link id back.
      *
      * The map below holds column => ready-to-embed SQL value for every
      * column written by both statements.
      */
     $query->clear();
     $fields = array(
         'route' => $db->quote($item->route),
         'title' => $db->quote($item->title),
         'description' => $db->quote($item->description),
         'indexdate' => $query->currentTimestamp(),
         'state' => (int) $item->state,
         'access' => (int) $item->access,
         'language' => $db->quote($item->language),
         'type_id' => (int) $item->type_id,
         'object' => $db->quote(serialize($item)),
         'publish_start_date' => $db->quote($item->publish_start_date),
         'publish_end_date' => $db->quote($item->publish_end_date),
         'start_date' => $db->quote($item->start_date),
         'end_date' => $db->quote($item->end_date),
         'list_price' => $db->quote($item->list_price),
         'sale_price' => $db->quote($item->sale_price),
     );

     if ($isNew) {
         // Only the INSERT writes the url and the initial published flag (1).
         $insertFields = array_merge(
             array('url' => $db->quote($item->url), 'published' => 1),
             $fields
         );

         $columns = array();
         foreach (array_keys($insertFields) as $column) {
             $columns[] = $db->quoteName($column);
         }

         // Insert the link. values() receives a single string so that it is
         // emitted as one row tuple.
         $query->insert($db->quoteName('#__finder_links'));
         $query->columns($columns);
         $query->values(implode(', ', $insertFields));
         $db->setQuery($query);
         $db->query();
         self::throwOnDatabaseError($db);

         // Get the link id.
         $linkId = (int) $db->insertid();
     } else {
         // Update the link.
         $query->update($db->quoteName('#__finder_links'));
         foreach ($fields as $column => $value) {
             $query->set($db->quoteName($column) . ' = ' . $value);
         }
         $query->where('link_id = ' . (int) $linkId);
         $db->setQuery($query);
         $db->query();
         self::throwOnDatabaseError($db);
     }

     // Running total of tokens written for this item.
     $count = 0;

     self::markProfiler('afterLinking');

     // Start from empty token tables.
     foreach (array('#__finder_tokens', '#__finder_tokens_aggregate') as $tokenTable) {
         $db->truncateTable($tokenTable);
         self::throwOnDatabaseError($db);
     }

     /*
      * Process the item's content. The items can customize their
      * processing instructions to define extra properties to process
      * or rearrange how properties are weighted.
      */
     foreach ($item->getInstructions() as $group => $properties) {
         foreach ($properties as $property) {
             // Skip properties that are missing or empty on the item.
             if (empty($item->{$property})) {
                 continue;
             }

             if (is_array($item->{$property})) {
                 // Tokenize each element of an array property separately.
                 foreach ($item->{$property} as $ip) {
                     if ($group === self::PATH_CONTEXT) {
                         $ip = self::normalizePathToken($ip);
                     }

                     $count += FinderIndexer::tokenizeToDB($ip, $group, $item->language, $format);

                     // Check if we're approaching the memory limit of the token table.
                     // NOTE(review): this reads self::$state rather than the local
                     // $state; presumably getState() populates it - confirm.
                     if ($count > self::$state->options->get('memory_table_limit', 30000)) {
                         FinderIndexer::toggleTables(false);
                     }
                 }
             } else {
                 // For scalar path properties the normalized value is written
                 // back onto the item, as the original implementation did.
                 if ($group === self::PATH_CONTEXT) {
                     $item->{$property} = self::normalizePathToken($item->{$property});
                 }

                 $count += FinderIndexer::tokenizeToDB($item->{$property}, $group, $item->language, $format);

                 // Check if we're approaching the memory limit of the token table.
                 if ($count > self::$state->options->get('memory_table_limit', 30000)) {
                     FinderIndexer::toggleTables(false);
                 }
             }
         }
     }

     /*
      * Process the item's taxonomy. The items can customize their
      * taxonomy mappings to define extra properties to map.
      */
     foreach ($item->getTaxonomy() as $branch => $nodes) {
         foreach ($nodes as $node) {
             // Add the node to the tree.
             $nodeId = FinderIndexerTaxonomy::addNode($branch, $node->title, $node->state, $node->access);

             // Add the link => node map.
             FinderIndexerTaxonomy::addMap($linkId, $nodeId);

             // Tokenize the node title and add it to the database.
             $count += FinderIndexer::tokenizeToDB($node->title, self::META_CONTEXT, $item->language, $format);
         }
     }

     self::markProfiler('afterProcessing');

     /*
      * At this point, all of the item's content has been parsed, tokenized
      * and inserted into the #__finder_tokens table. Now, we need to
      * aggregate all the data in that table into a more usable form. The
      * aggregated data will be inserted into the #__finder_tokens_aggregate
      * table.
      *
      * The statement is a sprintf() template: %F receives the context
      * multiplier and both %d placeholders receive the context id.
      */
     $aggregateSql = 'INSERT INTO ' . $db->quoteName('#__finder_tokens_aggregate')
         . ' (' . $db->quoteName('term_id')
         . ', ' . $db->quoteName('term')
         . ', ' . $db->quoteName('stem')
         . ', ' . $db->quoteName('common')
         . ', ' . $db->quoteName('phrase')
         . ', ' . $db->quoteName('term_weight')
         . ', ' . $db->quoteName('context')
         . ', ' . $db->quoteName('context_weight') . ')'
         . ' SELECT'
         . ' t.term_id, t1.term, t1.stem, t1.common, t1.phrase, t1.weight, t1.context,'
         . ' ROUND( t1.weight * COUNT( t2.term ) * %F, 8 ) AS context_weight'
         . ' FROM ('
         . '   SELECT DISTINCT t1.term, t1.stem, t1.common, t1.phrase, t1.weight, t1.context'
         . '   FROM ' . $db->quoteName('#__finder_tokens') . ' AS t1'
         . '   WHERE t1.context = %d'
         . ' ) AS t1'
         . ' JOIN ' . $db->quoteName('#__finder_tokens') . ' AS t2 ON t2.term = t1.term'
         . ' LEFT JOIN ' . $db->quoteName('#__finder_terms') . ' AS t ON t.term = t1.term'
         . ' WHERE t2.context = %d'
         . ' GROUP BY t1.term'
         . ' ORDER BY t1.term DESC';

     // Iterate through the contexts and aggregate the tokens per context.
     foreach ($state->weights as $context => $multiplier) {
         $db->setQuery(sprintf($aggregateSql, $multiplier, $context, $context));
         $db->query();
         self::throwOnDatabaseError($db);
     }

     self::markProfiler('afterAggregating');

     /*
      * When we pulled down all of the aggregate data, we did a LEFT JOIN
      * over the terms table to try to find all the term ids that
      * already exist for our tokens. If any of the rows in the aggregate
      * table have a term_id of 0, then no term record exists for that
      * term so we need to add it to the terms table.
      *
      * A rewrite of this statement without INSERT IGNORE was attempted
      * earlier and disabled because it caused the indexer to fail; that
      * dead code has been removed from this method.
      */
     //@TODO: PostgreSQL doesn't support INSERT IGNORE INTO
     //@TODO: PostgreSQL doesn't support SOUNDEX out of the box
     $db->setQuery(
         'INSERT IGNORE INTO ' . $db->quoteName('#__finder_terms')
         . ' (' . $db->quoteName('term')
         . ', ' . $db->quoteName('stem')
         . ', ' . $db->quoteName('common')
         . ', ' . $db->quoteName('phrase')
         . ', ' . $db->quoteName('weight')
         . ', ' . $db->quoteName('soundex') . ')'
         . ' SELECT ta.term, ta.stem, ta.common, ta.phrase, ta.term_weight, SOUNDEX(ta.term)'
         . ' FROM ' . $db->quoteName('#__finder_tokens_aggregate') . ' AS ta'
         . ' WHERE ta.term_id = 0'
         . ' GROUP BY ta.term'
     );
     $db->query();
     self::throwOnDatabaseError($db);

     /*
      * Now, we just inserted a bunch of new records into the terms table
      * so we need to go back and update the aggregate table with all the
      * new term ids.
      */
     $query = $db->getQuery(true);
     $query->update($db->quoteName('#__finder_tokens_aggregate') . ' AS ta');
     $query->join('INNER', $db->quoteName('#__finder_terms') . ' AS t ON t.term = ta.term');
     $query->set('ta.term_id = t.term_id');
     $query->where('ta.term_id = 0');
     $db->setQuery($query);
     $db->query();
     self::throwOnDatabaseError($db);

     self::markProfiler('afterTerms');

     /*
      * After we've made sure that all of the terms are in the terms table
      * and the aggregate table has the correct term ids, we need to update
      * the links counter for each term by one.
      */
     $query->clear();
     $query->update($db->quoteName('#__finder_terms') . ' AS t');
     $query->join('INNER', $db->quoteName('#__finder_tokens_aggregate') . ' AS ta ON ta.term_id = t.term_id');
     $query->set('t.' . $db->quoteName('links') . ' = t.links + 1');
     $db->setQuery($query);
     $db->query();
     self::throwOnDatabaseError($db);

     // NOTE(review): this repeats the 'afterTerms' label used above, which looks
     // like a copy/paste leftover. It is kept unchanged so existing profiler
     // output stays the same.
     self::markProfiler('afterTerms');

     /*
      * Before we can insert all of the mapping rows, we have to figure out
      * which mapping table the rows need to be inserted into. The mapping
      * table for each term is based on the first character of the md5 of
      * the first character of the term. In php, it would be expressed as
      * substr(md5(substr($token, 0, 1)), 0, 1)
      */
     $query->clear();
     $query->update($db->quoteName('#__finder_tokens_aggregate'));
     $query->set($db->quoteName('map_suffix') . ' = SUBSTR(MD5(SUBSTR(' . $db->quoteName('term') . ', 1, 1)), 1, 1)');
     $db->setQuery($query);
     $db->query();
     self::throwOnDatabaseError($db);

     /*
      * At this point, the aggregate table contains a record for each
      * term in each context. So, we're going to pull down all of that
      * data while grouping the records by term and add all of the
      * sub-totals together to arrive at the final total for each token for
      * this link. Then, we insert all of that data into the appropriate
      * mapping table. The query runs 16 times, once per link => term
      * mapping table.
      */
     for ($i = 0; $i <= 15; $i++) {
         // Get the mapping table suffix.
         $suffix = dechex($i);

         //@TODO: Convert to JDatabaseQuery
         $db->setQuery(
             'INSERT INTO ' . $db->quoteName('#__finder_links_terms' . $suffix)
             . ' (' . $db->quoteName('link_id')
             . ', ' . $db->quoteName('term_id')
             . ', ' . $db->quoteName('weight') . ')'
             . ' SELECT ' . (int) $linkId . ', ' . $db->quoteName('term_id') . ','
             . ' ROUND(SUM(' . $db->quoteName('context_weight') . '), 8)'
             . ' FROM ' . $db->quoteName('#__finder_tokens_aggregate')
             . ' WHERE ' . $db->quoteName('map_suffix') . ' = ' . $db->quote($suffix)
             . ' GROUP BY ' . $db->quoteName('term')
             . ' ORDER BY ' . $db->quoteName('term') . ' DESC'
         );
         $db->query();
         self::throwOnDatabaseError($db);
     }

     self::markProfiler('afterMapping');

     // Store the new signature on the link; the id is cast to int like every
     // other link_id predicate in this method.
     $query->clear();
     $query->update($db->quoteName('#__finder_links'));
     $query->set($db->quoteName('md5sum') . ' = ' . $db->quote($curSig));
     $query->where($db->quoteName('link_id') . ' = ' . (int) $linkId);
     $db->setQuery($query);
     $db->query();
     self::throwOnDatabaseError($db);

     self::markProfiler('afterSigning');

     // Leave the token tables empty for the next item.
     foreach (array('#__finder_tokens', '#__finder_tokens_aggregate') as $tokenTable) {
         $db->truncateTable($tokenTable);
         self::throwOnDatabaseError($db);
     }

     // Toggle the token tables back to memory tables.
     FinderIndexer::toggleTables(true);

     self::markProfiler('afterTruncating');

     return $linkId;
 }

 /**
  * Records a mark on the profiler when one is attached; does nothing otherwise.
  *
  * @param   string  $label  The label of the profiler mark.
  *
  * @return  void
  */
 private static function markProfiler($label)
 {
     if (self::$profiler) {
         self::$profiler->mark($label);
     }
 }

 /**
  * Throws if the database driver reports an error for the last operation.
  *
  * @param   object  $db  The database driver returned by JFactory::getDBO().
  *
  * @return  void
  *
  * @throws  Exception with code 500 and the driver's error message.
  */
 private static function throwOnDatabaseError($db)
 {
     if ($db->getErrorNum()) {
         throw new Exception($db->getErrorMsg(), 500);
     }
 }

 /**
  * Prepares a path value for tokenizing: strips the file extension and
  * converts slashes and dashes to spaces.
  *
  * @param   string  $path  The raw path value.
  *
  * @return  string  The path with separators replaced by spaces.
  */
 private static function normalizePathToken($path)
 {
     return str_replace(array('/', '-'), ' ', JFile::stripExt($path));
 }