/**
 * Build the search index entries for one document.
 *
 * The text returned by $doc->_toIndex() is tokenized, optionally
 * stemmed, and stored as one IDF_Search_Occ row per distinct word id.
 * Rows already stored for the same model_class/model_id pair are
 * deleted first, and the matching Pluf_Search_Stats record is created
 * or has its "indexations" counter incremented.
 *
 * See Pluf_Search for the disclaimer and further information.
 *
 * @param Pluf_Model Document to index. It must expose _toIndex(),
 *                   get_project(), _model and id.
 * @param string Stemmer passed to self::stem(); a value loosely equal
 *               to null skips stemming. ('Pluf_Text_Stemmer_Porter')
 * @return array Statistics: 'total' (sum of all word counts, float),
 *               'new' (words created by this call) and 'unique'
 *               (number of ids returned by getWordIds()).
 */
public static function index($doc, $stemmer = 'Pluf_Text_Stemmer_Porter')
{
    $freq = Pluf_Text::tokenize($doc->_toIndex());
    if ($stemmer != null) {
        $freq = self::stem($freq, $stemmer);
    }

    // Flatten the word => count map into an ordered word list and
    // accumulate the overall number of words.
    $terms = array();
    $total = 0.0;
    foreach ($freq as $term => $count) {
        $terms[] = $term;
        $total += (float) $count;
    }

    // Remove whatever was indexed for this document previously.
    $occModel = new IDF_Search_Occ();
    $purge = new Pluf_SQL(
        'DELETE FROM ' . $occModel->getSqlTable() . ' WHERE model_class=%s AND model_id=%s',
        array($doc->_model, $doc->id)
    );
    Pluf::db()->execute($purge->gen());

    // Resolve the ids of the words. The loop below relies on the
    // result being positionally aligned with $terms, with null for
    // words that are not stored yet.
    $ids = self::getWordIds($terms);
    $unique = count($ids);
    $created = 0;
    $seen = array();

    for ($pos = 0; $pos < $unique; $pos++) {
        $wordId = $ids[$pos];

        if ($wordId === null) {
            $fresh = new Pluf_Search_Word();
            $fresh->word = $terms[$pos];
            try {
                $fresh->create();
                $created++;
                $wordId = $fresh->id;
            } catch (Exception $e) {
                // The usual cause is another process having inserted
                // the same word in the meantime: read it back.
                $found = self::getWordIds(array($fresh->word));
                if (!$found[0]) {
                    // Still not available, skip this word.
                    continue;
                }
                $wordId = $found[0];
            }
        }

        // Only one occurrence row per word id.
        if (isset($seen[$wordId])) {
            continue;
        }
        $seen[$wordId] = true;

        $entry = new IDF_Search_Occ();
        $entry->word = new Pluf_Search_Word($wordId);
        $entry->model_class = $doc->_model;
        $entry->model_id = $doc->id;
        $entry->project = $doc->get_project();
        $weight = $freq[$terms[$pos]];
        $entry->occ = $weight;
        // Weighted occurrence: share of this word in the whole text.
        $entry->pondocc = $weight / $total;
        $entry->create();
    }

    // Keep track of how many times this document has been indexed.
    $where = new Pluf_SQL('model_class=%s AND model_id=%s', array($doc->_model, $doc->id));
    $previous = Pluf::factory('Pluf_Search_Stats')->getList(array('filter' => $where->gen()));
    if ($previous->count() != 0) {
        $previous[0]->indexations += 1;
        $previous[0]->update();
    } else {
        $record = new Pluf_Search_Stats();
        $record->model_class = $doc->_model;
        $record->model_id = $doc->id;
        $record->indexations = 1;
        $record->create();
    }

    return array('total' => $total, 'new' => $created, 'unique' => $unique);
}
/**
 * Index a document.
 *
 * The document must provide a method _toIndex() returning the
 * document as a string for indexation. The string must be clean
 * and will simply be tokenized by Pluf_Text::tokenize().
 *
 * So a recommended way to clean it at the end is to remove all
 * the HTML tags and then run the following on it:
 *
 * return Pluf_Text::cleanString(html_entity_decode($string,
 *                               ENT_QUOTES, 'UTF-8'));
 *
 * Indexing is resource intensive so it is recommended to run the
 * indexing in an asynchronous way. When you save a resource to be
 * indexed, just write a log "need to index resource x" and then
 * you can index the resources every few minutes. Nobody cares if
 * your index is not perfectly fresh, but your end users care if
 * it takes 0.6s to get back the page instead of 0.1s.
 *
 * Take 500 average documents, index them while counting the total
 * time it takes to index. Divide by 500 and if the result is more
 * than 0.1s, use a log/queue.
 *
 * FIXME: Concurrency problem if you index the same doc at the same
 * time (the delete and the inserts are not done atomically).
 *
 * @param Pluf_Model Document to index. It must expose _toIndex(),
 *                   _model and id.
 * @param string Stemmer passed to self::stem(); a value loosely equal
 *               to null skips stemming. ('Pluf_Text_Stemmer_Porter')
 * @return array Statistics: 'total' (sum of all word counts, float),
 *               'new' (words actually created by this call) and
 *               'unique' (number of ids returned by getWordIds()).
 */
public static function index($doc, $stemmer = 'Pluf_Text_Stemmer_Porter')
{
    $words = Pluf_Text::tokenize($doc->_toIndex());
    if ($stemmer != null) {
        $words = self::stem($words, $stemmer);
    }
    // Get the total number of words.
    $total = 0.0;
    $words_flat = array();
    foreach ($words as $word => $occ) {
        $total += (float) $occ;
        $words_flat[] = $word;
    }
    // Drop the last indexation.
    $gocc = new Pluf_Search_Occ();
    $sql = new Pluf_SQL(
        'DELETE FROM ' . $gocc->getSqlTable() . ' WHERE model_class=%s AND model_id=%s',
        array($doc->_model, $doc->id)
    );
    $db =& Pluf::db();
    $db->execute($sql->gen());
    // Get the ids for each word. The loop below relies on the result
    // being positionally aligned with $words_flat, with null for the
    // words which are not stored yet.
    $ids = self::getWordIds($words_flat);
    // Insert a new word for the missing words and add the occ.
    $n = count($ids);
    $new_words = 0;
    $done = array();
    for ($i = 0; $i < $n; $i++) {
        if ($ids[$i] === null) {
            $word = new Pluf_Search_Word();
            $word->word = $words_flat[$i];
            try {
                $word->create();
                // Count the word only once it has really been
                // created by this call.
                $new_words++;
                $ids[$i] = $word->id;
            } catch (Exception $e) {
                // Most likely a concurrent addition of the word by
                // another process, try to read it back.
                $_ids = self::getWordIds(array($words_flat[$i]));
                if (!isset($_ids[0])) {
                    // Still missing: forget about this word instead
                    // of storing an occurrence bound to a null word.
                    continue;
                }
                $ids[$i] = $_ids[0];
            }
        }
        // Only one occurrence row per word id.
        if (isset($done[$ids[$i]])) {
            continue;
        }
        $done[$ids[$i]] = true;
        $occ = new Pluf_Search_Occ();
        $occ->word = new Pluf_Search_Word($ids[$i]);
        $occ->model_class = $doc->_model;
        $occ->model_id = $doc->id;
        $occ->occ = $words[$words_flat[$i]];
        // Weighted occurrence: share of this word in the whole text.
        $occ->pondocc = $words[$words_flat[$i]] / $total;
        $occ->create();
    }
    // Update the stats: one record per document, counting how many
    // times it has been indexed.
    $sql = new Pluf_SQL('model_class=%s AND model_id=%s', array($doc->_model, $doc->id));
    $last_index = Pluf::factory('Pluf_Search_Stats')->getList(array('filter' => $sql->gen()));
    if ($last_index->count() == 0) {
        $stats = new Pluf_Search_Stats();
        $stats->model_class = $doc->_model;
        $stats->model_id = $doc->id;
        $stats->indexations = 1;
        $stats->create();
    } else {
        $last_index[0]->indexations += 1;
        $last_index[0]->update();
    }
    return array('total' => $total, 'new' => $new_words, 'unique' => $n);
}