/**
 * Convert a text into HTML with project-aware links.
 *
 * The text goes through a fixed pipeline: optional escaping with
 * Pluf_esc(), optional linking of bare URLs, then linking of issue,
 * review, commit and source references through the matching
 * callback*() methods (each family only when the request carries the
 * corresponding right), and finally optional wrapping through
 * Pluf_Text::wrapHtml() and nl2br().
 *
 * As a side effect, the project, the request and the SCM backend
 * returned by IDF_Scm::get() are stored on the object.
 *
 * @param string Text to convert.
 * @param object Current request; its project and rights are read.
 * @param bool Print the result (true) or return it (false).
 * @param bool Wrap the result with Pluf_Text::wrapHtml() (true).
 * @param bool Escape the text with Pluf_esc() first (true).
 * @param bool Turn bare URLs into anchors (true).
 * @param bool Run nl2br() on the result (false).
 * @return string|null Converted text when $echo is false, nothing otherwise.
 */
function start($text, $request, $echo = true, $wordwrap = true, $esc = true, $autolink = true, $nl2br = false)
{
    $this->project = $request->project;
    $this->request = $request;
    $this->scm = IDF_Scm::get($request->project);

    if ($esc) {
        $text = Pluf_esc($text);
    }

    if ($autolink) {
        // Any "scheme://..." run is linked, up to the first whitespace
        // or parenthesis.
        // NOTE(review): the scheme is any run of letters, not only
        // http(s) -- confirm this is acceptable for untrusted text.
        $urlPattern = '#([a-z]+://[^\\s\\(\\)]+)#i';
        $urlAnchor = '<a href="\\1">\\1</a>';
        $text = preg_replace($urlPattern, $urlAnchor, $text);
    }

    if ($request->rights['hasIssuesAccess']) {
        // "issue 12", "bugs 1, 2 and 3", " #12", with optional "#icN" suffix.
        $issuePattern = '#((?:issue|bug|ticket)(s)?\\s+|\\s+\\#)(\\d+)(\\#ic\\d+)?(?(2)((?:[, \\w]+(?:\\s+\\#)?)?\\d+(?:\\#ic\\d+)?){0,})#im';
        $issueHandler = array($this, 'callbackIssues');
        $text = preg_replace_callback($issuePattern, $issueHandler, $text);
    }

    if ($request->rights['hasReviewAccess']) {
        // "review 3", "reviews 3, 4 and 5".
        $reviewPattern = '#(reviews?\\s+)(\\d+(?:(?:\\s+and|\\s+or|,)\\s+\\d+)*)\\b#i';
        $reviewHandler = array($this, 'callbackReviews');
        $text = preg_replace_callback($reviewPattern, $reviewHandler, $text);
    }

    if ($request->rights['hasSourceAccess']) {
        // "commit abc123", "commits abc123, def456 or 789abc".
        $commitPattern = '#(commits?\\s+)([0-9a-f]{1,40}(?:(?:\\s+and|\\s+or|,)\\s+[0-9a-f]{1,40})*)\\b#i';
        $commitHandler = array($this, 'callbackCommits');
        $text = preg_replace_callback($commitPattern, $commitHandler, $text);

        // "src:path/to/file"; a backslash followed by whitespace continues the path.
        $sourcePattern = '#(src:)([^\\s\\(\\)\\\\]+(?:(\\\\)\\s+[^\\s\\(\\)\\\\]+){0,})+#im';
        $sourceHandler = array($this, 'callbackSource');
        $text = preg_replace_callback($sourcePattern, $sourceHandler, $text);
    }

    if ($wordwrap) {
        $text = Pluf_Text::wrapHtml($text, 69, "\n");
    }

    if ($nl2br) {
        $text = nl2br($text);
    }

    if (!$echo) {
        return $text;
    }
    echo $text;
}
/**
 * Build the string handed to the search indexer for this object.
 *
 * The summary is repeated four times ahead of the text returned by
 * the _toIndex() method of every item of get_comments_list(); the
 * result is entity-decoded and passed through Pluf_Text::cleanString().
 *
 * @return string Text to index.
 */
function _toIndex()
{
    $commentTexts = array();
    foreach ($this->get_comments_list() as $comment) {
        $commentTexts[] = $comment->_toIndex();
    }

    $repeatedSummary = str_repeat($this->summary . ' ', 4);
    $raw = $repeatedSummary . ' ' . implode(' ', $commentTexts);
    $decoded = html_entity_decode($raw, ENT_QUOTES, 'UTF-8');

    return Pluf_Text::cleanString($decoded);
}
/**
 * Convert a text into HTML with project-aware links.
 *
 * The text goes through a fixed pipeline: optional escaping with
 * Pluf_esc(), optional linking of bare URLs, then linking of issue,
 * review, commit/revision and source references through the matching
 * callback*() methods (each family only when the request carries the
 * corresponding right), and finally optional wrapping through
 * Pluf_Text::wrapHtml() and nl2br().
 *
 * As a side effect, the project, the request and the SCM backend
 * returned by IDF_Scm::get() are stored on the object.
 *
 * @param string Text to convert.
 * @param object Current request; its project and rights are read.
 * @param bool Print the result (true) or return it (false).
 * @param bool Wrap the result with Pluf_Text::wrapHtml() (true).
 * @param bool Escape the text with Pluf_esc() first (true).
 * @param bool Turn bare URLs into anchors (true).
 * @param bool Run nl2br() on the result (false).
 * @return string|null Converted text when $echo is false, nothing otherwise.
 */
function start($text, $request, $echo = true, $wordwrap = true, $esc = true, $autolink = true, $nl2br = false)
{
    $this->project = $request->project;
    $this->request = $request;
    $this->scm = IDF_Scm::get($request->project);

    if ($esc) {
        $text = Pluf_esc($text);
    }

    if ($autolink) {
        // Any "scheme://..." run is linked, up to the first whitespace
        // or parenthesis.
        // NOTE(review): the scheme is any run of letters, not only
        // http(s) -- confirm this is acceptable for untrusted text.
        $urlPattern = '#([a-z]+://[^\\s\\(\\)]+)#i';
        $urlAnchor = '<a href="\\1">\\1</a>';
        $text = preg_replace($urlPattern, $urlAnchor, $text);
    }

    if ($request->rights['hasIssuesAccess']) {
        // "issue 12", "bugs 1, 2 and 3", " #12", with optional "#icN" suffix.
        $issuePattern = '#((?:issue|bug|ticket)(s)?\\s+|\\s+\\#)(\\d+)(\\#ic\\d+)?(?(2)((?:[, \\w]+(?:\\s+\\#)?)?\\d+(?:\\#ic\\d+)?){0,})#im';
        $issueHandler = array($this, 'callbackIssues');
        $text = preg_replace_callback($issuePattern, $issueHandler, $text);
    }

    if ($request->rights['hasReviewAccess']) {
        // "review 3", "reviews 3, 4 and 5".
        $reviewPattern = '#(reviews?\\s+)(\\d+(?:(?:\\s+and|\\s+or|,)\\s+\\d+)*)\\b#i';
        $reviewHandler = array($this, 'callbackReviews');
        $text = preg_replace_callback($reviewPattern, $reviewHandler, $text);
    }

    if ($request->rights['hasSourceAccess']) {
        // Phrases introducing a commit id: "<verb> in" or a plain noun.
        $verbs = array('added', 'fixed', 'reverted', 'changed', 'removed');
        $nouns = array('commit', 'commits', 'revision', 'revisions', 'rev', 'revs');
        $alternatives = array();
        foreach ($verbs as $verb) {
            $alternatives[] = $verb . ' in';
        }
        foreach ($nouns as $noun) {
            $alternatives[] = $noun;
        }
        $prefix = implode('|', $alternatives);

        // Ids are 1 to 40 hex digits, optionally prefixed with "r", and
        // may be chained with "and", "or" or a comma.
        $commitPattern = '#((?:' . $prefix . ')(?:\\s+r?))([0-9a-f]{1,40}((?:\\s+and|\\s+or|,)\\s+r?[0-9a-f]{1,40})*)\\b#i';
        $commitHandler = array($this, 'callbackCommits');
        $text = preg_replace_callback($commitPattern, $commitHandler, $text);

        // "src:path@revision" with an optional "#<digits>" suffix.
        // NOTE(review): the "@revision" group is not optional in this
        // pattern, so a bare "src:path" is left untouched -- confirm
        // that this is intended.
        $sourcePattern = '=(src:)([^\\s@#,\\(\\)\\\\]+(?:(\\\\)[\\s@#][^\\s@#,\\(\\)\\\\]+){0,})+(?:\\@([^\\s#,]+))(?:#(\\d+))?=im';
        $sourceHandler = array($this, 'callbackSource');
        $text = preg_replace_callback($sourcePattern, $sourceHandler, $text);
    }

    if ($wordwrap) {
        $text = Pluf_Text::wrapHtml($text, 69, "\n");
    }

    if ($nl2br) {
        $text = nl2br($text);
    }

    if (!$echo) {
        return $text;
    }
    echo $text;
}
/**
 * Build the string handed to the search indexer for this object.
 *
 * The summary is repeated four times ahead of the full message; the
 * result is entity-decoded and passed through Pluf_Text::cleanString().
 *
 * @return string Text to index.
 */
function _toIndex()
{
    $repeatedSummary = str_repeat($this->summary . ' ', 4);
    $raw = $repeatedSummary . ' ' . $this->fullmessage;
    $decoded = html_entity_decode($raw, ENT_QUOTES, 'UTF-8');

    return Pluf_Text::cleanString($decoded);
}
/**
 * Index a document.
 *
 * See Pluf_Search for the disclaimer and further information.
 *
 * The previous occurrences stored for the document are deleted, one
 * IDF_Search_Occ row is created per distinct word id (tagged with the
 * document's project), and the Pluf_Search_Stats entry of the
 * document is created or incremented.
 *
 * @param Pluf_Model Document to index.
 * @param Stemmer used. ('Pluf_Text_Stemmer_Porter')
 * @return array Statistics ('total', 'new' and 'unique' keys).
 */
public static function index($doc, $stemmer = 'Pluf_Text_Stemmer_Porter')
{
    $words = Pluf_Text::tokenize($doc->_toIndex());
    if ($stemmer != null) {
        $words = self::stem($words, $stemmer);
    }

    // Flat list of the words, plus the overall number of occurrences.
    $tokens = array_keys($words);
    $total = 0.0;
    foreach ($words as $count) {
        $total += (double) $count;
    }

    // Remove what the previous indexation stored for this document.
    $occTable = new IDF_Search_Occ();
    $delete = new Pluf_SQL(
        'DELETE FROM ' . $occTable->getSqlTable() . ' WHERE model_class=%s AND model_id=%s',
        array($doc->_model, $doc->id)
    );
    $db =& Pluf::db();
    $db->execute($delete->gen());

    // One id per word; null marks a word not stored yet.
    $ids = self::getWordIds($tokens);
    $n = count($ids);
    $created = 0;
    $seen = array();

    for ($pos = 0; $pos < $n; $pos++) {
        if ($ids[$pos] === null) {
            $word = new Pluf_Search_Word();
            $word->word = $tokens[$pos];
            try {
                $word->create();
                $created++;
                $ids[$pos] = $word->id;
            } catch (Exception $e) {
                // Creation failed: the word was presumably stored by
                // a concurrent process in the meantime, so look it up
                // once more.
                $retry = self::getWordIds(array($word->word));
                if (!$retry[0]) {
                    // Still unknown, give up on this word.
                    continue;
                }
                $ids[$pos] = $retry[0];
            }
        }

        // Only one occurrence row per word id.
        if (isset($seen[$ids[$pos]])) {
            continue;
        }
        $seen[$ids[$pos]] = true;

        $frequency = $words[$tokens[$pos]];
        $row = new IDF_Search_Occ();
        $row->word = new Pluf_Search_Word($ids[$pos]);
        $row->model_class = $doc->_model;
        $row->model_id = $doc->id;
        $row->project = $doc->get_project();
        $row->occ = $frequency;
        $row->pondocc = $frequency / $total;
        $row->create();
    }

    // Create or bump the indexation counter of the document.
    $filter = new Pluf_SQL('model_class=%s AND model_id=%s', array($doc->_model, $doc->id));
    $last_index = Pluf::factory('Pluf_Search_Stats')->getList(array('filter' => $filter->gen()));
    if ($last_index->count() != 0) {
        $last_index[0]->indexations += 1;
        $last_index[0]->update();
    } else {
        $stats = new Pluf_Search_Stats();
        $stats->model_class = $doc->_model;
        $stats->model_id = $doc->id;
        $stats->indexations = 1;
        $stats->create();
    }

    return array('total' => $total, 'new' => $created, 'unique' => $n);
}
/**
 * Index a document.
 *
 * The document must provide a method _toIndex() returning the
 * document as a string for indexation. The string must be clean
 * and will simply be tokenized by Pluf_Text::tokenize().
 *
 * So a recommended way to clean it at the end is to remove all
 * the HTML tags and then run the following on it:
 *
 * return Pluf_Text::cleanString(html_entity_decode($string,
 *                               ENT_QUOTES, 'UTF-8'));
 *
 * Indexing is resource intensive so it is recommended to run the
 * indexing in an asynchronous way. When you save a resource to be
 * indexed, just write a log "need to index resource x" and then
 * you can index the resources every few minutes. Nobody cares if
 * your index is not perfectly fresh, but your end users care if
 * it takes 0.6s to get back the page instead of 0.1s.
 *
 * Take 500 average documents, index them while counting the total
 * time it takes to index. Divide by 500 and if the result is more
 * than 0.1s, use a log/queue.
 *
 * FIXME: Concurrency problem if you index the same doc at the same time.
 *
 * @param Pluf_Model Document to index.
 * @param Stemmer used. ('Pluf_Text_Stemmer_Porter')
 * @return array Statistics: 'total' is the number of word
 *               occurrences, 'new' the number of words actually
 *               created, 'unique' the number of distinct words.
 */
public static function index($doc, $stemmer = 'Pluf_Text_Stemmer_Porter')
{
    $words = Pluf_Text::tokenize($doc->_toIndex());
    if ($stemmer != null) {
        $words = self::stem($words, $stemmer);
    }
    // Get the total number of words.
    $total = 0.0;
    $words_flat = array();
    foreach ($words as $word => $occ) {
        $total += (double) $occ;
        $words_flat[] = $word;
    }
    // Drop the last indexation.
    $gocc = new Pluf_Search_Occ();
    $sql = new Pluf_SQL('DELETE FROM ' . $gocc->getSqlTable() . ' WHERE model_class=%s AND model_id=%s', array($doc->_model, $doc->id));
    $db =& Pluf::db();
    $db->execute($sql->gen());
    // Get the ids for each word.
    $ids = self::getWordIds($words_flat);
    // Insert a new word for the missing words and add the occ.
    $n = count($ids);
    $new_words = 0;
    $done = array();
    for ($i = 0; $i < $n; $i++) {
        if ($ids[$i] === null) {
            $word = new Pluf_Search_Word();
            $word->word = $words_flat[$i];
            try {
                $word->create();
                $ids[$i] = $word->id;
                // Count the word only once it has really been created.
                $new_words++;
            } catch (Exception $e) {
                // Most likely concurrent addition of a word, try
                // to read it.
                $_ids = self::getWordIds(array($words_flat[$i]));
                if ($_ids[0] === null) {
                    // Still no id: skip the word, otherwise null
                    // would be used as a key of $done and an
                    // occurrence would be bound to a word without id.
                    continue;
                }
                $ids[$i] = $_ids[0];
            }
        }
        // Only one occurrence row per word id.
        if (isset($done[$ids[$i]])) {
            continue;
        }
        $done[$ids[$i]] = true;
        $occ = new Pluf_Search_Occ();
        $occ->word = new Pluf_Search_Word($ids[$i]);
        $occ->model_class = $doc->_model;
        $occ->model_id = $doc->id;
        $occ->occ = $words[$words_flat[$i]];
        $occ->pondocc = $words[$words_flat[$i]] / $total;
        $occ->create();
    }
    // Update the stats.
    $sql = new Pluf_SQL('model_class=%s AND model_id=%s', array($doc->_model, $doc->id));
    $last_index = Pluf::factory('Pluf_Search_Stats')->getList(array('filter' => $sql->gen()));
    if ($last_index->count() == 0) {
        $stats = new Pluf_Search_Stats();
        $stats->model_class = $doc->_model;
        $stats->model_id = $doc->id;
        $stats->indexations = 1;
        $stats->create();
    } else {
        $last_index[0]->indexations += 1;
        $last_index[0]->update();
    }
    return array('total' => $total, 'new' => $new_words, 'unique' => $n);
}
/**
 * Returns the n-grams of rank n of the word.
 *
 * The characters of the word are framed by one '_' on each side and
 * every run of $n consecutive characters of that padded sequence is
 * returned, in order. The string '__' (padding only) is never
 * returned. A rank lower than 1 yields an empty array.
 *
 * @param string Word.
 * @param int Rank of the n-grams (3).
 * @return array N-grams
 */
public static function makeNgrams($word, $n = 3)
{
    if ($n < 1) {
        return array();
    }
    // array_merge() and not the + operator: the union keeps the left
    // hand value when integer keys collide, so the leading '_' at
    // key 0 would replace the first character of a 0-indexed list.
    $chars = array_merge(array('_'), Pluf_Text::stringToChars($word), array('_'));
    $l = count($chars);
    $out = array();
    // Slide a window of $n characters over the padded sequence.
    for ($i = 0; $i + $n <= $l; $i++) {
        $t = implode('', array_slice($chars, $i, $n));
        if ($t !== '__') {
            $out[] = $t;
        }
    }
    return $out;
}
/**
 * Build the string handed to the search indexer for this object.
 *
 * The title and the summary are repeated four times ahead of the
 * text returned by the _toIndex() method of the current revision;
 * the result is entity-decoded and passed through
 * Pluf_Text::cleanString().
 *
 * @return string Text to index.
 */
function _toIndex()
{
    $revisionText = $this->get_current_revision()->_toIndex();
    $heading = $this->title . ' ' . $this->summary . ' ';
    $raw = str_repeat($heading, 4) . ' ' . $revisionText;
    $decoded = html_entity_decode($raw, ENT_QUOTES, 'UTF-8');

    return Pluf_Text::cleanString($decoded);
}