コード例 #1
0
ファイル: Phrase.php プロジェクト: hackingman/TubeX
 /**
  * Build the search query that corresponds to this phrase entry.
  *
  * The phrase is tokenized with the default analyzer. No tokens yield an
  * "insignificant" query, a single token yields a plain term query, and
  * several tokens yield a phrase query with each term at its analyzer position.
  *
  * @param string $encoding Encoding of the phrase text, handed to the analyzer.
  * @return Zend_Search_Lucene_Search_Query
  * @throws Zend_Search_Lucene_Search_QueryParserException If the phrase contains '?' or '*'.
  */
 public function getQuery($encoding)
 {
     // Wildcards are rejected before any analysis takes place.
     $hasQuestionMark = strpos($this->_phrase, '?') !== false;
     $hasAsterisk     = strpos($this->_phrase, '*') !== false;
     if ($hasQuestionMark || $hasAsterisk) {
         require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
         throw new Zend_Search_Lucene_Search_QueryParserException('Wildcards are only allowed in a single terms.');
     }

     $tokens     = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_phrase, $encoding);
     $tokenCount = count($tokens);

     if ($tokenCount == 0) {
         // Nothing survived analysis.
         return new Zend_Search_Lucene_Search_Query_Insignificant();
     }

     if ($tokenCount == 1) {
         // A one-word phrase is just a term query.
         $singleTerm = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
         $query      = new Zend_Search_Lucene_Search_Query_Term($singleTerm);
     } else {
         // Real phrase: positions accumulate from the tokens' position increments.
         $query        = new Zend_Search_Lucene_Search_Query_Phrase();
         $termPosition = -1;
         foreach ($tokens as $token) {
             $termPosition += $token->getPositionIncrement();
             $query->addTerm(
                 new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field),
                 $termPosition
             );
         }
         if ($this->_proximityQuery) {
             $query->setSlop($this->_wordsDistance);
         }
     }

     $query->setBoost($this->_boost);
     return $query;
 }
コード例 #2
0
 /**
  * Index one document into this segment.
  *
  * Every field is registered through addField(). Indexed fields are tokenized
  * (or taken as one single token) and the position of each term is recorded
  * under the current document number. Afterwards the per-field norm strings
  * are extended by one byte and the stored fields are passed to
  * addStoredFields().
  *
  * @param Zend_Search_Lucene_Document $document Document to add.
  * @throws Zend_Search_Lucene_Exception If a field requests term vector storage.
  */
 public function addDocument(Zend_Search_Lucene_Document $document)
 {
     $similarity    = Zend_Search_Lucene_Search_Similarity::getDefault();
     $fieldsToStore = array();
     $normByField   = array();

     foreach ($document->getFieldNames() as $name) {
         $field = $document->getField($name);
         $this->addField($field);

         if ($field->storeTermVector) {
             /**
              * @todo term vector storing support
              */
             throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
         }

         if ($field->isIndexed) {
             if ($field->isTokenized) {
                 $fieldTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
             } else {
                 // Untokenized fields are indexed as one token covering the whole value.
                 $fieldTokens = array(
                     new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue))
                 );
             }

             // One encoded norm byte for this document, derived from the token count.
             $normByField[$field->name] = chr(
                 $similarity->encodeNorm($similarity->lengthNorm($field->name, count($fieldTokens)))
             );

             $termPosition = 0;
             foreach ($fieldTokens as $token) {
                 $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                 $key  = $term->key();

                 if (!isset($this->_termDictionary[$key])) {
                     // Term seen for the first time in this segment.
                     $this->_termDictionary[$key] = $term;
                     $this->_termDocs[$key]       = array();
                 }
                 if (!isset($this->_termDocs[$key][$this->_docCount])) {
                     // First occurrence of the term in the current document.
                     $this->_termDocs[$key][$this->_docCount] = array();
                 }

                 $termPosition += $token->getPositionIncrement();
                 $this->_termDocs[$key][$this->_docCount][] = $termPosition;
             }
         }

         if ($field->isStored) {
             $fieldsToStore[] = $field;
         }
     }

     // Append one norm byte per indexed field known to the segment.
     foreach ($this->_fields as $name => $fieldInfo) {
         if (!$fieldInfo->isIndexed) {
             continue;
         }
         if (!isset($this->_norms[$name])) {
             // Field is new to the segment: pad with the zero-length norm for earlier documents.
             $this->_norms[$name] = str_repeat(
                 chr($similarity->encodeNorm($similarity->lengthNorm($name, 0))),
                 $this->_docCount
             );
         }
         $this->_norms[$name] .= isset($normByField[$name])
             ? $normByField[$name]
             : chr($similarity->encodeNorm($similarity->lengthNorm($name, 0)));
     }

     $this->addStoredFields($fieldsToStore);
 }
コード例 #3
0
 /**
  * Checks that the default analyzer can be read and replaced.
  *
  * The default analyzer is global state that other tests rely on, so the
  * original instance is restored even when one of the assertions fails.
  */
 public function testAnalyzer()
 {
     $currentAnalyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();

     try {
         $this->assertTrue($currentAnalyzer instanceof Zend_Search_Lucene_Analysis_Analyzer);

         $newAnalyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();
         Zend_Search_Lucene_Analysis_Analyzer::setDefault($newAnalyzer);

         // The very same instance must come back from getDefault().
         $this->assertSame($newAnalyzer, Zend_Search_Lucene_Analysis_Analyzer::getDefault());
     } catch (Exception $e) {
         // Put the original default back before reporting the failure.
         Zend_Search_Lucene_Analysis_Analyzer::setDefault($currentAnalyzer);
         throw $e;
     }

     // Set analyzer to the default value (used in other tests)
     Zend_Search_Lucene_Analysis_Analyzer::setDefault($currentAnalyzer);
 }
コード例 #4
0
 /**
  * Extend the parent's analysis result with the UTF-8 tokens of the text.
  *
  * Registers Zend, installs a Zend_Search_Lucene UTF-8 analyzer as the global
  * default analyzer and appends the term text of every token it produces.
  *
  * @param string $text Text to analyze; handed to the analyzer as UTF-8.
  * @return array Entries from parent::analyze() followed by the token texts
  *               (assumes the parent returns an array - TODO confirm).
  */
 public function analyze($text)
 {
     $terms = parent::analyze($text);

     sfOpenPNEApplicationConfiguration::registerZend();
     Zend_Search_Lucene_Analysis_Analyzer::setDefault(new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8());

     $tokenizer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
     $tokenizer->setInput($text, 'UTF-8');

     // nextToken() signals the end of the stream with null.
     for ($token = $tokenizer->nextToken(); $token !== null; $token = $tokenizer->nextToken()) {
         $terms[] = $token->getTermText();
     }

     return $terms;
 }
  /**
   * Configure the global Zend_Search_Lucene defaults.
   *
   * Installs a case-insensitive UTF-8 (letters and digits) analyzer as the
   * default analyzer, attaches a stop-words filter and a short-words filter
   * configured through sfConfig, and sets the default file permissions for
   * filesystem index storage.
   *
   * The stop-word setting may be a comma separated string or an array.
   * Entries are trimmed and empty entries are dropped, so "and, the" works
   * the same as "and,the".
   */
  private static function prepareZendSearchLucene()
  {
    $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive();
    Zend_Search_Lucene_Analysis_Analyzer::setDefault($analyzer);

    $stopWords = sfConfig::get('app_sf_propel_luceneable_behavior_stopWords', false);
    $stopWordList = array();
    if (false !== $stopWords) {
      $rawStopWords = is_array($stopWords) ? $stopWords : explode(',', $stopWords);
      foreach ($rawStopWords as $stopWord) {
        // Surrounding whitespace would prevent the entry from ever matching a token.
        $stopWord = trim((string) $stopWord);
        if ('' !== $stopWord) {
          $stopWordList[] = $stopWord;
        }
      }
    }
    $analyzer->addFilter(new Zend_Search_Lucene_Analysis_TokenFilter_StopWords($stopWordList));

    $shortWords = sfConfig::get('app_sf_propel_luceneable_behavior_shortWords', 3);
    $analyzer->addFilter(new Zend_Search_Lucene_Analysis_TokenFilter_ShortWords($shortWords));

    // NOTE(review): 0777 makes index files world-writable; kept for compatibility,
    // consider a tighter mode if the deployment allows it.
    Zend_Search_Lucene_Storage_Directory_Filesystem::setDefaultFilePermissions(0777);
  }
コード例 #6
0
 /**
  * Checks that a query term the default analyzer filters out completely does
  * not prevent the rest of the query from matching.
  *
  * Assertions pass the expected value first, as assertEquals() defines it,
  * so failure messages report expected and actual the right way round.
  */
 public function testFilteredTokensQueryParserProcessing()
 {
     $index = Zend_Search_Lucene::open(dirname(__FILE__) . '/_index23Sample/_files');

     // Precondition: the number produces no tokens with the default analyzer.
     $this->assertEquals(0, count(Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize('123456787654321')));

     $hits = $index->find('"PEAR developers" AND Home AND 123456787654321');
     $this->assertEquals(1, count($hits));

     // Each row: document id, score, stored path.
     $expectedResultset = array(array(1, 0.16827, 'IndexSource/contributing.wishlist.html'));
     foreach ($hits as $resId => $hit) {
         $this->assertEquals($expectedResultset[$resId][0], $hit->id);
         // Scores are floats; compare with a tolerance.
         $this->assertTrue(abs($hit->score - $expectedResultset[$resId][1]) < 1.0E-6);
         $this->assertEquals($expectedResultset[$resId][2], $hit->path);
     }
 }
コード例 #7
0
ファイル: Html.php プロジェクト: bartolomeu/estoque_gusella
 /**
  * Highlight text using the specified callback.
  *
  * The words are tokenized with the default analyzer and every text node
  * below /html/body is processed by _highlightNodeRecursive() with the
  * resulting term list.
  *
  * @param string|array $words  Words to highlight. Words could be organized using the array or string.
  * @param callback $callback   Callback method, used to transform (highlighting) text.
  * @param array    $params     Array of additional callback parameters passed through into it
  *                             (first non-optional parameter is an HTML fragment for highlighting)
  * @return string HTML of the document: unchanged if no word survives analysis, highlighted otherwise.
  * @throws Zend_Search_Lucene_Exception If there are words to highlight but $callback is not callable.
  */
 public function highlightExtended($words, $callback, $params = array())
 {
     /** Zend_Search_Lucene_Analysis_Analyzer */
     require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
     if (!is_array($words)) {
         $words = array($words);
     }

     // Merge the token lists one by one so an empty $words array simply
     // yields an empty list instead of calling array_merge() without arguments.
     $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
     $wordsToHighlight = array();
     foreach ($words as $wordString) {
         $wordsToHighlight = array_merge($wordsToHighlight, $analyzer->tokenize($wordString));
     }
     if (count($wordsToHighlight) == 0) {
         return $this->_doc->saveHTML();
     }

     // Term text => token index, for constant-time lookups while walking the DOM.
     $wordsToHighlightFlipped = array();
     foreach ($wordsToHighlight as $id => $token) {
         $wordsToHighlightFlipped[$token->getTermText()] = $id;
     }

     if (!is_callable($callback)) {
         require_once 'Zend/Search/Lucene/Exception.php';
         throw new Zend_Search_Lucene_Exception('$callback parameter must be a valid callback.');
     }

     $xpath = new DOMXPath($this->_doc);
     $matchedNodes = $xpath->query("/html/body");
     foreach ($matchedNodes as $matchedNode) {
         $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
     }

     // Honour the documented string return on the highlighting path as well.
     return $this->_doc->saveHTML();
 }
コード例 #8
0
ファイル: Range.php プロジェクト: ismaelmelus/home
 /**
  * Highlight the body tokens of a document that match this query's pattern.
  *
  * The pattern text is turned into an anchored regular expression in which
  * '?' stands for any single character and '*' for any run of characters.
  *
  * @param Zend_Search_Lucene_Document_Html $doc Document whose 'body' field is scanned and highlighted.
  * @param integer &$colorIndex Passed by reference to _getHighlightColor().
  */
 public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
 {
     // Quote everything, then turn the quoted wildcards back into regex wildcards.
     $quotedPattern = preg_quote($this->_pattern->text, '/');
     $regex = '/^' . str_replace(array('\\?', '\\*'), array('.', '.*'), $quotedPattern) . '$/';

     if (@preg_match('/\\pL/u', 'a') == 1) {
         // PCRE was built with Unicode support: match in UTF-8 mode.
         $regex .= 'u';
     }

     $bodyTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($doc->getFieldUtf8Value('body'), 'UTF-8');

     $matchedWords = array();
     foreach ($bodyTokens as $bodyToken) {
         $text = $bodyToken->getTermText();
         if (preg_match($regex, $text) === 1) {
             $matchedWords[] = $text;
         }
     }

     $doc->highlight($matchedWords, $this->_getHighlightColor($colorIndex));
 }
コード例 #9
0
ファイル: Phrase.php プロジェクト: Simarpreet05/joomla
 /**
  * Highlight the words of this phrase in the highlighter's document.
  *
  * Field detection, exact (keyword) matching and wildcard recognition are
  * deliberately skipped: the phrase is simply tokenized with the current
  * analyzer and the resulting words are highlighted.
  *
  * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
 {
     require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
     $phraseTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding);

     switch (count($phraseTokens)) {
         case 0:
             // Nothing survived analysis, nothing to highlight.
             return;

         case 1:
             // Single word: passed as a plain string.
             $highlighter->highlight($phraseTokens[0]->getTermText());
             return;
     }

     // Several words: passed as a list.
     $phraseWords = array();
     foreach ($phraseTokens as $phraseToken) {
         $phraseWords[] = $phraseToken->getTermText();
     }
     $highlighter->highlight($phraseWords);
 }
コード例 #10
0
ファイル: Fuzzy.php プロジェクト: c12g/stratos-php
 /**
  * Highlight matches of this fuzzy term entry.
  *
  * Nothing is highlighted when the word contains a wildcard ('*' or '?') or
  * when the analyzer turns it into zero or several tokens. A single token is
  * delegated to a fuzzy query built from it.
  *
  * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
 {
     /** Fields are not distinguished and keyword (exact) matching is not supported here. */

     // Wildcard detection; split in UTF-8 mode when PCRE supports Unicode.
     /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
     $pcreHasUnicode = @preg_match('/\\pL/u', 'a') == 1;
     $subPatterns = $pcreHasUnicode
         ? preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word))
         : preg_split('/[*?]/', $this->_word);
     if (count($subPatterns) > 1) {
         // The word contains a wildcard.
         return;
     }

     require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
     $wordTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
     if (count($wordTokens) != 1) {
         // No token at all, or several tokens: fuzzy search only works on a single word.
         return;
     }

     require_once 'Zend/Search/Lucene/Index/Term.php';
     $fuzzyTerm = new Zend_Search_Lucene_Index_Term($wordTokens[0]->getTermText(), $this->_field);
     require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
     $fuzzyQuery = new Zend_Search_Lucene_Search_Query_Fuzzy($fuzzyTerm, $this->_minimumSimilarity);
     $fuzzyQuery->_highlightMatches($highlighter);
 }
コード例 #11
0
// Fragment of a test script: $t, $lucene and $indexer are used here but not
// assigned in this excerpt (presumably set up earlier in the file).
$t->isa_ok($indexer, 'sfLuceneIndexerFactory', '->getIndexer() returns an instance of sfLuceneIndexerFactory');
$t->diag('testing ->getContext()');
$t->isa_ok($lucene->getContext(), 'sfContext', '->getContext() returns an instance of sfContext');
$t->is($lucene->getContext(), sfContext::getInstance(), '->getContext() returns the same context');
$t->diag('testing ->configure()');
$lucene->configure();
$t->is(Zend_Search_Lucene_Search_QueryParser::getDefaultEncoding(), 'UTF-8', '->configure() configures the query parsers encoding');
// For every supported analyzer type, build the analyzer that configure() is
// expected to install and compare it with the actual global default.
foreach (array('Text', 'TextNum', 'Utf8', 'Utf8Num') as $type) {
    $lucene->setParameter('analyzer', $type);
    $lucene->configure();
    $class = 'Zend_Search_Lucene_Analysis_Analyzer_Common_' . $type;
    $expected = new $class();
    $expected->addFilter(new sfLuceneLowerCaseFilter(true));
    $expected->addFilter(new Zend_Search_Lucene_Analysis_TokenFilter_StopWords(array('and', 'the')));
    $expected->addFilter(new Zend_Search_Lucene_Analysis_TokenFilter_ShortWords(2));
    $actual = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
    // Loose == on purpose: two distinct instances are compared by class and property values.
    $t->ok($actual == $expected, '->configure() configures the analyzer for ' . $type);
}
// An unknown analyzer name must make configure() throw.
$lucene->setParameter('analyzer', 'foobar');
try {
    $lucene->configure();
    $t->fail('->configure() analyzer must be of text, textnum, utf8, or utf8num');
} catch (Exception $e) {
    $t->pass('->configure() analyzer must be of text, textnum, utf8, or utf8num');
}
// Switch back to a valid analyzer name before the following tests.
$lucene->setParameter('analyzer', 'utf8num');
$t->diag('testing ->find()');
class MockLucene
{
    public $args;
    public $scoring;
コード例 #12
0
ファイル: Term.php プロジェクト: be-dmitry/zf1
 /**
  * Highlight matches of this term entry.
  *
  * A word containing '*' or '?' is rebuilt as a wildcard pattern from its
  * analyzed fragments and delegated to a wildcard query. Any other word is
  * tokenized and its token texts are highlighted directly.
  *
  * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
 {
     /** Fields are not distinguished and keyword (exact) matching is not supported here. */

     // -------------------------------------
     // Wildcard recognition
     /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
     $pcreHasUnicode      = @preg_match('/\\pL/u', 'a') == 1;
     $word                = $pcreHasUnicode ? iconv($this->_encoding, 'UTF-8', $this->_word) : $this->_word;
     $wildcardsPattern    = $pcreHasUnicode ? '/[*?]/u' : '/[*?]/';
     $subPatternsEncoding = $pcreHasUnicode ? 'UTF-8' : $this->_encoding;

     $fragments = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);
     if (count($fragments) > 1) {
         require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

         $pattern = '';
         foreach ($fragments as $index => $fragment) {
             list($fragmentText, $fragmentOffset) = $fragment;

             if ($index != 0) {
                 // The character right before a fragment is the wildcard that split it off.
                 $pattern .= $word[$fragmentOffset - 1];
             }

             // Each fragment has to be at most one word for the current analyzer.
             $fragmentTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($fragmentText, $subPatternsEncoding);
             if (count($fragmentTokens) > 1) {
                 // Multi-word fragment: nothing is highlighted.
                 return;
             }
             foreach ($fragmentTokens as $fragmentToken) {
                 $pattern .= $fragmentToken->getTermText();
             }
         }

         require_once 'Zend/Search/Lucene/Index/Term.php';
         $wildcardTerm = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);
         require_once 'Zend/Search/Lucene/Search/Query/Wildcard.php';
         $wildcardQuery = new Zend_Search_Lucene_Search_Query_Wildcard($wildcardTerm);
         $wildcardQuery->_highlightMatches($highlighter);
         return;
     }

     // -------------------------------------
     // Plain word: zero, one or several tokens
     require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
     $wordTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
     $tokenCount = count($wordTokens);

     if ($tokenCount == 0) {
         // Insignificant word, nothing to highlight.
         return;
     }
     if ($tokenCount == 1) {
         $highlighter->highlight($wordTokens[0]->getTermText());
         return;
     }

     // The analyzer split the word into several terms; highlight all of them.
     $termTexts = array();
     foreach ($wordTokens as $wordToken) {
         $termTexts[] = $wordToken->getTermText();
     }
     $highlighter->highlight($termTexts);
 }
コード例 #13
0
ファイル: Range.php プロジェクト: alefernie/intranet
 /**
  * Highlight the document's body terms that fall into this query's range.
  *
  * The 'body' field of the highlighter's document is tokenized and every
  * term text is compared against the lower and upper term texts. A missing
  * boundary leaves that side open; $_inclusive decides whether the
  * boundaries themselves count as matches.
  *
  * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
 {
     $bodyText   = $highlighter->getDocument()->getFieldUtf8Value('body');
     $bodyTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($bodyText, 'UTF-8');

     $lowerTermText = $this->_lowerTerm !== null ? $this->_lowerTerm->text : null;
     $upperTermText = $this->_upperTerm !== null ? $this->_upperTerm->text : null;

     $matchedWords = array();
     foreach ($bodyTokens as $bodyToken) {
         $termText = $bodyToken->getTermText();

         if ($this->_inclusive) {
             $aboveLower = $lowerTermText == null || $lowerTermText <= $termText;
             $belowUpper = $upperTermText == null || $termText <= $upperTermText;
         } else {
             $aboveLower = $lowerTermText == null || $lowerTermText < $termText;
             $belowUpper = $upperTermText == null || $termText < $upperTermText;
         }

         if ($aboveLower && $belowUpper) {
             $matchedWords[] = $termText;
         }
     }

     $highlighter->highlight($matchedWords);
 }
コード例 #14
0
 /**
  * Return a list of posts that are similar to the current post.
  * This is not a very good implementation, so do not expect
  * amazing results - the term vector is not available for a doc
  * in ZSL, which limits how far you can go!
  *
  * The post's tags and title are lower-cased, tokenized with the default
  * analyzer and searched as a multi-term query. The given post itself is
  * removed from the hits.
  *
  * The global Zend_Search_Lucene result set limit is only changed for the
  * duration of the search and is restored afterwards, including when the
  * search throws.
  *
  * @param object  $post            Post providing ->title, ->tags (iterable of strings) and ->id.
  * @param integer $max_recommended Maximum number of ids to return; values below 1 yield an empty list.
  * @return array ids
  */
 public function get_similar_posts($post, $max_recommended = 5)
 {
     if ($max_recommended < 1) {
         // Nothing was asked for; skip the search altogether.
         return array();
     }

     $title = $post->title;
     $tags = $post->tags;
     $tagstring = '';
     foreach ($tags as $tag) {
         $tagstring .= $tag . ' ';
     }

     $analyser = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
     $tokens = $analyser->tokenize(strtolower($tagstring) . ' ' . strtolower($title));
     $query = new Zend_Search_Lucene_Search_Query_MultiTerm();
     foreach ($tokens as $token) {
         $query->addTerm(new Zend_Search_Lucene_Index_Term($token->getTermText()));
     }

     // One extra hit is requested because the post itself is expected among the results.
     $previousLimit = Zend_Search_Lucene::getResultSetLimit();
     Zend_Search_Lucene::setResultSetLimit($max_recommended + 1);
     try {
         $hits = $this->_index->find($query);
     } catch (Exception $e) {
         Zend_Search_Lucene::setResultSetLimit($previousLimit);
         throw $e;
     }
     Zend_Search_Lucene::setResultSetLimit($previousLimit);

     $ids = array();
     foreach ($hits as $hit) {
         if ($hit->postid == $post->id) {
             continue;
         }
         $ids[] = $hit->postid;
         if (count($ids) >= $max_recommended) {
             break;
         }
     }
     return $ids;
 }
コード例 #15
0
ファイル: Lucene.php プロジェクト: rommmka/axiscommerce
 /**
  * Wrap the parsed user query in a boolean query and, when the query string
  * analyzes to fewer than two tokens, add a fuzzy match on the 'name' field.
  *
  * Also sets the query parser's default encoding to this object's encoding.
  *
  * @param string $queryString
  * @return Zend_Search_Lucene_Search_Query
  */
 public function createFuzzyQuery($queryString)
 {
     Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding($this->_encoding);
     $parsedQuery = Zend_Search_Lucene_Search_QueryParser::parse($queryString, $this->_encoding);

     $booleanQuery = new Zend_Search_Lucene_Search_Query_Boolean();
     $booleanQuery->addSubquery($parsedQuery);

     $queryTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($queryString, $this->_encoding);
     if (count($queryTokens) < 2) {
         // Fuzzy matching is added only for queries of at most one word.
         $booleanQuery->addSubquery(
             new Zend_Search_Lucene_Search_Query_Fuzzy(
                 new Zend_Search_Lucene_Index_Term($queryString, 'name'),
                 0.4
             )
         );
     }

     return $booleanQuery;
 }
コード例 #16
0
 /**
  * Adds a document to this segment.
  *
  * For every field of the document: registers the field info, records the
  * term positions of indexed fields under the current document number and
  * collects stored fields. The stored fields are then appended to the
  * segment's .fdt file, with the record's start offset written to the .fdx
  * file, and the document counter is advanced.
  *
  * @param Zend_Search_Lucene_Document $document
  * @throws Zend_Search_Lucene_Exception If a field requests term vector storage.
  */
 public function addDocument(Zend_Search_Lucene_Document $document)
 {
     $storedFields = array();
     foreach ($document->getFieldNames() as $fieldName) {
         $field = $document->getField($fieldName);
         $this->_addFieldInfo($field);
         if ($field->storeTermVector) {
             /**
              * @todo term vector storing support
              */
             throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
         }
         if ($field->isIndexed) {
             if ($field->isTokenized) {
                 $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
             } else {
                 // Untokenized field: the whole value is one token spanning 0..strlen (byte length).
                 $tokenList = array();
                 $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue));
             }
             // Remember how many tokens this field has in the current document.
             $this->_fieldLengths[$field->name][$this->_docCount] = count($tokenList);
             $position = 0;
             foreach ($tokenList as $token) {
                 $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                 $termKey = $term->key();
                 if (!isset($this->_termDictionary[$termKey])) {
                     // New term
                     $this->_termDictionary[$termKey] = $term;
                     $this->_termDocs[$termKey] = array();
                     $this->_termDocs[$termKey][$this->_docCount] = array();
                 } else {
                     if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                         // Existing term, but new term entry
                         $this->_termDocs[$termKey][$this->_docCount] = array();
                     }
                 }
                 // Positions accumulate from the tokens' position increments.
                 $position += $token->getPositionIncrement();
                 $this->_termDocs[$termKey][$this->_docCount][] = $position;
             }
         }
         if ($field->isStored) {
             $storedFields[] = $field;
         }
     }
     if (count($storedFields) != 0) {
         if (!isset($this->_fdxFile)) {
             // First stored field in this segment: create the .fdx/.fdt files and register them.
             $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
             $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
             $this->_files[] = $this->_name . '.fdx';
             $this->_files[] = $this->_name . '.fdt';
         }
         // .fdx receives the current .fdt position, i.e. where this document's record starts.
         $this->_fdxFile->writeLong($this->_fdtFile->tell());
         // The record begins with the number of stored fields.
         $this->_fdtFile->writeVInt(count($storedFields));
         foreach ($storedFields as $field) {
             // Per field: field number, flag byte, then the value.
             $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
             // Bit 0x1 = tokenized, bit 0x2 = binary; the trailing 0x0 is the (unused) compression bit.
             $fieldBits = ($field->isTokenized ? 0x1 : 0x0) | ($field->isBinary ? 0x2 : 0x0) | 0x0;
             /* 0x04 - third bit, compressed (ZLIB) */
             $this->_fdtFile->writeByte($fieldBits);
             if ($field->isBinary) {
                 // Binary values are written as an explicit byte length followed by the raw bytes.
                 $this->_fdtFile->writeVInt(strlen($field->stringValue));
                 $this->_fdtFile->writeBytes($field->stringValue);
             } else {
                 $this->_fdtFile->writeString($field->stringValue);
             }
         }
     }
     // NOTE(review): no .fdx entry is written for a document without stored fields - confirm readers expect that.
     $this->_docCount++;
 }
コード例 #17
0
 /**
  * Adds a document to this index.
  *
  * Stores the document under the current document id, tokenizes each field
  * (or treats its value as one token), and updates the per-field term list,
  * term positions, term frequencies, field/document map and norms. Finally
  * advances the document id.
  *
  * @param Zend_Search_Lucene_Document $document
  */
 public function addDocument(Zend_Search_Lucene_Document $document)
 {
     $this->_documents[$this->_docID] = $document;
     // parse document
     $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
     $fieldNames = $document->getFieldnames();
     foreach ($fieldNames as $fieldName) {
         $field = $document->getField($fieldName);
         // tokenize if requested
         if ($field->isTokenized) {
             $tokens = $analyzer->tokenize($field->getUtf8Value(), 'UTF-8');
         } else {
             // Single token covering the whole value; strlen(utf8_decode(...)) is used as the end offset,
             // presumably to count characters rather than bytes.
             // NOTE(review): utf8_decode() is deprecated as of PHP 8.2 - confirm the supported PHP versions.
             $tokens = array(new Zend_Search_Lucene_Analysis_Token($field->getUtf8Value(), 0, strlen(utf8_decode($field->getUtf8Value()))));
         }
         // store tokens in "index"
         // Starts at -1 so that a first token with an increment of 1 lands on position 0.
         $position = -1;
         foreach ($tokens as $token) {
             $text = $token->getTermText();
             $term = new Zend_Search_Lucene_Index_Term($text, $fieldName);
             $position += $token->getPositionIncrement();
             // build an ordered array (list) of terms for each field
             if (isset($this->_terms[$fieldName])) {
                 // if the term is not set already, sort it in
                 if (!isset($this->_terms[$fieldName][$text])) {
                     // Move terms off the front of the list while they compare lower than $text ...
                     // NOTE(review): the loop also ends if array_shift() yields a falsy value (e.g. empty list).
                     $new = array();
                     while (($current = array_shift($this->_terms[$fieldName])) && $text > $current->text) {
                         $new[$current->text] = $current;
                     }
                     // ... then place the new term, followed by the term that stopped the loop (if any).
                     $new[$text] = $term;
                     if ($current) {
                         $new[$current->text] = $current;
                     }
                     // NOTE(review): array_merge() renumbers integer-like keys, so purely numeric term
                     // texts may not keep their key here - verify against how _terms is read.
                     $this->_terms[$fieldName] = array_merge($new, $this->_terms[$fieldName]);
                 }
             } else {
                 // first terms in each field are just stored
                 $this->_terms[$fieldName][$text] = $term;
             }
             // store termPosition for this term
             $this->_termPositions[$fieldName][$text][$this->_docID][] = $position;
             // store or increase term freq for this document
             if (!isset($this->_termDocs[$fieldName][$text][$this->_docID])) {
                 $this->_termDocs[$fieldName][$text][$this->_docID] = 1;
             } else {
                 $this->_termDocs[$fieldName][$text][$this->_docID]++;
             }
         }
         // remember fieldname and document
         $this->_fields[$fieldName][$this->_docID] = 1;
         // calculate and store normalisation vector
         // Norm = length norm for the token count, scaled by the document boost and the field boost.
         $this->_norms[$fieldName][$this->_docID] = $this->getSimilarity()->lengthNorm($fieldName, sizeof($tokens)) * $document->boost * $field->boost;
     }
     // increase docID
     $this->_docID++;
 }
コード例 #18
0
 /**
  * Process last range query term (closed interval)
  *
  * Analyzes the remembered first boundary and the current token, turns each
  * into an index term (or null when it analyzes to nothing) and adds an
  * inclusive range query to the current context.
  *
  * @throws Zend_Search_Lucene_Search_QueryParserException If a boundary analyzes to several
  *         words, or if both boundaries are empty.
  */
 public function closedRQLastTerm()
 {
     // Lower boundary
     $from        = null;
     $lowerTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_rqFirstTerm, $this->_encoding);
     if (count($lowerTokens) > 1) {
         require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
         throw new Zend_Search_Lucene_Search_QueryParserException('Range query boundary terms must be non-multiple word terms');
     }
     if (count($lowerTokens) == 1) {
         require_once 'Zend/Search/Lucene/Index/Term.php';
         $from = new Zend_Search_Lucene_Index_Term(reset($lowerTokens)->getTermText(), $this->_context->getField());
     }

     // Upper boundary
     $to          = null;
     $upperTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_currentToken->text, $this->_encoding);
     if (count($upperTokens) > 1) {
         require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
         throw new Zend_Search_Lucene_Search_QueryParserException('Range query boundary terms must be non-multiple word terms');
     }
     if (count($upperTokens) == 1) {
         require_once 'Zend/Search/Lucene/Index/Term.php';
         $to = new Zend_Search_Lucene_Index_Term(reset($upperTokens)->getTermText(), $this->_context->getField());
     }

     // A range needs at least one usable boundary.
     if ($from === null && $to === null) {
         require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
         throw new Zend_Search_Lucene_Search_QueryParserException('At least one range query boundary term must be non-empty term');
     }

     require_once 'Zend/Search/Lucene/Search/Query/Range.php';
     require_once 'Zend/Search/Lucene/Search/QueryEntry/Subquery.php';
     // Third argument true: the interval is closed (inclusive).
     $this->_context->addEntry(
         new Zend_Search_Lucene_Search_QueryEntry_Subquery(
             new Zend_Search_Lucene_Search_Query_Range($from, $to, true)
         )
     );
 }
コード例 #19
0
ファイル: Term.php プロジェクト: ookwudili/chisimba
 /**
  * Transform entry to a subquery
  *
  * The term is tokenized with the default analyzer. No tokens yield an
  * "insignificant" query, one token a term query, and several tokens a
  * multi-term query in which every term is required.
  *
  * @param string $encoding
  * @return Zend_Search_Lucene_Search_Query
  * @throws Zend_Search_Lucene_Search_QueryParserException For fuzzy entries and for terms containing '?' or '*'.
  */
 public function getQuery($encoding)
 {
     if ($this->_fuzzyQuery) {
         throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is not supported yet.');
     }

     $hasQuestionMark = strpos($this->_term, '?') !== false;
     $hasAsterisk     = strpos($this->_term, '*') !== false;
     if ($hasQuestionMark || $hasAsterisk) {
         throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard queries are not supported yet.');
     }

     $termTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_term, $encoding);
     $tokenCount = count($termTokens);

     if ($tokenCount == 0) {
         // Nothing survived analysis.
         return new Zend_Search_Lucene_Search_Query_Insignificant();
     }

     if ($tokenCount == 1) {
         $query = new Zend_Search_Lucene_Search_Query_Term(
             new Zend_Search_Lucene_Index_Term($termTokens[0]->getTermText(), $this->_field)
         );
     } else {
         // The analyzer split the term into several words.
         /**
          * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
          * analizer design features
          */
         $query = new Zend_Search_Lucene_Search_Query_MultiTerm();
         foreach ($termTokens as $termToken) {
             // Second argument true: the sub-term is required.
             $query->addTerm(
                 new Zend_Search_Lucene_Index_Term($termToken->getTermText(), $this->_field),
                 true
             );
         }
     }

     $query->setBoost($this->_boost);
     return $query;
 }
コード例 #20
0
ファイル: LucenePage.class.php プロジェクト: sonicmaster/RPG
    /**
     * Debug page for the Lucene index: runs the operation selected by $this->action
     * and prints the raw result.
     *
     * Handled actions: create, count, search, termDocs, test, test2, list, optimize
     * and process. Any other value only runs parent::show().
     *
     * The 'process' action indexes one batch of ugml_messages rows (message_id from
     * $this->start to $this->start + self::INTERVAL), appends its log line to the log
     * kept in /tmp/cache.php, prints a link to the next batch, redirects to it via
     * JavaScript while rows were found, and then terminates the request.
     *
     * NOTE(review): everything printed here is unescaped debug output (request data is
     * dumped and indexed message text is echoed as-is). Presumably this page is only
     * reachable by administrators — confirm before exposing it.
     *
     * @see Page::show()
     */
    public function show()
    {
        parent::show();
        if ($this->action == 'create') {
            $fields = array('messageID' => 'UnIndexed', 'senderID' => 'Keyword', 'sender' => 'Text', 'ownerID' => 'Keyword', 'time' => 'Keyword', 'messageType' => 'Keyword', 'subject' => 'Text', 'message' => 'Text');
            $luceneObj = LuceneEditor::create('messages', $fields);
            var_dump($luceneObj);
        } elseif ($this->action == 'count') {
            $luceneObj = $this->getLucene();
            var_dump($luceneObj->count());
        } elseif ($this->action == 'search') {
            $time = microtime(true);
            $luceneObj = $this->getLucene();
            $hits = $luceneObj->search($_REQUEST['search']);
            echo count($hits) . ' in ' . (microtime(true) - $time) . 's:<br>"';
            foreach ($hits as $hit) {
                echo $hit->id . '"<br>"';
                echo $hit->ownerID . '"<br>"';
                echo $hit->messageID . '"<br>"';
                echo $hit->time . '"<br>"';
                echo $hit->subject . '"<br>"';
                echo $hit->message . '"<br><br>';
            }
        } elseif ($this->action == 'termDocs') {
            $time = microtime(true);
            $luceneObj = $this->getLucene();
            $ids = $luceneObj->termDocs($_REQUEST['search'], $_REQUEST['f']);
            echo count($ids) . ' in ' . (microtime(true) - $time) . 's:<br>"';
            foreach ($ids as $id) {
                $hit = $luceneObj->getDocument($id);
                echo $id . '"<br>"';
                echo $hit->ownerID . '"<br>"';
                echo $hit->messageID . '"<br>"';
                echo $hit->time . '"<br>"';
                echo $hit->subject . '"<br>"';
                echo $hit->message . '"<br><br>';
            }
        } elseif ($this->action == 'test') {
            var_dump($_REQUEST['search']);
            $time = microtime(true);
            $luceneObj = $this->getLucene();
            // Return value intentionally unused; kept from the original, presumably so
            // the index is opened before the query is parsed — TODO confirm.
            $luceneObj->getIndex();
            $query = Zend_Search_Lucene_Search_QueryParser::parse($_REQUEST['search']);
            var_dump($query);
            echo '<br>';
            echo '<br>';
            $query = $query->rewrite($luceneObj->getIndex())->optimize($luceneObj->getIndex());
            var_dump($query);
            echo '<br>';
            echo '<br>';
            echo '<br>';
            echo '<br>';
            var_dump(Zend_Search_Lucene_Search_QueryParser::$_instance->_context->getQuery());
            echo '<br>';
            echo '<br>';
            // A dump of $query2 used to follow here, but the only assignment to that
            // variable was commented out, so it could never print anything but NULL.
            $token = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize('1');
            var_dump($token);
            echo 'tested in ' . (microtime(true) - $time) . 's<br>';
        } elseif ($this->action == 'test2') {
            $time = microtime(true);
            $luceneObj = $this->getLucene();
            $lexer = new Zend_Search_Lucene_Search_QueryLexer();
            var_dump($lexer);
            echo '<br>';
            echo '<br>';
            $tokens = $lexer->tokenize($_REQUEST['search']);
            var_dump($tokens);
            echo 'tested in ' . (microtime(true) - $time) . 's<br>';
        } elseif ($this->action == 'list') {
            $time = microtime(true);
            $luceneObj = $this->getLucene();
            $hits = $luceneObj->terms();
            echo 'found in ' . (microtime(true) - $time) . 's:<br>';
            var_dump($hits);
        } elseif ($this->action == 'optimize') {
            $time = microtime(true);
            $luceneObj = $this->getLucene();
            $luceneObj->optimize();
            echo 'optimized in ' . (microtime(true) - $time) . 's<br>';
        } elseif ($this->action == 'process') {
            require_once '../wcf/lib/system/io/File.class.php';
            // /tmp/cache.php defines $output with the log of all previous batches. It
            // does not exist before the first batch, so start from an empty log instead
            // of relying on the include to define the variable.
            $output = '';
            if (is_file('/tmp/cache.php')) {
                include '/tmp/cache.php';
            }
            ob_start();
            set_time_limit(120);
            $luceneObj = $this->getLucene();
            // Force the batch bounds to be numeric so nothing but a number can reach
            // the SQL string.
            $start = (int) $this->start;
            $nextStart = $start + self::INTERVAL;
            $sql = "SELECT *\r\n\t\t\t\t\tFROM ugml_messages\r\n\t\t\t\t\tWHERE message_id BETWEEN " . $start . " AND " . $nextStart;
            $result = WCF::getDB()->sendQuery($sql);
            $array = array();
            $time = microtime(true);
            while ($row = WCF::getDB()->fetchArray($result)) {
                $array[] = array('messageID' => $row['message_id'], 'senderID' => $row['message_sender'], 'sender' => $row['message_from'], 'ownerID' => $row['message_owner'], 'time' => $row['message_time'], 'messageType' => $row['message_type'], 'subject' => $row['message_subject'], 'message' => $row['message_text']);
            }
            $rowCount = count($array);
            if ($rowCount) {
                $luceneObj->add($array);
            } else {
                echo 'i think im done :)';
            }
            echo 'done ' . $rowCount . ' in ' . (microtime(true) - $time) . ';<br>';
            // Everything echoed since ob_start() becomes this batch's log entry.
            $output .= ob_get_contents();
            ob_end_clean();
            $file = new File('/tmp/cache.php');
            // var_export() emits a correctly quoted PHP string literal, so a quote or
            // backslash in the log can no longer corrupt the generated file.
            $file->write("<?php\r\n/*\r\n  This file is part of WOT Game.\r\n\r\n    WOT Game is free software: you can redistribute it and/or modify\r\n    it under the terms of the GNU Affero General Public License as published by\r\n    the Free Software Foundation, either version 3 of the License, or\r\n    (at your option) any later version.\r\n\r\n    WOT Game is distributed in the hope that it will be useful,\r\n    but WITHOUT ANY WARRANTY; without even the implied warranty of\r\n    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r\n    GNU Affero General Public License for more details.\r\n\r\n    You should have received a copy of the GNU Affero General Public License\r\n    along with WOT Game.  If not, see <http://www.gnu.org/licenses/>.\r\n*/\r\n\n\$output = " . var_export((string) $output, true) . ";\n?>");
            $file->close();
            echo $output;
            $nextUrl = 'index.php?page=Lucene&action=process&luceneID=' . $this->luceneID . '&start=' . $nextStart;
            echo str_repeat("\t\t\t<br>\n", 5);
            echo "\t\t\tnext link:\n";
            echo "\t\t\t<br>\n";
            echo "\t\t\t<a href=\"" . $nextUrl . "\">\n";
            echo "\t\t\t\t" . $nextUrl . "\t\t\t</a>\n";
            echo "\t\t\t";
            // Short pause between batches, then push the page out before redirecting.
            usleep(100000);
            ob_flush();
            flush();
            if ($rowCount) {
                // Rows were found, so there may be more: let the browser request the
                // next batch on its own.
                echo "\t\t\t\t<script>\n";
                echo "\t\t\t\t\twindow.location.href = '" . $nextUrl . "';\n";
                echo "\t\t\t\t</script>\n";
                echo "\t\t\t\t";
            }
            exit;
        }
    }
コード例 #21
0
ファイル: Html.php プロジェクト: ookwudili/chisimba
 /**
  * Highlight text with specified color
  *
  * Each entry of $words is run through the default analyzer first, so the terms
  * that get highlighted are the analyzer's tokens rather than the raw input strings.
  * Highlighting is applied to every node matched by the XPath "/html/body".
  *
  * @param string|array $words  Word, or list of words, to highlight
  * @param string $color        Color value passed on to _highlightNode()
  * @return string  The document serialized with saveHTML()
  */
 public function highlight($words, $color = '#66ffff')
 {
     if (!is_array($words)) {
         $words = array($words);
     }
     $wordsToHighlight = array();
     $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
     foreach ($words as $wordString) {
         $wordsToHighlight = array_merge($wordsToHighlight, $analyzer->tokenize($wordString));
     }
     if (count($wordsToHighlight) == 0) {
         // Nothing survived analysis: hand back the document unchanged.
         return $this->_doc->saveHTML();
     }
     // Key the lookup table by term text so _highlightNode() can test membership by key.
     $wordsToHighlightFlipped = array();
     foreach ($wordsToHighlight as $id => $token) {
         $wordsToHighlightFlipped[$token->getTermText()] = $id;
     }
     $xpath = new DOMXPath($this->_doc);
     $matchedNodes = $xpath->query("/html/body");
     foreach ($matchedNodes as $matchedNode) {
         $this->_highlightNode($matchedNode, $wordsToHighlightFlipped, $color);
     }
     // The method used to fall off the end here and return null, although it is
     // documented to return a string and already did so on the empty-input path.
     return $this->_doc->saveHTML();
 }
コード例 #22
0
ファイル: DocumentWriter.php プロジェクト: jkimdon/cohomeals
 /**
  * Adds a document to this segment.
  *
  * For every field of the document this records term positions in the in-memory
  * term dictionary / postings ($_termDictionary, $_termDocs) under the current
  * $_docCount, computes the field's encoded norm byte, registers the field via
  * addField() and collects stored fields. Afterwards one norm byte is appended for
  * every indexed field known to the segment, and the stored fields are handed to
  * addStoredFields().
  *
  * A field flagged as indexed whose value yields no tokens (or, for untokenized
  * fields, an empty UTF-8 value) is cloned and treated as non-indexed.
  *
  * @param Zend_Search_Lucene_Document $document
  * @throws Zend_Search_Lucene_Exception  if a field requests term vector storage
  */
 public function addDocument(Zend_Search_Lucene_Document $document)
 {
     $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();
     $fieldsToStore = array();
     $normBytes = array();

     foreach ($document->getFieldNames() as $name) {
         $field = $document->getField($name);

         if ($field->storeTermVector) {
             /**
              * @todo term vector storing support
              */
             throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
         }

         if ($field->isIndexed) {
             if ($field->isTokenized) {
                 $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
                 $analyzer->setInput($field->value, $field->encoding);

                 $tokenPosition = 0;
                 $tokensSeen = 0;
                 for ($token = $analyzer->nextToken(); $token !== null; $token = $analyzer->nextToken()) {
                     ++$tokensSeen;

                     $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                     $key = $term->key();
                     if (!isset($this->_termDictionary[$key])) {
                         // First occurrence of this term in the segment
                         $this->_termDictionary[$key] = $term;
                         $this->_termDocs[$key] = array($this->_docCount => array());
                     } elseif (!isset($this->_termDocs[$key][$this->_docCount])) {
                         // Known term, first occurrence in this document
                         $this->_termDocs[$key][$this->_docCount] = array();
                     }

                     $tokenPosition += $token->getPositionIncrement();
                     $this->_termDocs[$key][$this->_docCount][] = $tokenPosition;
                 }

                 if ($tokensSeen == 0) {
                     // No tokens produced: treat the field as non-indexed and non-tokenized
                     $field = clone $field;
                     $field->isIndexed = $field->isTokenized = false;
                 } else {
                     $norm = $similarity->lengthNorm($field->name, $tokensSeen) * $document->boost * $field->boost;
                     $normBytes[$field->name] = chr($similarity->encodeNorm($norm));
                 }
             } else {
                 $utf8Value = $field->getUtf8Value();
                 if ($utf8Value == '') {
                     // Empty value: treat the field as non-indexed and non-tokenized
                     $field = clone $field;
                     $field->isIndexed = $field->isTokenized = false;
                 } else {
                     // The whole value is indexed as a single term
                     $term = new Zend_Search_Lucene_Index_Term($utf8Value, $field->name);
                     $key = $term->key();
                     if (!isset($this->_termDictionary[$key])) {
                         // First occurrence of this term in the segment
                         $this->_termDictionary[$key] = $term;
                         $this->_termDocs[$key] = array($this->_docCount => array());
                     } elseif (!isset($this->_termDocs[$key][$this->_docCount])) {
                         // Known term, first occurrence in this document
                         $this->_termDocs[$key][$this->_docCount] = array();
                     }
                     // A single-term field always sits at position 0
                     $this->_termDocs[$key][$this->_docCount][] = 0;

                     $norm = $similarity->lengthNorm($field->name, 1) * $document->boost * $field->boost;
                     $normBytes[$field->name] = chr($similarity->encodeNorm($norm));
                 }
             }
         }

         if ($field->isStored) {
             $fieldsToStore[] = $field;
         }
         $this->addField($field);
     }

     foreach ($this->_fields as $knownName => $knownField) {
         if ($knownField->isIndexed) {
             if (!isset($this->_norms[$knownName])) {
                 // Field is new to the segment: back-fill the zero-length norm for
                 // every document added before this one
                 $this->_norms[$knownName] = str_repeat(chr($similarity->encodeNorm($similarity->lengthNorm($knownName, 0))), $this->_docCount);
             }
             $this->_norms[$knownName] .= isset($normBytes[$knownName])
                 ? $normBytes[$knownName]
                 : chr($similarity->encodeNorm($similarity->lengthNorm($knownName, 0)));
         }
     }

     $this->addStoredFields($fieldsToStore);
 }
コード例 #23
0
 /**
  * The default Lucene analyzer must be an instance of
  * Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive.
  */
 public function test_Indexer_Has_Correct_Config()
 {
     $this->assertType(
         'Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive',
         Zend_Search_Lucene_Analysis_Analyzer::getDefault()
     );
 }
コード例 #24
0
ファイル: Fuzzy.php プロジェクト: netconstructor/Centurion
 /**
  * Query specific matches highlighting
  *
  * Tokenizes the 'body' field of the highlighter's document, keeps every token that
  * starts with the query term's prefix and whose similarity to the query term is
  * above $this->_minimumSimilarity, and passes those words to the highlighter.
  *
  * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
 {
     $words = array();
     $prefix = Zend_Search_Lucene_Index_Term::getPrefix($this->_term->text, $this->_prefixLength);
     $prefixByteLength = strlen($prefix);
     $prefixUtf8Length = Zend_Search_Lucene_Index_Term::getLength($prefix);
     $termRest = substr($this->_term->text, $prefixByteLength);
     // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
     $termRestLength = strlen($termRest);
     $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
     $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($docBody, 'UTF-8');
     foreach ($tokens as $token) {
         $termText = $token->getTermText();
         // Byte-wise prefix test. A loose == between two strings compares them as
         // numbers when both look numeric (e.g. "0e1" == "0e2"), which would accept
         // tokens that do not actually start with the prefix.
         if (strncmp($termText, $prefix, $prefixByteLength) !== 0) {
             continue;
         }
         // Calculate similarity
         $target = substr($termText, $prefixByteLength);
         $targetLength = strlen($target);
         $maxDistance = isset($this->_maxDistances[$targetLength])
             ? $this->_maxDistances[$targetLength]
             : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, $targetLength);
         if ($termRestLength == 0) {
             // we don't have anything to compare.  That means if we just add
             // the letters for current term we get the new word
             $similarity = $prefixUtf8Length == 0 ? 0 : 1 - $targetLength / $prefixUtf8Length;
         } elseif ($targetLength == 0) {
             $similarity = $prefixUtf8Length == 0 ? 0 : 1 - $termRestLength / $prefixUtf8Length;
         } elseif ($maxDistance < abs($termRestLength - $targetLength)) {
             // Just adding the characters of term to target or vice-versa results in too many edits.
             // For example "pre" has length 3 and "prefixes" has length 8, so the edit distance
             // cannot be less than 5 (abs(3 - 8)). If the maximum edit distance is 4, the word
             // can be discarded without running levenshtein() at all.
             $similarity = 0;
         } else {
             $similarity = 1 - levenshtein($termRest, $target) / ($prefixUtf8Length + min($termRestLength, $targetLength));
         }
         if ($similarity > $this->_minimumSimilarity) {
             $words[] = $termText;
         }
     }
     $highlighter->highlight($words);
 }
コード例 #25
0
ファイル: Term.php プロジェクト: sraj4/EthicsPublicHtmlProd
 /**
  * Build the search subquery that corresponds to this term entry.
  *
  * A term containing '?' or '*' is turned into a wildcard query. Any other
  * term is run through the default analyzer and becomes an insignificant,
  * term, fuzzy or multi-term query depending on the number of tokens produced
  * and on whether the entry is flagged as a fuzzy query.
  *
  * @param string $encoding  passed through to the analyzer's tokenize() call
  * @return Zend_Search_Lucene_Search_Query
  * @throws Zend_Search_Lucene_Search_QueryParserException
  */
 public function getQuery($encoding)
 {
     $hasWildcard = strpos($this->_term, '?') !== false || strpos($this->_term, '*') !== false;
     if ($hasWildcard) {
         if ($this->_fuzzyQuery) {
             throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is not supported for terms with wildcards.');
         }

         // Analyze every literal piece between wildcard characters on its own,
         // then glue the pieces back together with the original '?' and '*'.
         $starSegments = array();
         foreach (explode('*', $this->_term) as $starChunk) {
             $markSegments = array();
             foreach (explode('?', $starChunk) as $literal) {
                 $literalTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($literal, $encoding);
                 if (count($literalTokens) > 1) {
                     throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard search is supported only for non-multiple word terms');
                 }

                 // Zero or one token at this point; an empty piece contributes ''.
                 $analyzed = '';
                 foreach ($literalTokens as $literalToken) {
                     $analyzed .= $literalToken->getTermText();
                 }
                 $markSegments[] = $analyzed;
             }
             $starSegments[] = implode('?', $markSegments);
         }

         $wildcardTerm = new Zend_Search_Lucene_Index_Term(implode('*', $starSegments), $this->_field);
         $wildcardQuery = new Zend_Search_Lucene_Search_Query_Wildcard($wildcardTerm);
         $wildcardQuery->setBoost($this->_boost);

         return $wildcardQuery;
     }

     $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_term, $encoding);
     $tokenCount = count($tokens);

     // Nothing survived analysis.
     if ($tokenCount == 0) {
         return new Zend_Search_Lucene_Search_Query_Insignificant();
     }

     // Exactly one token: plain term query, or fuzzy query when requested.
     if ($tokenCount == 1) {
         $singleTerm = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
         if ($this->_fuzzyQuery) {
             $singleQuery = new Zend_Search_Lucene_Search_Query_Fuzzy($singleTerm, $this->_similarity);
         } else {
             $singleQuery = new Zend_Search_Lucene_Search_Query_Term($singleTerm);
         }
         $singleQuery->setBoost($this->_boost);

         return $singleQuery;
     }

     // Several tokens cannot be combined with fuzzy matching.
     if ($this->_fuzzyQuery) {
         throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is supported only for non-multiple word terms');
     }

     /**
      * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
      * analyzer design features
      */
     $multiTermQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();
     foreach ($tokens as $token) {
         // Second argument true: every sub-term is added as required.
         $multiTermQuery->addTerm(new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field), true);
     }
     $multiTermQuery->setBoost($this->_boost);

     return $multiTermQuery;
 }
コード例 #26
0
ファイル: Wildcard.php プロジェクト: netvlies/zf
 /**
  * Highlight the words of the document body that match this wildcard pattern.
  *
  * The pattern text is converted to an anchored regular expression ('?' becomes
  * '.', '*' becomes '.*'), every token of the 'body' field is tested against it,
  * and the matching token texts are handed to the highlighter.
  *
  * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
 {
     // Quote the pattern first, then turn the quoted wildcards back into regex wildcards.
     $quotedPattern = preg_quote($this->_pattern->text, '/');
     $regexBody = str_replace(array('\\?', '\\*'), array('.', '.*'), $quotedPattern);

     // Append the Unicode modifier only when this PCRE build supports it.
     $modifier = (@preg_match('/\\pL/u', 'a') == 1) ? 'u' : '';
     $matchExpression = '/^' . $regexBody . '$/' . $modifier;

     $bodyText = $highlighter->getDocument()->getFieldUtf8Value('body');
     $bodyTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($bodyText, 'UTF-8');

     $matchedWords = array();
     foreach ($bodyTokens as $bodyToken) {
         if (preg_match($matchExpression, $bodyToken->getTermText()) !== 1) {
             continue;
         }
         $matchedWords[] = $bodyToken->getTermText();
     }

     $highlighter->highlight($matchedWords);
 }
コード例 #27
0
 /**
  * Transform entry to a subquery
  *
  * Wildcards are rejected. The phrase is tokenized with the default analyzer;
  * no tokens yields an empty query, one token a plain term query, and several
  * tokens a phrase query whose terms carry their positions within the phrase.
  * When the entry is a proximity query the configured word distance is applied
  * as slop.
  *
  * @return Zend_Search_Lucene_Search_Query
  * @throws Zend_Search_Lucene_Search_QueryParserException
  */
 public function getQuery()
 {
     if (strpos($this->_phrase, '?') !== false || strpos($this->_phrase, '*') !== false) {
         throw new Zend_Search_Lucene_Search_QueryParserException('Wildcards are only allowed in a single terms.');
     }

     $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_phrase);
     $tokenCount = count($tokens);

     if ($tokenCount == 0) {
         return new Zend_Search_Lucene_Search_Query_Empty();
     }

     if ($tokenCount == 1) {
         $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
         $query = new Zend_Search_Lucene_Search_Query_Term($term);
         $query->setBoost($this->_boost);
         return $query;
     }

     //It's not empty or one term query
     $query = new Zend_Search_Lucene_Search_Query_Phrase();

     // The second argument of addTerm() on a phrase query is the term position,
     // not a "required" flag as on a multi-term query. Accumulate the analyzer's
     // position increments so each term gets its real position; starting at -1
     // places the first token at position 0 when its increment is 1.
     // NOTE(review): assumes tokens expose getPositionIncrement() in this
     // library version - confirm.
     $position = -1;
     foreach ($tokens as $token) {
         $position += $token->getPositionIncrement();
         $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field);
         $query->addTerm($term, $position);
     }

     if ($this->_proximityQuery) {
         $query->setSlop($this->_wordsDistance);
     }
     $query->setBoost($this->_boost);

     return $query;
 }