Analyzer::getDefault, ZendSearch\Lucene\Analysis\Analyzer PHPのコード例

コード例 #1

0

ファイルを表示

ファイル: SearchController.php プロジェクト: diskojulien/keratine-skeleton

 /**
  * Searches the "zendsearch" service with one fuzzy sub-query per analyzed token of the
  * "query" request parameter and renders the admin search template.
  *
  * @param Request $request request carrying the "query" parameter
  * @return mixed the value returned by the "twig" service's render() call
  */
 public function searchAction(Request $request)
 {
     $queryString = $request->get('query');
     // count total documents indexed
     $numDocs = $this->get('zendsearch')->numDocs();
     // parse query string and return a Query object.
     // $query = Search\QueryParser::parse($queryString, 'UTF-8');
     $queryTokens = Analyzer::getDefault()->tokenize($queryString, 'UTF-8');
     $query = new Search\Query\Boolean();
     foreach ($queryTokens as $token) {
         // null sign: the sub-query is neither required nor prohibited
         $query->addSubquery(new Search\Query\Fuzzy(new Index\Term($token->getTermText()), 0.5), null);
     }
     // process query
     $results = $this->get('zendsearch')->find($query);
     // sort results by descending score (MultiSearch does not sort the results between the different indices).
     // A closure replaces create_function() (removed in PHP 8.0) and returns the
     // negative/zero/positive integer that usort() expects instead of a boolean.
     usort($results, function ($a, $b) {
         if ($a->score == $b->score) {
             return 0;
         }
         return $a->score < $b->score ? 1 : -1;
     });
     // // paginate results
     // $results = new \Zend\Paginator\Paginator(new \Zend\Paginator\Adapter\ArrayAdapter($results));
     // $results->setCurrentPageNumber($page);
     // $results->setItemCountPerPage($rpp);
     // // fetch results entities
     // $dataResults = array();
     // foreach ($results as $hit) {
     //     $document = $hit->getDocument();
     //     $repository = $this->get('orm.em')->getRepository( $document->getFieldValue('entityClass') );
     //     $dataResults[] = $repository->find( $document->getFieldValue('id') );
     // }
     // $results = $dataResults;
     return $this->get('twig')->render('admin/search.html.twig', array('query' => $queryString, 'numDocs' => $numDocs, 'results' => $results));
 }

コード例 #2

0

ファイルを表示

ファイル: ConfigTest.php プロジェクト: Cliffus/laravel-lucene-search

 /**
  * Registers a token filter mock under "filterClass1", expects the analyzer mock to
  * receive it exactly once through addFilter(), and checks that the analyzer is the
  * Lucene default after setHighlighterAnalyzer() has run.
  */
 public function testSetHighlighterAnalyzer()
 {
     $filterMock = m::mock('ZendSearch\\Lucene\\Analysis\\TokenFilter\\TokenFilterInterface');
     $this->app->instance('filterClass1', $filterMock);

     $this->analyzer
         ->shouldReceive('addFilter')
         ->with($filterMock)
         ->once();

     $this->config->setHighlighterAnalyzer();

     $this->assertEquals($this->analyzer, Analyzer::getDefault());
 }

コード例 #3

0

ファイルを表示

ファイル: AnalysisTest.php プロジェクト: tonylow/skillslink

 /**
  * Checks that the default analyzer implements AnalyzerInterface and that
  * setDefault()/getDefault() round-trip the very same instance.
  */
 public function testAnalyzer()
 {
     $originalAnalyzer = Analyzer::getDefault();
     $implementsInterface = $originalAnalyzer instanceof AnalyzerInterface;
     $this->assertTrue($implementsInterface);

     // Swap in a Utf8Num analyzer and make sure it is handed back unchanged
     $replacementAnalyzer = new Common\Utf8Num();
     Analyzer::setDefault($replacementAnalyzer);
     $isSameInstance = Analyzer::getDefault() === $replacementAnalyzer;
     $this->assertTrue($isSameInstance);

     // Restore the previous default, other tests rely on it
     Analyzer::setDefault($originalAnalyzer);
 }

コード例 #4

0

ファイルを表示

ファイル: ImgHlController.php プロジェクト: OurDigitalWorld/imgworks

 /**
  * Resolves the image location, the index location and the highlight coordinates
  * for the routed "hl" query.
  *
  * The query is tokenized with the default Lucene analyzer; every token that is not a
  * Lucene operator is stemmed and, when it has at least 3 characters, turned into a
  * prefix wildcard. The resulting terms are searched in the page-level index and the
  * x1/y1/x2/y2 values of each hit are collected.
  *
  * @return array map with the keys "imgloc", "indloc" and "coords"
  */
 public function sortOutHlCoords()
 {
     //Lucene operators
     $operators = array("and", "or", "not");
     $config = $this->getServiceLocator()->get('config');
     $paramInfo = $this->sortOutParams($config);
     //collect building blocks
     $resLoc = $paramInfo['resLoc'];
     $site = $paramInfo['site'];
     $collection = $paramInfo['collection'];
     $container = $paramInfo['container'];
     $reel = $paramInfo['reel'];
     $page = $paramInfo['page'];
     //the all important query
     $hl = $this->params()->fromRoute('hl', '');
     //coordinates to pass back
     $coords = [];
     //pass back empty coordinate set if any of these parameters
     //are missing
     if ($this->isNullOrEmpty($reel) || $this->isNullOrEmpty($page) || $this->isNullOrEmpty($hl)) {
         return array("imgloc" => '', "indloc" => '', "coords" => $coords);
     }
     //location of files - ODW file layout
     $resLoc .= '/' . $site . '/' . $collection . '/' . $container . '/' . $reel . '/odw/' . $page . '/';
     $imgLoc = $resLoc . '../../' . $page . '.jpg';
     $iaLoc = $resLoc . 'ia/' . $page . '.jpg';
     //not all images will have IA derivative
     if (file_exists($iaLoc) !== false) {
         $imgLoc = $iaLoc;
     }
     $indLoc = $resLoc . 'index/imgworks';
     //need index directory and segments file to be valid lucene layout
     if (!file_exists($indLoc . '/segments.gen')) {
         return array("imgloc" => $imgLoc, "indloc" => $indLoc, "coords" => $coords);
     }
     //use Lucene tokens for searching
     $searchTerms = [];
     $queryTokens = Analyzer\Analyzer::getDefault()->tokenize($hl);
     foreach ($queryTokens as $token) {
         $searchTerm = $token->getTermText();
         if (in_array($searchTerm, $operators, true)) {
             continue;
         }
         //no snowball analyzer or other stemming option
         //in Lucene 2.x, so create stem separately
         $stemmedTerm = stem_english($searchTerm);
         //Lucene dropped this limitation after 2.x
         //but this version won't wildcard without
         //at least 3 characters in term
         if (strlen($searchTerm) >= 3) {
             $stemmedTerm .= '*';
         }
         $searchTerms[] = $stemmedTerm;
     }
     //join with a space so that a short term (which gets no wildcard suffix)
     //is not glued onto the term that follows it
     $searchText = implode(' ', $searchTerms);
     //now do search
     $index = Lucene\Lucene::open($indLoc);
     $searchResults = $index->find($searchText);
     //assemble results
     foreach ($searchResults as $searchResult) {
         $coords[] = [$searchResult->x1, $searchResult->y1, $searchResult->x2, $searchResult->y2];
     }
     //pass back image and index location in addition to results
     return array("imgloc" => $imgLoc, "indloc" => $indLoc, "coords" => $coords);
 }

コード例 #5

0

ファイルを表示

ファイル: DocumentWriter.php プロジェクト: avbrugen/uace-laravel

 /**
  * Adds a document to this segment.
  *
  * For every field of the document the method updates the in-memory term dictionary
  * and the per-term document/position lists, computes the field norm byte for this
  * document, registers the field through addField(), and finally hands the stored
  * fields to addStoredFields().
  *
  * @param \ZendSearch\Lucene\Document $document
  * @throws LuceneException\UnsupportedMethodCallException when a field requests term vector storage
  */
 public function addDocument(Document $document)
 {
     $storedFields = array();
     // Norm bytes computed for this document, keyed by field name
     $docNorms = array();
     $similarity = AbstractSimilarity::getDefault();
     foreach ($document->getFieldNames() as $fieldName) {
         $field = $document->getField($fieldName);
         if ($field->storeTermVector) {
             /**
              * @todo term vector storing support
              */
             throw new LuceneException\UnsupportedMethodCallException('Store term vector functionality is not supported yet.');
         }
         if ($field->isIndexed) {
             if ($field->isTokenized) {
                 // Tokenized field: run the value through the default analyzer token by token
                 $analyzer = Analyzer\Analyzer::getDefault();
                 $analyzer->setInput($field->value, $field->encoding);
                 $position = 0;
                 $tokenCounter = 0;
                 while (($token = $analyzer->nextToken()) !== null) {
                     $tokenCounter++;
                     $term = new Index\Term($token->getTermText(), $field->name);
                     $termKey = $term->key();
                     if (!isset($this->_termDictionary[$termKey])) {
                         // New term
                         $this->_termDictionary[$termKey] = $term;
                         $this->_termDocs[$termKey] = array();
                         $this->_termDocs[$termKey][$this->_docCount] = array();
                     } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                         // Existing term, but new term entry
                         $this->_termDocs[$termKey][$this->_docCount] = array();
                     }
                     // Positions accumulate the increments reported by each token
                     $position += $token->getPositionIncrement();
                     $this->_termDocs[$termKey][$this->_docCount][] = $position;
                 }
                 if ($tokenCounter == 0) {
                     // Field contains empty value. Treat it as non-indexed and non-tokenized.
                     // The field is cloned so the flags of the caller's field object stay untouched.
                     $field = clone $field;
                     $field->isIndexed = $field->isTokenized = false;
                 } else {
                     // Norm = encoded (length norm * document boost * field boost), stored as a single byte
                     $docNorms[$field->name] = chr($similarity->encodeNorm($similarity->lengthNorm($field->name, $tokenCounter) * $document->boost * $field->boost));
                 }
             } elseif (($fieldUtf8Value = $field->getUtf8Value()) == '') {
                 // Field contains empty value. Treat it as non-indexed and non-tokenized
                 $field = clone $field;
                 $field->isIndexed = $field->isTokenized = false;
             } else {
                 // Non-tokenized field: the whole UTF-8 value becomes a single term
                 $term = new Index\Term($fieldUtf8Value, $field->name);
                 $termKey = $term->key();
                 if (!isset($this->_termDictionary[$termKey])) {
                     // New term
                     $this->_termDictionary[$termKey] = $term;
                     $this->_termDocs[$termKey] = array();
                     $this->_termDocs[$termKey][$this->_docCount] = array();
                 } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                     // Existing term, but new term entry
                     $this->_termDocs[$termKey][$this->_docCount] = array();
                 }
                 // The single term is recorded at position 0
                 $this->_termDocs[$termKey][$this->_docCount][] = 0;
                 // Norm is computed for a field length of exactly one term
                 $docNorms[$field->name] = chr($similarity->encodeNorm($similarity->lengthNorm($field->name, 1) * $document->boost * $field->boost));
             }
         }
         if ($field->isStored) {
             $storedFields[] = $field;
         }
         // NOTE(review): addField() is defined outside this view; presumably it registers
         // the field in $this->_fields, which the loop below iterates - confirm.
         $this->addField($field);
     }
     // Append one norm byte per indexed field known to this writer
     foreach ($this->_fields as $fieldName => $field) {
         if (!$field->isIndexed) {
             continue;
         }
         if (!isset($this->_norms[$fieldName])) {
             // First time this field is seen: back-fill a zero-length norm for each of the
             // $this->_docCount documents counted so far
             $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))), $this->_docCount);
         }
         if (isset($docNorms[$fieldName])) {
             $this->_norms[$fieldName] .= $docNorms[$fieldName];
         } else {
             // This document has no indexed value for the field: use the zero-length norm
             $this->_norms[$fieldName] .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0)));
         }
     }
     // NOTE(review): $this->_docCount is not incremented in this method; presumably
     // addStoredFields() advances it - confirm.
     $this->addStoredFields($storedFields);
 }

コード例 #6

0

ファイルを表示

ファイル: Search23Test.php プロジェクト: tonylow/skillslink

 /**
  * Checks that a term the default analyzer filters out completely (a long digit
  * string) does not stop the query parser from finding the single expected hit.
  */
 public function testFilteredTokensQueryParserProcessing()
 {
     $index = Lucene\Lucene::open(__DIR__ . '/_index23Sample/_files');

     // The digit-only string yields no tokens with the default analyzer
     $digitTokens = \ZendSearch\Lucene\Analysis\Analyzer\Analyzer::getDefault()->tokenize('123456787654321');
     $this->assertEquals(count($digitTokens), 0);

     $hits = $index->find('"PEAR developers" AND Home AND 123456787654321');
     $this->assertEquals(count($hits), 1);

     // Each row: document id, score, stored path
     $expectedRows = array(
         array(1, 0.16827, 'IndexSource/contributing.wishlist.html'),
     );
     foreach ($hits as $position => $hit) {
         $expectedRow = $expectedRows[$position];
         $this->assertEquals($hit->id, $expectedRow[0]);
         $scoreDelta = abs($hit->score - $expectedRow[1]);
         $this->assertTrue($scoreDelta < 1.0E-6);
         $this->assertEquals($hit->path, $expectedRow[2]);
     }
 }

コード例 #7

0

ファイルを表示

ファイル: Fuzzy.php プロジェクト: avbrugen/uace-laravel

 /**
  * Query specific matches highlighting
  *
  * Tokenizes the document's "body" field and highlights every token that starts with
  * the term's prefix and whose similarity to the term exceeds the minimum similarity.
  *
  * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Highlighter $highlighter)
 {
     $words = array();
     $prefix = Index\Term::getPrefix($this->_term->text, $this->_prefixLength);
     $prefixByteLength = strlen($prefix);
     $prefixUtf8Length = Index\Term::getLength($prefix);
     $termRest = substr($this->_term->text, $prefixByteLength);
     // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
     $termRestLength = strlen($termRest);
     $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
     $tokens = Lucene\Analysis\Analyzer\Analyzer::getDefault()->tokenize($docBody, 'UTF-8');
     foreach ($tokens as $token) {
         $termText = $token->getTermText();
         if (substr($termText, 0, $prefixByteLength) == $prefix) {
             // Calculate similarity
             $target = substr($termText, $prefixByteLength);
             // byte length of the candidate's remainder, reused by every branch below
             $targetLength = strlen($target);
             // reuse the cached maximum edit distance for this length when there is one
             $maxDistance = isset($this->_maxDistances[$targetLength]) ? $this->_maxDistances[$targetLength] : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, $targetLength);
             if ($termRestLength == 0) {
                 // we don't have anything to compare.  That means if we just add
                 // the letters for current term we get the new word
                 $similarity = $prefixUtf8Length == 0 ? 0 : 1 - $targetLength / $prefixUtf8Length;
             } elseif ($targetLength == 0) {
                 $similarity = $prefixUtf8Length == 0 ? 0 : 1 - $termRestLength / $prefixUtf8Length;
             } elseif ($maxDistance < abs($termRestLength - $targetLength)) {
                 //just adding the characters of term to target or vice-versa results in too many edits
                 //for example "pre" length is 3 and "prefixes" length is 8.  We can see that
                 //given this optimal circumstance, the edit distance cannot be less than 5.
                 //which is 8-3 or more precisely abs(3-8).
                 //if our maximum edit distance is 4, then we can discard this word
                 //without looking at it.
                 $similarity = 0;
             } else {
                 $similarity = 1 - levenshtein($termRest, $target) / ($prefixUtf8Length + min($termRestLength, $targetLength));
             }
             if ($similarity > $this->_minimumSimilarity) {
                 $words[] = $termText;
             }
         }
     }
     $highlighter->highlight($words);
 }

コード例 #8

0

ファイルを表示

ファイル: HTML.php プロジェクト: tonylow/skillslink

 /**
  * Highlight text using specified View helper or callback function.
  *
  * The words are tokenized with the default analyzer (using the document encoding) and
  * every text node below /html/body is passed through the highlighting routine.
  *
  * @param string|array $words  Words to highlight. Words could be organized using the array or string.
  * @param callback $callback   Callback method, used to transform (highlighting) text.
  * @param array    $params     Array of additional callback parameters passed through into it
  *                             (first non-optional parameter is an HTML fragment for highlighting)
  * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException when $callback is not callable
  * @return string the document serialized with saveHTML()
  */
 public function highlightExtended($words, $callback, $params = array())
 {
     if (!is_array($words)) {
         $words = array($words);
     }
     // Merge the token lists one by one: this also copes with an empty $words array,
     // where call_user_func_array('array_merge', array()) would call array_merge()
     // without arguments.
     $wordsToHighlight = array();
     $analyzer = Analyzer\Analyzer::getDefault();
     foreach ($words as $wordString) {
         $wordsToHighlight = array_merge($wordsToHighlight, $analyzer->tokenize($wordString, $this->_doc->encoding));
     }
     if (count($wordsToHighlight) == 0) {
         // Nothing to highlight: hand back the document unchanged
         return $this->_doc->saveHTML();
     }
     // Term text => token index map handed to the recursive highlighter
     $wordsToHighlightFlipped = array();
     foreach ($wordsToHighlight as $id => $token) {
         $wordsToHighlightFlipped[$token->getTermText()] = $id;
     }
     if (!is_callable($callback)) {
         throw new InvalidArgumentException('$viewHelper parameter mast be a View Helper name, View Helper object or callback.');
     }
     $xpath = new \DOMXPath($this->_doc);
     $matchedNodes = $xpath->query("/html/body");
     foreach ($matchedNodes as $matchedNode) {
         $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
     }
     // Return the highlighted markup, as documented and as the early-return path already does
     return $this->_doc->saveHTML();
 }

コード例 #9

0

ファイルを表示

ファイル: Wildcard.php プロジェクト: avbrugen/uace-laravel

 /**
  * Query specific matches highlighting
  *
  * Converts the wildcard pattern into an anchored regular expression and highlights
  * every token of the document's "body" field that matches it.
  *
  * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Highlighter $highlighter)
 {
     // Quote the pattern, then turn the quoted wildcards back into regex wildcards
     $quotedPattern = preg_quote($this->_pattern->text, '/');
     $regexBody = str_replace(array('\\?', '\\*'), array('.', '.*'), $quotedPattern);
     $matchExpression = '/^' . $regexBody . '$/';

     // Probe PCRE for unicode support; warnings raised by the probe are captured
     ErrorHandler::start(E_WARNING);
     $unicodeProbe = preg_match('/\\pL/u', 'a');
     ErrorHandler::stop();
     if ($unicodeProbe == 1) {
         // unicode is available, so match with the Unicode modifier
         $matchExpression .= 'u';
     }

     $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
     $matchedWords = array();
     foreach (Analyzer::getDefault()->tokenize($docBody, 'UTF-8') as $token) {
         $termText = $token->getTermText();
         if (preg_match($matchExpression, $termText) === 1) {
             $matchedWords[] = $termText;
         }
     }

     $highlighter->highlight($matchedWords);
 }

コード例 #10

0

ファイルを表示

ファイル: Range.php プロジェクト: avbrugen/uace-laravel

 /**
  * Query specific matches highlighting
  *
  * Highlights every token of the document's "body" field whose text lies between the
  * lower and the upper term; a missing boundary leaves that side open.
  *
  * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Highlighter $highlighter)
 {
     $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
     $tokens = Lucene\Analysis\Analyzer\Analyzer::getDefault()->tokenize($docBody, 'UTF-8');

     $lowerTermText = null;
     if ($this->_lowerTerm !== null) {
         $lowerTermText = $this->_lowerTerm->text;
     }
     $upperTermText = null;
     if ($this->_upperTerm !== null) {
         $upperTermText = $this->_upperTerm->text;
     }

     $words = array();
     foreach ($tokens as $token) {
         $termText = $token->getTermText();
         if ($this->_inclusive) {
             // closed interval: the boundary terms themselves match
             $aboveLower = $lowerTermText == null || $lowerTermText <= $termText;
             $belowUpper = $upperTermText == null || $termText <= $upperTermText;
         } else {
             // open interval: the boundary terms are excluded
             $aboveLower = $lowerTermText == null || $lowerTermText < $termText;
             $belowUpper = $upperTermText == null || $termText < $upperTermText;
         }
         if ($aboveLower && $belowUpper) {
             $words[] = $termText;
         }
     }

     $highlighter->highlight($words);
 }

コード例 #11

0

ファイルを表示

ファイル: QueryParser.php プロジェクト: avbrugen/uace-laravel

 /**
  * Process last range query term (closed interval)
  *
  * Tokenizes both boundary terms, builds an inclusive range query from them and adds
  * it to the current context as a sub-query entry.
  *
  * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException when a boundary is
  *         tokenized into several terms or when both boundaries are empty
  */
 public function closedRQLastTerm()
 {
     // Lower boundary
     $lowerTokens = Analyzer\Analyzer::getDefault()->tokenize($this->_rqFirstTerm, $this->_encoding);
     if (count($lowerTokens) > 1) {
         throw new QueryParserException('Range query boundary terms must be non-multiple word terms');
     }
     $from = null;
     if (count($lowerTokens) == 1) {
         $from = new Index\Term(reset($lowerTokens)->getTermText(), $this->_context->getField());
     }

     // Upper boundary
     $upperTokens = Analyzer\Analyzer::getDefault()->tokenize($this->_currentToken->text, $this->_encoding);
     if (count($upperTokens) > 1) {
         throw new QueryParserException('Range query boundary terms must be non-multiple word terms');
     }
     $to = null;
     if (count($upperTokens) == 1) {
         $to = new Index\Term(reset($upperTokens)->getTermText(), $this->_context->getField());
     }

     if ($from === null && $to === null) {
         throw new QueryParserException('At least one range query boundary term must be non-empty term');
     }

     // true => inclusive (closed) interval
     $this->_context->addEntry(new QueryEntry\Subquery(new Query\Range($from, $to, true)));
 }

コード例 #12

0

ファイルを表示

ファイル: Term.php プロジェクト: avbrugen/uace-laravel

 /**
  * Query specific matches highlighting
  *
  * When the word contains "*" or "?" it is rebuilt as a wildcard pattern from its
  * analyzed parts and delegated to a Wildcard query; otherwise the word is tokenized
  * and the resulting term texts are highlighted directly.
  *
  * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Highlighter $highlighter)
 {
     /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */
     /** Skip exact term matching recognition, keyword fields highlighting is not supported */
     // -------------------------------------
     // Recognize wildcard queries
     /**
      * @todo check for PCRE unicode support may be performed through Zend_Environment in some future
      */
     // Probe PCRE unicode support by matching a \pL class with the "u" modifier;
     // warnings raised by the probe are captured by the error handler.
     ErrorHandler::start(E_WARNING);
     $result = preg_match('/\\pL/u', 'a');
     ErrorHandler::stop();
     if ($result == 1) {
         // Unicode available: work on a UTF-8 copy of the word
         $word = iconv($this->_encoding, 'UTF-8', $this->_word);
         $wildcardsPattern = '/[*?]/u';
         $subPatternsEncoding = 'UTF-8';
     } else {
         // No unicode support: keep the word in its original encoding
         $word = $this->_word;
         $wildcardsPattern = '/[*?]/';
         $subPatternsEncoding = $this->_encoding;
     }
     // Each entry is array(text, offset) because of PREG_SPLIT_OFFSET_CAPTURE
     $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);
     if (count($subPatterns) > 1) {
         // Wildcard query is recognized
         $pattern = '';
         foreach ($subPatterns as $id => $subPattern) {
             // Append corresponding wildcard character to the pattern before each sub-pattern (except first).
             // The wildcard is the single character located right before the sub-pattern's offset.
             if ($id != 0) {
                 $pattern .= $word[$subPattern[1] - 1];
             }
             // Check if each sub-pattern is a single word in terms of current analyzer
             $tokens = Analyzer\Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding);
             if (count($tokens) > 1) {
                 // Do nothing (nothing is highlighted)
                 return;
             }
             // At most one token here; an empty sub-pattern contributes nothing
             foreach ($tokens as $token) {
                 $pattern .= $token->getTermText();
             }
         }
         // Delegate the highlighting to a Wildcard query built from the analyzed pattern
         $term = new Index\Term($pattern, $this->_field);
         $query = new Query\Wildcard($term);
         $query->_highlightMatches($highlighter);
         return;
     }
     // -------------------------------------
     // Recognize one-term multi-term and "insignificant" queries
     $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
     if (count($tokens) == 0) {
         // Do nothing
         return;
     }
     if (count($tokens) == 1) {
         // Single token: highlight() receives a string rather than an array
         $highlighter->highlight($tokens[0]->getTermText());
         return;
     }
     //It's not insignificant or one term query
     $words = array();
     foreach ($tokens as $token) {
         $words[] = $token->getTermText();
     }
     $highlighter->highlight($words);
 }

コード例 #13

0

ファイルを表示

ファイル: Phrase.php プロジェクト: avbrugen/uace-laravel

 /**
  * Query specific matches highlighting
  *
  * Tokenizes the phrase with the default analyzer and highlights the resulting term
  * texts: nothing for no tokens, a single string for one token, an array otherwise.
  *
  * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Highlighter $highlighter)
 {
     /** Fields detection is skipped: all fields are expected in the HTML body and are not differentiated */
     /** Exact term matching recognition is skipped: keyword fields highlighting is not supported */
     /** Wildcard queries recognition is skipped: supported wildcards are removed by the text analyzer */
     $tokens = Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding);
     $tokenCount = count($tokens);

     if ($tokenCount == 0) {
         // Nothing significant in the phrase
         return;
     }

     if ($tokenCount == 1) {
         // Trivial phrase: a single term is highlighted as a plain string
         $highlighter->highlight($tokens[0]->getTermText());
         return;
     }

     // Non-trivial phrase: highlight every term text, re-indexed from zero
     $words = array_values(array_map(function ($token) {
         return $token->getTermText();
     }, $tokens));
     $highlighter->highlight($words);
 }

コード例 #14

0

ファイルを表示

ファイル: Fuzzy.php プロジェクト: avbrugen/uace-laravel

 /**
  * Query specific matches highlighting
  *
  * Delegates to a Fuzzy query when the word contains no wildcard and is analyzed into
  * exactly one token; in every other case nothing is highlighted.
  *
  * @param Highlighter $highlighter  Highlighter object (also contains doc for highlighting)
  */
 protected function _highlightMatches(Highlighter $highlighter)
 {
     /** Fields detection is skipped: all fields are expected in the HTML body and are not differentiated */
     /** Exact term matching recognition is skipped: keyword fields highlighting is not supported */
     /**
      * @todo check for PCRE unicode support may be performed through Zend_Environment in some future
      */
     // Probe PCRE unicode support; warnings raised by the probe are captured
     ErrorHandler::start(E_WARNING);
     $unicodeProbe = preg_match('/\\pL/u', 'a');
     ErrorHandler::stop();

     // Split the word on wildcard characters, in UTF-8 when unicode is available
     if ($unicodeProbe == 1) {
         $splitPattern = '/[*?]/u';
         $subject = iconv($this->_encoding, 'UTF-8', $this->_word);
     } else {
         $splitPattern = '/[*?]/';
         $subject = $this->_word;
     }
     $subPatterns = preg_split($splitPattern, $subject);
     if (count($subPatterns) > 1) {
         // The word contains a wildcard: nothing is highlighted
         return;
     }

     $tokens = Analyzer\Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
     if (count($tokens) != 1) {
         // No token: insignificant word. Several tokens: fuzzy search is supported
         // only for non-multiple word terms. Either way nothing is highlighted.
         return;
     }

     $fuzzyQuery = new Query\Fuzzy(
         new Index\Term($tokens[0]->getTermText(), $this->_field),
         $this->_minimumSimilarity
     );
     $fuzzyQuery->_highlightMatches($highlighter);
 }

PHP ZendSearch\Lucene\Analysis\Analyzer Analyzer::getDefaultの例