/**
 * Apply English stemming to a single string.
 *
 * When the "stem" extension is not loaded the input is handed back untouched,
 * so callers get a usable value either way.
 *
 * @param mixed $string Value to stem; passed to stem_english() as-is.
 * @return mixed Result of stem_english() when the extension is loaded,
 *               otherwise the unmodified input.
 */
public static function stemWord($string)
{
    // Without the extension there is nothing to normalize with.
    if (!extension_loaded('stem')) {
        return $string;
    }

    return stem_english($string);
}
/**
 * Stem a word with the Russian stemmer and then with the English stemmer.
 *
 * The process locale is switched to "C" for the duration of the stemming
 * calls and put back afterwards, including when a stemmer throws.
 * NOTE(review): the reason the stemmers need the "C" locale is not visible
 * here - presumably they are locale-sensitive; confirm before removing.
 *
 * @param mixed $word Word to stem; passed to stem_russian() as-is.
 * @return mixed Whatever stem_english() returns for the Russian-stemmed word.
 */
static function _stemWord($word)
{
    // A locale of "0" only reports the current setting; nothing is changed.
    $previousLocale = \setlocale(LC_ALL, '0');
    \setlocale(LC_ALL, 'C');

    try {
        return \stem_english(\stem_russian($word));
    } finally {
        // The locale is process-wide state: restore it on every exit path so
        // a failing stemmer cannot leave the rest of the request in "C".
        \setlocale(LC_ALL, $previousLocale);
    }
}
/**
 * Stem every word and return the distinct stems in first-seen order.
 *
 * A word whose array key starts with "ru" is stemmed with
 * stem_russian_unicode(); every other word goes through stem_english().
 * Stems are memoized in self::$cache, keyed by the word itself.
 *
 * NOTE(review): the cache key does not include the language, so a word first
 * stemmed under one language is served from the cache for the other as well.
 * Confirm whether the same word can arrive under both kinds of key.
 *
 * @param array $words Words to stem; the key decides which stemmer is used.
 * @return array List of unique stems, in order of first appearance.
 */
public static function Stem($words)
{
    $uniqueStems = array();
    $seen = array();

    foreach ($words as $key => $word) {
        // Populate the cache on a miss, then always read the stem from it.
        if (!isset(self::$cache[$word])) {
            self::$cache[$word] = (substr($key, 0, 2) === 'ru')
                ? stem_russian_unicode($word)
                : stem_english($word);
        }
        $stem = self::$cache[$word];

        // Skip stems that were already emitted.
        if (isset($seen[$stem])) {
            continue;
        }

        $seen[$stem] = 1;
        $uniqueStems[] = $stem;
    }

    return $uniqueStems;
}
/**
 * Resolve the image and Lucene index locations for the requested page and
 * collect highlight coordinates for the "hl" route parameter.
 *
 * Every token of the highlight text, except the Lucene operators "and", "or"
 * and "not", is stemmed with stem_english() and searched in the page's index.
 * Each hit contributes one [x1, y1, x2, y2] entry to the result.
 *
 * @return array Map with the keys:
 *               "imgloc" - image path ('' when reel, page or hl is missing),
 *               "indloc" - index directory ('' in that same case),
 *               "coords" - list of [x1, y1, x2, y2] arrays, possibly empty.
 */
public function sortOutHlCoords()
{
    // Lucene operators - these are dropped from the highlight query
    $operators = array("and", "or", "not");

    $config = $this->getServiceLocator()->get('config');
    $paramInfo = $this->sortOutParams($config);

    // collect building blocks
    $resLoc = $paramInfo['resLoc'];
    $site = $paramInfo['site'];
    $collection = $paramInfo['collection'];
    $container = $paramInfo['container'];
    $reel = $paramInfo['reel'];
    $page = $paramInfo['page'];

    // the all important query
    $hl = $this->params()->fromRoute('hl', '');

    // coordinates to pass back
    $coords = [];

    // pass back empty coordinate set if any of these parameters are missing
    if ($this->isNullOrEmpty($reel)
        || $this->isNullOrEmpty($page)
        || $this->isNullOrEmpty($hl)
    ) {
        return array("imgloc" => '', "indloc" => '', "coords" => $coords);
    }

    // location of files - ODW file layout
    $resLoc .= '/' . $site . '/' . $collection . '/' . $container . '/'
        . $reel . '/odw/' . $page . '/';
    $imgLoc = $resLoc . '../../' . $page . '.jpg';
    $iaLoc = $resLoc . 'ia/' . $page . '.jpg';

    // not all images will have IA derivative
    if (file_exists($iaLoc)) {
        $imgLoc = $iaLoc;
    }

    $indLoc = $resLoc . 'index/imgworks';

    // need index directory and segments file to be valid lucene layout
    if (!file_exists($indLoc . '/segments.gen')) {
        return array("imgloc" => $imgLoc, "indloc" => $indLoc, "coords" => $coords);
    }

    // use Lucene tokens for searching
    $searchParts = [];
    $queryTokens = Analyzer\Analyzer::getDefault()->tokenize($hl);
    foreach ($queryTokens as $token) {
        $searchTerm = $token->getTermText();
        if (in_array($searchTerm, $operators, true)) {
            continue;
        }

        // no snowball analyzer or other stemming option in Lucene 2.x,
        // so create the stem separately
        $stem = stem_english($searchTerm);

        // Lucene dropped this limitation after 2.x, but this version won't
        // wildcard without at least 3 characters in the term. The wildcard is
        // attached to the stem, so it is the stem's length that has to be
        // checked - stemming can shorten a 3-character word below the limit.
        if (strlen($stem) >= 3) {
            $stem .= '*';
        }

        $searchParts[] = $stem;
    }

    // Separate every term with a space, including short ones that get no
    // wildcard; otherwise neighbouring terms would merge into one word.
    $searchText = implode(' ', $searchParts);

    // now do search
    $index = Lucene\Lucene::open($indLoc);
    $searchResults = $index->find($searchText);

    // assemble results
    foreach ($searchResults as $searchResult) {
        $coords[] = [
            $searchResult->x1,
            $searchResult->y1,
            $searchResult->x2,
            $searchResult->y2,
        ];
    }

    // pass back image and index location in addition to results
    return array("imgloc" => $imgLoc, "indloc" => $indLoc, "coords" => $coords);
}