/**
 * Builds the document from raw PDF data.
 *
 * The text returned by the parser becomes the "body" field, and every entry
 * returned by getDetails() becomes a text field named after the lower-cased
 * detail key ("author" is stored as "creator").
 *
 * @param string  $data         PDF content passed to Parser::parseContent()
 * @param boolean $storeContent true: body is added via Field::Text(); false: via Field::UnStored()
 * @throws NotIndexedException wraps any exception raised while parsing or adding fields
 */
private function __construct($data, $storeContent)
{
    //TODO check PDF >1.5 metadata extraction
    $pdfParser = new Parser();

    try {
        $parsedPdf = $pdfParser->parseContent($data);
        $text      = $parsedPdf->getText();

        // Body is always added; $storeContent only selects the field factory
        $bodyField = $storeContent
            ? Document\Field::Text('body', $text, 'UTF-8')
            : Document\Field::UnStored('body', $text, 'UTF-8');
        $this->addField($bodyField);

        // One text field per meta data entry
        foreach ($parsedPdf->getDetails() as $detailName => $detailValue) {
            $fieldName = strtolower($detailName);
            if ($fieldName === 'author') {
                $fieldName = 'creator';
            }
            $this->addField(Document\Field::Text($fieldName, $detailValue, 'UTF-8'));
        }
    } catch (\Exception $previous) {
        throw new NotIndexedException(null, null, $previous);
    }
}
/**
 * Adds a searchable object to the index and commits right away.
 *
 * The document receives one keyword field per entry of getMetaInfoArray($obj),
 * one text field per search attribute and, for ContentActiveRecord instances,
 * a "comments" text field holding all comment messages of the record.
 *
 * @param Searchable $obj object to add
 */
public function add(Searchable $obj)
{
    $searchAttributes = $obj->getSearchAttributes();
    $luceneIndex      = $this->getIndex();
    $document         = new \ZendSearch\Lucene\Document();

    // Meta data fields
    foreach ($this->getMetaInfoArray($obj) as $metaName => $metaValue) {
        $document->addField(\ZendSearch\Lucene\Document\Field::keyword($metaName, $metaValue));
    }

    // Search attributes provided by the object itself
    foreach ($searchAttributes as $attributeName => $attributeValue) {
        $document->addField(\ZendSearch\Lucene\Document\Field::Text($attributeName, $attributeValue, 'UTF-8'));
    }

    // Comments are only collected for content records
    if ($obj instanceof ContentActiveRecord) {
        $relatedComments = Comment::findAll(['object_id' => $obj->getPrimaryKey(), 'object_model' => $obj->className()]);

        $messages = array();
        foreach ($relatedComments as $relatedComment) {
            $messages[] = $relatedComment->message;
        }

        // Every message is prefixed by a single space, as in a running concatenation
        $commentText = $messages ? " " . implode(" ", $messages) : "";
        $document->addField(\ZendSearch\Lucene\Document\Field::Text('comments', $commentText, 'UTF-8'));
    }

    // Progress marker when running on the console
    if (\Yii::$app->request->isConsoleRequest) {
        echo ".";
    }

    $luceneIndex->addDocument($document);
    $luceneIndex->commit();
}
/**
 * addField() must return a Document so that calls can be chained.
 */
public function testAddFieldMethodChaining()
{
    // A single call returns a Document instance
    $single   = new Document();
    $returned = $single->addField(Document\Field::Text('title', 'Title'));
    $this->assertTrue($returned instanceof Document);

    // Several calls can be chained on a fresh document
    $chained = new Document();
    $chained
        ->addField(Document\Field::Text('title', 'Title'))
        ->addField(Document\Field::Text('annotation', 'Annotation'))
        ->addField(Document\Field::Text('body', 'Document body, document body, document body...'));
}
/**
 * A LuceneResult built from a query hit must expose the data of the indexed file.
 *
 * @dataProvider searchResultDataProvider
 */
function testSearchLuceneResultContent($fileId, $name, $path, $size, $score, $mimeType, $modified, $container)
{
    require_once __DIR__ . '/util/dummyindex.php';

    $dummyIndex = new DummyIndex();

    // Index one document describing the file
    $fileDocument = new Document();
    $fileDocument->addField(Document\Field::Keyword('fileId', $fileId));
    $fileDocument->addField(Document\Field::Text('path', '/test/files' . $path, 'UTF-8'));
    $fileDocument->addField(Document\Field::unIndexed('mtime', $modified));
    $fileDocument->addField(Document\Field::unIndexed('size', $size));
    $fileDocument->addField(Document\Field::unIndexed('mimetype', $mimeType));
    $dummyIndex->addDocument($fileDocument);

    // Hand-made hit pointing at the document added above
    $queryHit = new QueryHit($dummyIndex);
    $queryHit->score = $score;
    $queryHit->id = 0;
    $queryHit->document_id = 0;

    $result = new \OCA\Search_Lucene\Search\LuceneResult($queryHit);

    $this->assertInstanceOf('OCA\\Search_Lucene\\Search\\LuceneResult', $result);
    $this->assertEquals($fileId, $result->id);
    $this->assertEquals('lucene', $result->type);
    $this->assertEquals($path, $result->path);
    $this->assertEquals($name, $result->name);
    $this->assertEquals($mimeType, $result->mime_type);
    $this->assertEquals($size, $result->size);
    $this->assertEquals($score, $result->score);
    $this->assertEquals($modified, $result->modified);
}
/**
 * update() is expected to look up the existing hit for the model, delete it
 * and add a freshly built document to the Lucene index.
 */
public function testUpdate()
{
    $luceneIndex = m::mock();
    $this->connection->shouldReceive('getIndex')->andReturn($luceneIndex);

    // The document handed to addDocument() must equal this hand-built one
    $matchesExpectedDocument = m::on(function ($actualDocument) {
        $expected = new Document();
        $expected->addField(Field::keyword('primary_key', 1));
        $expected->addField(Field::Keyword('class_uid', '12345'));

        $nameField = Field::unStored('name', 'test name');
        $nameField->boost = 1;
        $expected->addField($nameField);

        $optionalField = Field::unStored('optional_attribute1', 'optional value');
        $optionalField->boost = 1;
        $expected->addField($optionalField);

        $this->assertEquals($expected, $actualDocument);

        return true;
    });
    $luceneIndex->shouldReceive('addDocument')->with($matchesExpectedDocument)->once();

    // The lookup must use a multi-term query on primary key and class uid
    $matchesExpectedQuery = m::on(function ($actualQuery) {
        $expected = new MultiTerm();
        $expected->addTerm(new Term(1, 'primary_key'), true);
        $expected->addTerm(new Term('12345', 'class_uid'), true);

        $this->assertEquals($expected, $actualQuery);

        return true;
    });
    $luceneIndex
        ->shouldReceive('find')
        ->with($matchesExpectedQuery)
        ->andReturnUsing(function () {
            $existingHit = m::mock();
            $existingHit->id = 10;

            return [$existingHit];
        })
        ->once();

    // The hit returned by find() has to be deleted by its id
    $luceneIndex->shouldReceive('delete')->with(10)->once();

    $this->createIndex()->update($this->model);
}
/**
 * {@inheritdoc}
 *
 * Builds a Lucene document from the product (identifier, name, short
 * description, description), adds it to the requested index and commits.
 *
 * @param ProductInterface $product   product to index
 * @param string           $indexName name of the index to write to
 */
public function addProduct(ProductInterface $product, $indexName = ProductIndexerInterface::DEFAULT_INDEX_NAME)
{
    // Use the index requested by the caller; the parameter used to be ignored
    // and the default index was always written to.
    $index = $this->searchIndexManager->getIndex($indexName);

    $document = new Document();
    $document->addField(Field::unIndexed('identifier', $product->getId()));
    // NOTE(review): the name is read from the 'en' translation while the
    // descriptions use translate() without a locale - confirm this is intended.
    $document->addField(Field::text('name', $product->translate('en')->getName()));
    $document->addField(Field::text('shortDescription', $product->translate()->getShortDescription()));
    $document->addField(Field::text('description', $product->translate()->getDescription()));

    $index->addDocument($document);
    $index->commit();
}
/**
 * Creates the Lucene document for an entity.
 *
 * The document holds an unindexed "identifier" field with the entity id and
 * one text field per SearchField of the entity's context.
 *
 * @param EntityInterface $entity
 * @return Document
 */
public function createDocument(EntityInterface $entity) : Document
{
    $luceneDocument = new Document();
    $searchFields   = $this->createContext($entity)->getFieldsCollection();

    $luceneDocument->addField(Field::unIndexed('identifier', $entity->getId()));

    // map() is used for its side effect only: each search field is appended to the document
    $appendField = function (SearchField $searchField) use ($luceneDocument) {
        $textField = Field::text($searchField->getName(), $searchField->getValue());
        $luceneDocument->addField($textField);
    };
    $searchFields->map($appendField);

    return $luceneDocument;
}
/**
 * Object constructor
 *
 * Reads content.xml from the given .odt package, collects the text of all
 * text:h (headline) and text:p (paragraph) elements and adds them as the
 * "headlines" and "body" fields together with the package meta data.
 *
 * @param string  $fileName     path of the .odt file
 * @param boolean $storeContent true: headlines/body are added via Field::Text(); false: via Field::UnStored()
 * @throws ExtensionNotLoadedException when the ZipArchive class is not available
 * @throws RuntimeException            when the archive cannot be opened or content.xml is missing or not parsable
 */
private function __construct($fileName, $storeContent)
{
    if (!class_exists('ZipArchive', false)) {
        throw new ExtensionNotLoadedException('Open Document Text processing functionality requires Zip extension to be loaded');
    }

    // Document data holders
    $documentHeadlines = array();
    $documentParagraphs = array();

    // Open the package. open() returns true on success and an error code
    // otherwise; reading from an archive that failed to open must be avoided.
    $package = new \ZipArchive();
    if ($package->open($fileName) !== true) {
        throw new RuntimeException('Invalid archive or corrupted .odt file.');
    }

    // Read the document content
    $content = $package->getFromName('content.xml');
    if ($content === false) {
        $package->close();
        throw new RuntimeException('Invalid archive or corrupted .odt file.');
    }

    // Prevent php from loading remote resources.
    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is only toggled on older versions.
    $toggleEntityLoader = \PHP_VERSION_ID < 80000;
    if ($toggleEntityLoader) {
        $loadEntities = libxml_disable_entity_loader(true);
    }
    $sxe = simplexml_load_string($content, 'SimpleXMLElement', LIBXML_NOBLANKS | LIBXML_COMPACT);
    if ($toggleEntityLoader) {
        // Restore entity loader state
        libxml_disable_entity_loader($loadEntities);
    }

    if ($sxe === false) {
        $package->close();
        throw new RuntimeException('Invalid archive or corrupted .odt file.');
    }

    // xpath() may return false (e.g. undefined namespace prefix); treat that as "no nodes"
    foreach ($sxe->xpath('//text:h') ?: array() as $headline) {
        $documentHeadlines[] = strip_tags($headline->asXML());
    }

    foreach ($sxe->xpath('//text:p') ?: array() as $paragraph) {
        $documentParagraphs[] = strip_tags($paragraph->asXML());
    }

    // Read core properties
    $coreProperties = $this->extractMetaData($package);

    // Close file
    $package->close();

    // Store contents
    if ($storeContent) {
        $this->addField(Field::Text('headlines', implode(' ', $documentHeadlines), 'UTF-8'));
        $this->addField(Field::Text('body', implode('', $documentParagraphs), 'UTF-8'));
    } else {
        $this->addField(Field::UnStored('headlines', implode(' ', $documentHeadlines), 'UTF-8'));
        $this->addField(Field::UnStored('body', implode('', $documentParagraphs), 'UTF-8'));
    }

    // Store meta data properties
    foreach ($coreProperties as $key => $value) {
        $this->addField(Field::Text($key, $value, 'UTF-8'));
    }

    // Store title (if not present in meta data)
    if (!isset($coreProperties['title'])) {
        $this->addField(Field::Text('title', $fileName, 'UTF-8'));
    }
}
/**
 * (Re-)indexes a group: any existing entry is removed via unindex() first,
 * then a new document is added to the given index.
 *
 * @param                      $data  group object; id, url, name_short and description are read
 * @param SearchIndexInterface $index index receiving the document
 *
 * @return IndexInterface
 */
public function index($data, SearchIndexInterface $index)
{
    $this->unindex($data, $index);

    $groupFields = array(
        Field::Keyword('group_id', $data->id),
        Field::UnIndexed('type', "group"),
        Field::UnIndexed('identifier', $data->url),
        // Both date fields hold the time of indexing
        Field::UnIndexed('date_time', date('c')),
        Field::UnIndexed('date', date('j. M. Y')),
        Field::Text('title', $data->name_short, 'utf-8'),
        Field::Text('body', $data->description, 'utf-8'),
    );

    $groupDocument = new Document();
    foreach ($groupFields as $groupField) {
        $groupDocument->addField($groupField);
    }

    $index->addDocument($groupDocument);

    return $this;
}
/**
 * (Re-)indexes a news entry: any existing entry is removed via unindex()
 * first, then a new document is added to the given index.
 *
 * @param                      $data  news object; id, created_date, title and body are read
 * @param SearchIndexInterface $index index receiving the document
 *
 * @return IndexInterface
 */
public function index($data, SearchIndexInterface $index)
{
    $this->unindex($data, $index);

    $newsFields = array(
        Field::Keyword('news_id', $data->id),
        Field::UnIndexed('type', "news"),
        Field::UnIndexed('identifier', $data->id),
        // Both date fields are derived from the creation date of the entry
        Field::UnIndexed('date_time', $data->created_date->format('c')),
        Field::UnIndexed('date', $data->created_date->format('j. M. Y')),
        Field::Text('title', $data->title, 'utf-8'),
        Field::Text('body', $data->body, 'utf-8'),
    );

    $newsDocument = new Document();
    foreach ($newsFields as $newsField) {
        $newsDocument->addField($newsField);
    }

    $index->addDocument($newsDocument);

    return $this;
}
/**
 * Create or update an indexed document
 *
 * Existing hits for the object's id are deleted, then a new document is built
 * from every property carrying a DocumentField annotation, added to the index
 * and committed.
 *
 * @param object $object object to index; must expose an "id" property to the property accessor
 */
public function index($object)
{
    $propertyAccessor = PropertyAccess::createPropertyAccessor();

    // Drop whatever is currently indexed under the same id
    $existingHits = $this->index->find('id:' . $propertyAccessor->getValue($object, 'id'));
    foreach ($existingHits as $existingHit) {
        $this->index->delete($existingHit->id);
    }

    $luceneDocument = new Document();

    // Primary key and entity class identify the object in search results
    $luceneDocument->addField(Field::keyword('id', $propertyAccessor->getValue($object, 'id')));
    $luceneDocument->addField(Field::unIndexed('entityClass', get_class($object)));

    // Only annotated properties end up in the document
    $classReflection = new ReflectionClass($object);
    foreach ($classReflection->getProperties() as $classProperty) {
        $propertyName       = $classProperty->name;
        $propertyReflection = new \ReflectionProperty($object, $propertyName);
        $fieldAnnotation    = $this->reader->getPropertyAnnotation($propertyReflection, '\\Keratine\\Lucene\\Mapping\\Annotation\\DocumentField');

        if (!$fieldAnnotation) {
            continue;
        }

        $fieldValue = $this->ensureString($propertyAccessor->getValue($object, $propertyName));

        // The annotation type selects the indexing strategy; unknown types fall back to unStored
        switch ($fieldAnnotation->type) {
            case 'keyword':
                $luceneField = Field::keyword($propertyName, $fieldValue, 'UTF-8');
                break;
            case 'unIndexed':
                $luceneField = Field::unIndexed($propertyName, $fieldValue, 'UTF-8');
                break;
            case 'binary':
                $luceneField = Field::binary($propertyName, $fieldValue);
                break;
            case 'text':
                $luceneField = Field::text($propertyName, $fieldValue, 'UTF-8');
                break;
            case 'unStored':
            default:
                $luceneField = Field::unStored($propertyName, $fieldValue, 'UTF-8');
                break;
        }

        $luceneDocument->addField($luceneField);
    }

    $this->index->addDocument($luceneDocument);
    $this->index->commit();
}
/**
 * Indexes all routed CMS pages with role "page".
 *
 * For each page the element tree is rendered to searchable content, a search
 * document is saved through SearchModel and a Lucene document (ID, content,
 * description) is added to the index of the page's language. Progress is
 * echoed. Pages whose rendering throws are reported and skipped.
 *
 * $_SERVER['REQUEST_URI'] is blanked for the duration of the run and restored
 * afterwards, even when an exception escapes.
 * NOTE(review): the reason for blanking REQUEST_URI is not visible here -
 * presumably the element rendering depends on it; confirm.
 */
public function index()
{
    // REQUEST_URI may be absent (e.g. on the command line); remember whether it existed
    $hadReqUri = array_key_exists('REQUEST_URI', $_SERVER);
    $oldReqUri = $hadReqUri ? $_SERVER['REQUEST_URI'] : null;
    $_SERVER['REQUEST_URI'] = '';

    $indexedPages = 0;

    try {
        $pageModel = new PageModel($this->indexer->getDB());
        $elementModel = new ElementModel($this->indexer->getDB());
        $searchModel = new SearchModel($this->indexer->getDB());

        $stmntPages = $this->indexer->getDB()->prepare("\n\t\t\tSELECT p.ID, p.language_codeFK lang, p.title, p.description, r.pattern, p.role\n\t\t\tFROM page p\n\t\t\tLEFT JOIN route r ON r.page_IDFK = p.ID\n\t\t\tWHERE r.ID IS NOT NULL\n\t\t");
        $resPages = $this->indexer->getDB()->select($stmntPages);

        foreach ($resPages as $p) {
            if ($p->role !== 'page') {
                echo " Skipped page #" . $p->ID . ": reason -> unusable role: " . $p->role . PHP_EOL;
                continue;
            }

            $searchIndexInterface = $this->indexer->getIndex($p->lang);

            // Index page
            echo " Indexing page #" . $p->ID . " into index \"" . $p->lang . "\": ";

            $cmsPage = $pageModel->getPageByID($p->ID);
            $elementTree = $elementModel->getElementTree($cmsPage);

            try {
                $searchableContent = $this->renderElementTreeRecursive($elementTree, $cmsPage->getLanguage());
            } catch (\Exception $e) {
                // A page that cannot be rendered is reported and left out of the index
                echo " Error -> " . $e->getMessage() . "\n";
                continue;
            }

            // Persist the search document; its id links the Lucene entry back to it
            $searchDoc = new Document();
            $searchDoc->setInternalID($p->ID);
            $searchDoc->setLanguage($p->lang);
            $searchDoc->setTitle($p->title);
            $searchDoc->setDescription($searchableContent);
            $searchDoc->setPath($p->pattern);
            $searchDoc->setType('core_page');
            $docID = $searchModel->saveDocument($searchDoc);

            $luceneDocument = new \ZendSearch\Lucene\Document();
            $luceneDocument->addField(Field::keyword('ID', $docID));
            $luceneDocument->addField(Field::unStored('content', $searchableContent));
            $luceneDocument->addField(Field::unStored('description', $p->description));
            $searchIndexInterface->addDocument($luceneDocument);

            echo "done";
            echo "\n";
            ++$indexedPages;
        }
    } finally {
        // Put the superglobal back exactly as it was found
        if ($hadReqUri) {
            $_SERVER['REQUEST_URI'] = $oldReqUri;
        } else {
            unset($_SERVER['REQUEST_URI']);
        }
    }

    echo " Total indexed pages: " . $indexedPages . "\n";
}
/**
 * A document changed and passed to updateFile() must be returned with the
 * new field value by a subsequent search.
 */
function testUpdate()
{
    // preparation
    $app = new Application();
    $container = $app->getContainer();

    /** @var Index $index */
    $index = $container->query('Index');

    // Looks up the document with fileId "1" using a fresh boolean query each time
    $findFirstDocument = function () use ($index) {
        $booleanQuery = new Query\Boolean();
        $booleanQuery->addSubquery(new Query\Term(new Term('1', 'fileId')));

        /** @var QueryHit[] $hits */
        $hits = $index->find($booleanQuery);

        return $hits[0]->getDocument();
    };

    // add a document owned by alice
    $original = new Document();
    $original->addField(Document\Field::Keyword('fileId', '1'));
    $original->addField(Document\Field::Text('path', '/somewhere/deep/down/the/rabbit/hole', 'UTF-8'));
    $original->addField(Document\Field::Text('users', 'alice', 'UTF-8'));
    $index->index->addDocument($original);
    $index->commit();

    // search for it
    $found = $findFirstDocument();
    $this->assertEquals('alice', $found->getFieldValue('users'));

    // change the 'users' key of the document
    $found->addField(Document\Field::Text('users', 'bob', 'UTF-8'));
    $this->assertEquals('bob', $found->getFieldValue('users'));

    // add the document back to the index
    $index->updateFile($found, '1');

    // the updated value must come back from the index
    $foundAgain = $findFirstDocument();
    $this->assertEquals('bob', $foundAgain->getFieldValue('users'));
}
/**
 * Adds file name, title and subject fields for a WAV file to $doc and adds
 * the document to $index.
 *
 * The meta data comes from the output of `python ../xmpParse.py <file name>`:
 * the first output line is used as title, every following line as a subject.
 *
 * @param object $index        index exposing addDocument()
 * @param object $doc          document exposing addField()
 * @param string $documentPath path of the file; only its base name is passed to the script
 */
function wavProcess($index, $doc, $documentPath)
{
    $fileName = basename($documentPath);
    $fileNameField = \ZendSearch\Lucene\Document\Field::text('filename', $fileName);

    // Call the Python script. The file name is escaped so that spaces or shell
    // metacharacters in it cannot break or alter the command line.
    $res = array();
    exec('python ../xmpParse.py ' . escapeshellarg($fileName), $res, $retcode);

    // Title: first output line; empty when the script produced no output
    $title = isset($res[0]) ? $res[0] : '';
    $titleField = \ZendSearch\Lucene\Document\Field::text('title', $title);

    // Subjects: all remaining output lines.
    // NOTE(review): each subject is used as both field name and value -
    // confirm this is intended rather than a fixed 'subject' field name.
    foreach (array_slice($res, 1) as $subject) {
        $subjectField = \ZendSearch\Lucene\Document\Field::text($subject, $subject);
        $doc->addField($subjectField);
    }

    $doc->addField($fileNameField);
    $doc->addField($titleField);
    $index->addDocument($doc);
}
/**
 * Rebuilds the search index from all uploads.
 *
 * For .xlsx and .docx uploads the document is loaded from the file so that
 * its contents are indexed; all other uploads get an empty document. Every
 * document additionally receives the label, owner and upload_id fields.
 *
 * @return mixed the response object with content "Index Ok"
 */
public function generateIndexAction()
{
    $searchIndexLocation = $this->getIndexLocation();
    $index = Lucene\Lucene::create($searchIndexLocation);

    $userTable = $this->getServiceLocator()->get('UserTable');
    $uploadTable = $this->getServiceLocator()->get('UploadTable');
    $allUploads = $uploadTable->fetchAll();

    foreach ($allUploads as $fileUpload) {
        $uploadOwner = $userTable->getById($fileUpload->getUserId());

        // create the lucene fields
        $fileUploadId = Document\Field::unIndexed('upload_id', $fileUpload->getId());
        $label = Document\Field::Text('label', $fileUpload->getLabel());
        $owner = Document\Field::Text('owner', $uploadOwner->getName());

        $uploadPath = $this->getFileUploadLocation();
        $fileName = $fileUpload->getFilename();
        $filePath = $uploadPath . DIRECTORY_SEPARATOR . $fileName;

        // Pick the document by (case-sensitive) file extension
        if (substr($fileName, -5) === '.xlsx') {
            // index an Excel spreadsheet
            $indexDoc = Lucene\Document\Xlsx::loadXlsxFile($filePath);
        } elseif (substr($fileName, -5) === '.docx') {
            // index a Word document
            $indexDoc = Lucene\Document\Docx::loadDocxFile($filePath);
        } else {
            $indexDoc = new Lucene\Document();
        }

        // Add the upload fields to the document chosen above. The document must
        // not be re-created here, otherwise the loaded file contents are lost.
        $indexDoc->addField($label);
        $indexDoc->addField($owner);
        $indexDoc->addField($fileUploadId);
        $index->addDocument($indexDoc);
    }

    $index->commit();

    $response = $this->getResponse();
    $response->setContent("Index Ok");

    return $response;
}
/**
 * Lists all Post models.
 *
 * Besides the regular listing, the action rebuilds the Lucene index in
 * data/posts_index from all posts and runs a multi-term query made of the
 * space-separated words of the "q" query parameter.
 *
 * @return mixed
 */
public function actionIndex()
{
    $searchModel  = new PostSearch();
    $dataProvider = $searchModel->search(Yii::$app->request->post());

    // Lucene setup: UTF-8 queries, case-insensitive UTF-8 analyzer, at most 10 results
    setlocale(LC_CTYPE, 'ru_RU.UTF-8');
    Lucene\Search\QueryParser::setDefaultEncoding('UTF-8');
    Lucene\Analysis\Analyzer\Analyzer::setDefault(new Lucene\Analysis\Analyzer\Common\Utf8\CaseInsensitive());
    Lucene\Lucene::setResultSetLimit(10);

    // create blog posts index located in data/posts_index, make sure the folder is writable
    $writeIndex = Lucene\Lucene::create('data/posts_index');

    // one document per post
    foreach (Post::find()->all() as $post) {
        $postDocument = new Lucene\Document();
        $postDocument->addField(Lucene\Document\Field::UnIndexed('entry_id', $post->id));
        $postDocument->addField(Lucene\Document\Field::Keyword('title', $post->title));
        $postDocument->addField(Lucene\Document\Field::text('contents', $post->content));
        $writeIndex->addDocument($postDocument);
    }
    $writeIndex->commit();

    // split the search query into individual words, one term per word
    $rawQuery    = urldecode(Yii::$app->getRequest()->getQueryParam('q'));
    $searchQuery = new Lucene\Search\Query\MultiTerm();
    foreach (explode(' ', $rawQuery) as $word) {
        $searchQuery->addTerm(new Lucene\Index\Term($word));
    }

    // open and query the index
    $readIndex = Lucene\Lucene::open('data/posts_index');
    $hits      = $readIndex->find($searchQuery);

    return $this->render('index', [
        'searchModel' => $searchModel,
        'dataProvider' => $dataProvider,
        'search' => $hits,
        'query' => $searchQuery,
    ]);
}
/**
 * Generates the index file.
 *
 * Every readable, non-hidden wiki page is added with its title, URL and
 * tag-stripped rendered contents; the index is optimized at the end.
 */
public static function updateIndex()
{
    // Lazily create the tagger
    if (empty(self::$igo)) {
        self::$igo = new Tagger(array('dict_dir' => LIB_DIR . 'ipadic', 'reduce_mode' => true));
    }

    Analyzer::setDefault(new Utf8());

    // Create the index
    $luceneIndex = Lucene::create(CACHE_DIR . self::INDEX_NAME);

    foreach (Listing::pages() as $pageName) {
        if (empty($pageName)) {
            continue;
        }

        $wikiPage = Factory::Wiki($pageName);

        // Skip pages that may not be read or are hidden
        $isVisible = $wikiPage->isReadable() && !$wikiPage->isHidden();
        if (!$isVisible) {
            continue;
        }

        $pageDocument = new LuceneDoc();
        $pageDocument->addField(Field::Text('title', $wikiPage->title()));
        // Store document URL to identify it in the search results
        $pageDocument->addField(Field::Text('url', $wikiPage->uri()));
        // Index document contents (rendered page without markup)
        $plainContents = strip_tags($wikiPage->render());
        $pageDocument->addField(Field::UnStored('contents', $plainContents));

        // Register the document in the index
        $luceneIndex->addDocument($pageDocument);
    }

    $luceneIndex->optimize();
}
/**
 * Creates the Lucene document for a crawled page from the configured fields.
 *
 * Added fields: the URL keyword, the ancestor paths of the URL, title /
 * description / body (each with a transliterated "_translit" twin), the
 * image, one field plus "_translit" twin per configured title tag and, when
 * configured, page id and route name.
 *
 * @param string $url  URL of the page
 * @param array  $page extracted page data
 * @return Document
 */
protected function createDocument($url, $page)
{
    // Boost configured for a field, or the given fallback
    $boostFor = function ($name, $fallback) {
        return isset($this->parameters[self::BOOST_PARAM][$name]) ? $this->parameters[self::BOOST_PARAM][$name] : $fallback;
    };
    // ASCII transliteration with apostrophes removed
    $transliterate = function ($text) {
        return str_replace("'", '', iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $text));
    };

    $document = new Document();

    if (!isset($page['status_code'])) {
        $page['status_code'] = 0; //tmp
    }

    setlocale(LC_ALL, "cs_CZ.UTF-8");

    $document->addField(Field::keyword('url', $url));

    // ancestor URLs to search by URL
    $urlParts = parse_url($url);
    $hasPath  = isset($urlParts['path']) && $urlParts['path'] && strlen($urlParts['path']) > 1;
    if ($hasPath) {
        $currentPath   = $urlParts['path'];
        $ancestorPaths = array($currentPath);
        // Cut off the last path segment until no slash beyond position 1 remains
        do {
            $currentPath     = substr($currentPath, 0, strrpos($currentPath, '/'));
            $ancestorPaths[] = $currentPath;
        } while (strrpos($currentPath, '/') > 1);
        $document->addField(Field::text(Page::URIS_KEY, implode(' ', $ancestorPaths)));
    }

    $contentKeys = array(Page::TITLE_KEY, Page::DESCRIPTION_KEY, Page::BODY_KEY, Page::IMAGE_KEY);
    foreach ($contentKeys as $fieldName) {
        $fieldValue = isset($page[$fieldName]) ? $page[$fieldName] : '';

        switch ($fieldName) {
            case Page::TITLE_KEY:
            case Page::DESCRIPTION_KEY:
            case Page::BODY_KEY:
                $field = Field::text($fieldName, $fieldValue);
                // the transliterated twin is added before the field itself
                $twin = Field::text($fieldName . '_translit', $transliterate($fieldValue));
                $twin->boost = $boostFor($fieldName, 1.25);
                $document->addField($twin);
                break;
            case Page::IMAGE_KEY:
                $field = Field::unIndexed($fieldName, $fieldValue);
                break;
            default:
                $asciiValue = $transliterate($fieldValue);
                $field = Field::text($fieldName, $fieldValue . ($asciiValue != $fieldValue ? ' ' . $asciiValue : ''));
        }

        $field->boost = $boostFor($fieldName, 1.25);
        $document->addField($field);
    }

    // title tags as configured i.e. h1, h2, ...
    foreach ($this->parameters[self::TITLE_TAGS_PARAM] as $tagName) {
        $headline = Page::hasHeadlineType($page, $tagName) ? Page::getHeadline($page, $tagName) : '';

        $headlineField = Field::text($tagName, $headline);
        $headlineField->boost = $boostFor($tagName, 1);
        $document->addField($headlineField);

        $headlineTwin = Field::text($tagName . '_translit', $transliterate($headline));
        $headlineTwin->boost = $boostFor($tagName, 1.25);
        $document->addField($headlineTwin);
    }

    // page ID if selector defined
    if ($this->parameters[self::PAGE_ID_PARAM]) {
        $pageId = isset($page[Page::PAGE_ID_KEY]) ? $page[Page::PAGE_ID_KEY] : '';
        $document->addField(Field::unIndexed(Page::PAGE_ID_KEY, $pageId));
    }

    // route name if selector defined
    if ($this->parameters[self::ROUTE_NAME_PARAM]) {
        $routeName = isset($page[Page::ROUTE_NAME_KEY]) ? $page[Page::ROUTE_NAME_KEY] : '';
        $document->addField(Field::unIndexed(Page::ROUTE_NAME_KEY, $routeName));
    }

    return $document;
}
/**
 * Object constructor
 *
 * Locates the office document part through _rels/.rels, concatenates the
 * text runs (w:t) and breaks (w:br) of all body paragraphs into the "body"
 * field and adds the file name and the package meta data as further fields.
 *
 * @param string  $fileName     path of the .docx file
 * @param boolean $storeContent true: body is added via Field::Text(); false: via Field::UnStored()
 * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException when the ZipArchive class is not available
 * @throws \ZendSearch\Lucene\Exception\RuntimeException            when the archive cannot be opened or has no _rels/.rels
 */
private function __construct($fileName, $storeContent)
{
    if (!class_exists('ZipArchive', false)) {
        throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
    }

    // Document data holders
    $documentBody = array();
    $coreProperties = array();

    // Open AbstractOpenXML package. open() returns true on success and an error
    // code otherwise; reading from an archive that failed to open must be avoided.
    $package = new \ZipArchive();
    if ($package->open($fileName) !== true) {
        throw new RuntimeException('Invalid archive or corrupted .docx file.');
    }

    // Read relations and search for officeDocument
    $relationsXml = $package->getFromName('_rels/.rels');
    if ($relationsXml === false) {
        $package->close();
        throw new RuntimeException('Invalid archive or corrupted .docx file.');
    }

    $relations = XMLSecurity::scan($relationsXml);
    foreach ($relations->Relationship as $rel) {
        if ($rel["Type"] == AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
            // Found office document! Read in contents...
            $contents = XMLSecurity::scan($package->getFromName($this->absoluteZipPath(dirname($rel['Target']) . '/' . basename($rel['Target']))));
            $contents->registerXPathNamespace('w', self::SCHEMA_WORDPROCESSINGML);

            $paragraphs = $contents->xpath('//w:body/w:p');
            foreach ($paragraphs as $paragraph) {
                $runs = $paragraph->xpath('.//w:r/*[name() = "w:t" or name() = "w:br"]');
                if ($runs === false) {
                    // Paragraph doesn't contain any text or breaks
                    continue;
                }

                foreach ($runs as $run) {
                    if ($run->getName() == 'br') {
                        // Break element
                        $documentBody[] = ' ';
                    } else {
                        $documentBody[] = (string) $run;
                    }
                }

                // Add space after each paragraph. So they are not bound together.
                $documentBody[] = ' ';
            }

            // Only the first office document relationship is processed
            break;
        }
    }

    // Read core properties
    $coreProperties = $this->extractMetaData($package);

    // Close file
    $package->close();

    // Store filename
    $this->addField(Field::Text('filename', $fileName, 'UTF-8'));

    // Store contents
    if ($storeContent) {
        $this->addField(Field::Text('body', implode('', $documentBody), 'UTF-8'));
    } else {
        $this->addField(Field::UnStored('body', implode('', $documentBody), 'UTF-8'));
    }

    // Store meta data properties
    foreach ($coreProperties as $key => $value) {
        $this->addField(Field::Text($key, $value, 'UTF-8'));
    }

    // Store title (if not present in meta data)
    if (!isset($coreProperties['title'])) {
        $this->addField(Field::Text('title', $fileName, 'UTF-8'));
    }
}
/**
 * Add index
 *
 * Does nothing unless the configuration contains the zend_search/index entry.
 * Otherwise a document with an "id" keyword field and all fields of the given
 * collection is added to the index and committed.
 *
 * @param integer $id
 * @param SearchCollection $index collection of fields to add to the document
 */
public function add($id, SearchCollection $index)
{
    if (!$this->config()->exists('zend_search', 'index')) {
        return;
    }

    $luceneDocument = new Document();
    $luceneDocument->addField(Field::keyword('id', $id));

    foreach ($index as $collectionField) {
        $luceneDocument->addField($collectionField);
    }

    $this->index()->addDocument($luceneDocument);
    $this->index()->commit();
}
/**
 * isDeleted() must report false for a document that was added but not yet
 * explicitly committed.
 *
 * @group ZF-9680
 */
public function testIsDeletedWithoutExplicitCommit()
{
    $luceneIndex = Lucene\Lucene::create(__DIR__ . '/_index/_files');

    $uncommitted = new Document();
    $uncommitted->addField(Document\Field::Keyword('_id', 'myId'));
    $uncommitted->addField(Document\Field::Keyword('bla', 'blubb'));
    $luceneIndex->addDocument($uncommitted);

    // The first added document has internal id 0
    $deleted = $luceneIndex->isDeleted(0);
    $this->assertFalse($deleted);
}
/**
 * Adds this record to the given indexer.
 *
 * @param object  $indexer  index exposing addDocument(), commit() and optimize()
 * @param boolean $commit   commit after adding the document
 * @param boolean $optimize optimize after adding (and committing) the document
 */
public function index($indexer, $commit = true, $optimize = true)
{
    // Separator characters in the cross-listed value are turned into spaces
    $crossListed = str_replace(array(';', ',', '|'), ' ', $this->getCrossListed());

    $recordFields = array(
        Field::keyword('pk', $this->getId()),
        Field::Text('course', $this->getSubjectcode()),
        Field::Text('cross-listed', $crossListed),
        Field::Text('instructor', $this->getLegalContentOwner()),
        Field::Unstored('comments', $this->getComments()),
    );

    $recordDocument = new Document();
    foreach ($recordFields as $recordField) {
        $recordDocument->addField($recordField);
    }
    $indexer->addDocument($recordDocument);

    if ($commit) {
        $indexer->commit();
    }

    if ($optimize) {
        $indexer->optimize();
    }
}
/**
 * Object constructor
 *
 * Loads the HTML into a DOMDocument (re-loading it with an injected UTF-8
 * Content-type meta tag when no encoding was recognized), then adds the
 * title, named meta tags and body text as fields and collects the links of
 * a/area elements and of head link elements.
 *
 * @param string  $data            HTML string (may be HTML fragment) or, with $isFile, a file name
 * @param boolean $isFile          true: $data is read with file_get_contents()
 * @param boolean $storeContent    true: body is added via Field::Text(); false: via Field::UnStored()
 * @param string  $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
 */
private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
{
    $this->_doc = new \DOMDocument();
    $this->_doc->substituteEntities = true;

    $htmlData = $isFile ? file_get_contents($data) : $data;

    ErrorHandler::start(E_WARNING);
    $this->_doc->loadHTML($htmlData);
    ErrorHandler::stop();

    if ($this->_doc->encoding === null) {
        // Document encoding is not recognized

        /** @todo improve HTML vs HTML fragment recognition */
        $isFullDocument = preg_match('/<html>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE);

        if ($isFullDocument) {
            // It's an HTML document
            // Add additional HEAD section right behind the <html> tag and recognize document
            $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);

            ErrorHandler::start(E_WARNING);
            $beforeHead = iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset));
            $afterHead  = iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset));
            $this->_doc->loadHTML(
                $beforeHead
                . '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>'
                . $afterHead
            );
            ErrorHandler::stop();

            // Remove additional HEAD section
            $headLookup   = new \DOMXPath($this->_doc);
            $injectedHead = $headLookup->query('/html/head')->item(0);
            $injectedHead->parentNode->removeChild($injectedHead);
        } else {
            // It's an HTML fragment
            ErrorHandler::start(E_WARNING);
            $this->_doc->loadHTML(
                '<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
                . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData)
                . '</body></html>'
            );
            ErrorHandler::stop();
        }
    }

    /** @todo Add correction of wrong HTML encoding recognition processing
     * The case is:
     * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
     * even $this->_doc->encoding demonstrates another recognized encoding
     */

    $xpath = new \DOMXPath($this->_doc);

    // title should always have only one entry, but we process all nodeset entries
    $docTitle = '';
    foreach ($xpath->query('/html/head/title') as $titleNode) {
        $docTitle .= $titleNode->nodeValue . ' ';
    }
    $this->addField(Field::Text('title', $docTitle, 'UTF-8'));

    // one text field per named meta tag
    foreach ($xpath->query('/html/head/meta[@name]') as $metaNode) {
        $this->addField(Field::Text($metaNode->getAttribute('name'), $metaNode->getAttribute('content'), 'UTF-8'));
    }

    // body should always have only one entry, but we process all nodeset entries
    $docBody = '';
    foreach ($xpath->query('/html/body') as $bodyNode) {
        $this->_retrieveNodeText($bodyNode, $docBody);
    }
    $bodyField = $storeContent
        ? Field::Text('body', $docBody, 'UTF-8')
        : Field::UnStored('body', $docBody, 'UTF-8');
    $this->addField($bodyField);

    // links of <a> elements first, then of <area> elements
    foreach (array('a', 'area') as $linkTag) {
        foreach ($this->_doc->getElementsByTagName($linkTag) as $linkNode) {
            $href = $linkNode->getAttribute('href');
            if ($href == '') {
                continue;
            }
            if (self::$_excludeNoFollowLinks && strtolower($linkNode->getAttribute('rel')) == 'nofollow') {
                continue;
            }
            $this->_links[] = $href;
        }
    }
    $this->_links = array_unique($this->_links);

    // links declared in the document head
    foreach ($xpath->query('/html/head/link') as $headLinkNode) {
        $href = $headLinkNode->getAttribute('href');
        if ($href != '') {
            $this->_headerLinks[] = $href;
        }
    }
    $this->_headerLinks = array_unique($this->_headerLinks);
}
/**
 * Add one communication to the given search index.
 *
 * Builds a document with a 'dbId' keyword field (the communication id) and
 * unStored 'title', 'shortdesc', 'body' and 'author' fields, adds it to the
 * index, then commits and optimizes the index.
 *
 * @param mixed         $index            Index object; addDocument(), commit() and optimize() are called on it.
 * @param Communication $aetCommunication Communication whose data is indexed.
 */
public function addAetCommunicationToSearchIndex($index, Communication $aetCommunication)
{
    // Reduce the HTML body to plain text: strip the markup, then decode entities.
    // ENT_QUOTES must be combined with ENT_SUBSTITUTE: on its own, ENT_SUBSTITUTE
    // leaves the quote flags at ENT_NOQUOTES, so &quot; and &#039; would remain
    // encoded and end up in the indexed text.
    $plainBody = html_entity_decode(
        strip_tags($aetCommunication->getBody()),
        ENT_QUOTES | ENT_SUBSTITUTE,
        'UTF-8'
    );

    // Fetch the author once instead of once per name part.
    $author = $aetCommunication->getUser();
    $authorName = $author->getFirstname() . " " . $author->getLastname();

    // Create a new document
    $document = new Document();
    $document->addField(Field::keyword('dbId', $aetCommunication->getId(), 'utf-8'));
    $document->addField(Field::unStored('title', $aetCommunication->getTitle(), 'utf-8'));
    $document->addField(Field::unStored('shortdesc', $aetCommunication->getShortDesc(), 'utf-8'));
    $document->addField(Field::unStored('body', $plainBody, 'utf-8'));
    $document->addField(Field::unStored('author', $authorName, 'utf-8'));

    // Add the document to the index
    $index->addDocument($document);

    // Persist the change, then optimize the index
    $index->commit();
    $index->optimize();
}
/**
 * Synchronize this job with the Lucene index.
 *
 * All hits matching "pk:<id>" are deleted first. The job is then added again
 * with its position, company, location and description, unless it is expired
 * or not activated, in which case nothing is added.
 *
 * @ORM\PostPersist
 */
public function updateLuceneIndex()
{
    $luceneIndex = self::getLuceneIndex();
    $jobId = $this->getId();

    // Drop whatever is currently indexed for this job.
    $staleHits = $luceneIndex->find('pk:' . $jobId);
    foreach ($staleHits as $staleHit) {
        $luceneIndex->delete($staleHit->id);
    }

    // Expired or non-activated jobs are not indexed.
    $isIndexable = !$this->isExpired() && $this->getIsActivated();
    if (!$isIndexable) {
        return;
    }

    $jobDocument = new Document();

    // The primary key identifies the job in the search results.
    $jobDocument->addField(Document\Field::Keyword('pk', $jobId));

    // Searchable job fields, added in this order.
    $searchableValues = array(
        'position'    => $this->getPosition(),
        'company'     => $this->getCompany(),
        'location'    => $this->getLocation(),
        'description' => $this->getDescription(),
    );
    foreach ($searchableValues as $fieldName => $fieldText) {
        $jobDocument->addField(Document\Field::UnStored($fieldName, $fieldText, 'utf-8'));
    }

    // Add the job to the index and persist it.
    $luceneIndex->addDocument($jobDocument);
    $luceneIndex->commit();
}
/**
 * Object constructor
 *
 * Opens the .xlsx package, collects the shared strings and the cell values of
 * every worksheet into the document body, reads the core properties and then
 * registers the 'filename', 'body', meta data and (fallback) 'title' fields.
 *
 * @param string  $fileName     Path of the .xlsx file to read.
 * @param boolean $storeContent When true the body is added as a Text field, otherwise as an UnStored field.
 * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException When the ZipArchive class is not available.
 * @throws \ZendSearch\Lucene\Exception\RuntimeException When the archive cannot be opened or has no '_rels/.rels' part.
 */
private function __construct($fileName, $storeContent)
{
    if (!class_exists('ZipArchive', false)) {
        throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
    }

    // Document data holders
    $sharedStrings  = array();
    $worksheets     = array();
    $documentBody   = array();
    $coreProperties = array();

    // Open AbstractOpenXML package; open() returns true on success and an
    // error code otherwise, so anything but true means the file is unusable.
    $package = new \ZipArchive();
    if ($package->open($fileName) !== true) {
        throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
    }

    // Read relations and search for officeDocument
    $relationsXml = $package->getFromName('_rels/.rels');
    if ($relationsXml === false) {
        throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
    }
    $relations = XmlSecurity::scan($relationsXml);
    foreach ($relations->Relationship as $rel) {
        if ($rel["Type"] == AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
            // Found office document! Read relations for workbook...
            $workbookDir = dirname($rel["Target"]);
            $workbookRelations = XmlSecurity::scan($package->getFromName($this->absoluteZipPath($workbookDir . "/_rels/" . basename($rel["Target"]) . ".rels")));
            $workbookRelations->registerXPathNamespace("rel", AbstractOpenXML::SCHEMA_RELATIONSHIP);

            // Read shared strings. The relationship is optional, so only
            // follow it when the workbook actually declares one and the
            // referenced part can be read.
            $sharedStringsPath = $workbookRelations->xpath("rel:Relationship[@Type='" . self::SCHEMA_SHAREDSTRINGS . "']");
            if (!empty($sharedStringsPath)) {
                $sharedStringsPath = (string) $sharedStringsPath[0]['Target'];
                $sharedStringsXml  = $package->getFromName($this->absoluteZipPath($workbookDir . "/" . $sharedStringsPath));
                if ($sharedStringsXml !== false) {
                    $xmlStrings = XmlSecurity::scan($sharedStringsXml);
                    if (isset($xmlStrings) && isset($xmlStrings->si)) {
                        foreach ($xmlStrings->si as $val) {
                            if (isset($val->t)) {
                                $sharedStrings[] = (string) $val->t;
                            } elseif (isset($val->r)) {
                                $sharedStrings[] = $this->_parseRichText($val);
                            }
                        }
                    }
                }
            }

            // Loop relations for workbook and extract worksheets...
            foreach ($workbookRelations->Relationship as $workbookRelation) {
                if ($workbookRelation["Type"] == self::SCHEMA_WORKSHEETRELATION) {
                    $worksheets[str_replace('rId', '', (string) $workbookRelation["Id"])] = XmlSecurity::scan($package->getFromName($this->absoluteZipPath($workbookDir . "/" . dirname($workbookRelation["Target"]) . "/" . basename($workbookRelation["Target"]))));
                }
            }

            break;
        }
    }

    // Sort worksheets by their relationship number
    ksort($worksheets);

    // Extract contents from worksheets
    foreach ($worksheets as $sheetKey => $worksheet) {
        foreach ($worksheet->sheetData->row as $row) {
            foreach ($row->c as $c) {
                // Determine data type
                $dataType = (string) $c["t"];
                switch ($dataType) {
                    case "s":
                        // Value is a shared string; an index with no matching
                        // entry contributes an empty string.
                        if ((string) $c->v != '') {
                            $stringIndex = intval($c->v);
                            $value = isset($sharedStrings[$stringIndex]) ? $sharedStrings[$stringIndex] : '';
                        } else {
                            $value = '';
                        }
                        break;

                    case "b":
                        // Value is boolean
                        $value = (string) $c->v;
                        if ($value == '0') {
                            $value = false;
                        } elseif ($value == '1') {
                            $value = true;
                        } else {
                            $value = (bool) $c->v;
                        }
                        break;

                    case "inlineStr":
                        // Value is rich text inline
                        $value = $this->_parseRichText($c->is);
                        break;

                    case "e":
                        // Value is an error message
                        if ((string) $c->v != '') {
                            $value = (string) $c->v;
                        } else {
                            $value = '';
                        }
                        break;

                    default:
                        // Value is a string
                        $value = (string) $c->v;

                        // Numeric strings become int when that loses nothing, otherwise double
                        if (is_numeric($value)) {
                            if ($value == (int) $value) {
                                $value = (int) $value;
                            } elseif ($value == (double) $value) {
                                $value = (double) $value;
                            }
                        }
                }

                $documentBody[] = $value;
            }
        }
    }

    // Read core properties
    $coreProperties = $this->extractMetaData($package);

    // Close file
    $package->close();

    // Store filename
    $this->addField(Field::Text('filename', $fileName, 'UTF-8'));

    // Store contents
    if ($storeContent) {
        $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
    } else {
        $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
    }

    // Store meta data properties
    foreach ($coreProperties as $key => $value) {
        $this->addField(Field::Text($key, $value, 'UTF-8'));
    }

    // Store title (if not present in meta data)
    if (!isset($coreProperties['title'])) {
        $this->addField(Field::Text('title', $fileName, 'UTF-8'));
    }
}
/**
 * Object constructor
 *
 * Opens the .pptx package, gathers the text runs ('a:t' elements) of every
 * slide and of its notes into the document body, reads the core properties
 * and then registers the 'filename', 'body', meta data and (fallback) 'title'
 * fields. Slide, notes and relationship parts that are missing or cannot be
 * parsed are skipped.
 *
 * NOTE(review): the XML parts are parsed with simplexml_load_string(); if the
 * project has a hardened XML scanner for untrusted files, consider routing
 * these reads through it - confirm with the maintainers.
 *
 * @param string  $fileName     Path of the .pptx file to read.
 * @param boolean $storeContent When true the body is added as a Text field, otherwise as an UnStored field.
 * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException When the ZipArchive class is not available.
 * @throws \ZendSearch\Lucene\Exception\RuntimeException When the archive cannot be opened or has no '_rels/.rels' part.
 */
private function __construct($fileName, $storeContent)
{
    if (!class_exists('ZipArchive', false)) {
        throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
    }

    // Document data holders
    $slides         = array();
    $slideNotes     = array();
    $documentBody   = array();
    $coreProperties = array();

    // Open AbstractOpenXML package; open() returns true on success and an
    // error code otherwise, so anything but true means the file is unusable.
    $package = new \ZipArchive();
    if ($package->open($fileName) !== true) {
        throw new RuntimeException('Invalid archive or corrupted .pptx file.');
    }

    // Loads one XML part of the package; yields false when the part is
    // missing or cannot be parsed.
    $loadPart = function ($partPath) use ($package) {
        $partXml = $package->getFromName($partPath);

        return ($partXml === false) ? false : simplexml_load_string($partXml);
    };

    // Read relations and search for officeDocument
    $relationsXml = $package->getFromName('_rels/.rels');
    if ($relationsXml === false) {
        throw new RuntimeException('Invalid archive or corrupted .pptx file.');
    }
    $relations     = simplexml_load_string($relationsXml);
    $relationships = ($relations === false) ? array() : $relations->Relationship;

    foreach ($relationships as $rel) {
        if ($rel["Type"] != AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
            continue;
        }

        // Found office document! Search for slides...
        $presentationDir = dirname($rel["Target"]);
        $slideRelations  = $loadPart($this->absoluteZipPath($presentationDir . "/_rels/" . basename($rel["Target"]) . ".rels"));
        $slideRels       = ($slideRelations === false) ? array() : $slideRelations->Relationship;

        foreach ($slideRels as $slideRel) {
            if ($slideRel["Type"] != self::SCHEMA_SLIDERELATION) {
                continue;
            }

            // Found slide!
            $slideId   = str_replace('rId', '', (string) $slideRel["Id"]);
            $slideDir  = $presentationDir . "/" . dirname($slideRel["Target"]);
            $slideFile = basename($slideRel["Target"]);

            // Never store a failed load: a false entry would later be
            // dereferenced as a SimpleXMLElement.
            $slide = $loadPart($this->absoluteZipPath($slideDir . "/" . $slideFile));
            if ($slide === false) {
                continue;
            }
            $slides[$slideId] = $slide;

            // Search for slide notes
            $slideNotesRelations = $loadPart($this->absoluteZipPath($slideDir . "/_rels/" . $slideFile . ".rels"));
            if ($slideNotesRelations === false) {
                continue;
            }
            foreach ($slideNotesRelations->Relationship as $slideNoteRel) {
                if ($slideNoteRel["Type"] == self::SCHEMA_SLIDENOTESRELATION) {
                    // Found slide notes! They are keyed by the owning slide's id.
                    $slideNote = $loadPart($this->absoluteZipPath($slideDir . "/" . dirname($slideNoteRel["Target"]) . "/" . basename($slideNoteRel["Target"])));
                    if ($slideNote !== false) {
                        $slideNotes[$slideId] = $slideNote;
                    }

                    break;
                }
            }
        }

        break;
    }

    // Sort slides
    ksort($slides);
    ksort($slideNotes);

    // Extract contents from slides
    foreach ($slides as $slideKey => $slide) {
        // Register namespaces
        $slide->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML);
        $slide->registerXPathNamespace("a", self::SCHEMA_DRAWINGML);

        // Fetch all text; a failed query contributes nothing
        $textElements = $slide->xpath('//a:t') ?: array();
        foreach ($textElements as $textElement) {
            $documentBody[] = (string) $textElement;
        }

        // Extract contents from slide notes
        if (isset($slideNotes[$slideKey])) {
            // Fetch slide note
            $slideNote = $slideNotes[$slideKey];

            // Register namespaces
            $slideNote->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML);
            $slideNote->registerXPathNamespace("a", self::SCHEMA_DRAWINGML);

            // Fetch all text; a failed query contributes nothing
            $textElements = $slideNote->xpath('//a:t') ?: array();
            foreach ($textElements as $textElement) {
                $documentBody[] = (string) $textElement;
            }
        }
    }

    // Read core properties
    $coreProperties = $this->extractMetaData($package);

    // Close file
    $package->close();

    // Store filename
    $this->addField(Field::Text('filename', $fileName, 'UTF-8'));

    // Store contents
    if ($storeContent) {
        $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
    } else {
        $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
    }

    // Store meta data properties
    foreach ($coreProperties as $key => $value) {
        $this->addField(Field::Text($key, $value, 'UTF-8'));
    }

    // Store title (if not present in meta data)
    if (!isset($coreProperties['title'])) {
        $this->addField(Field::Text('title', $fileName, 'UTF-8'));
    }
}
/**
 * Build the user search index.
 *
 * Creates an index at the configured index location, adds one document per
 * row returned by the user table ('userId' and 'activated' as keyword fields;
 * 'firstName', 'lastName', 'email' and 'role' as text fields) and commits the
 * index once every user has been added.
 */
public function generateSearchAction()
{
    $index = Lucene\Lucene::create($this->getIndexLocation());

    // Properties indexed as text fields, in the order they are added.
    $textProperties = array('firstName', 'lastName', 'email', 'role');

    foreach ($this->getUserTable()->fetchAll(false) as $user) {
        $userDocument = new Lucene\Document();

        // The id comes first so each hit can be mapped back to its user.
        $userDocument->addField(Document\Field::keyword('userId', $user->userId));

        foreach ($textProperties as $property) {
            $userDocument->addField(Document\Field::text($property, $user->{$property}));
        }

        $userDocument->addField(Document\Field::keyword('activated', $user->activated));

        $index->addDocument($userDocument);
    }

    // A single commit after all users have been added.
    $index->commit();
}
/**
 * Re-index a model: delete its current document(s), then add a fresh one.
 *
 * The new document holds the model's primary key and class UID as keyword
 * fields, followed by one unStored field per configured field and per
 * optional attribute, each given the boost found in its options.
 *
 * @param Model $model Model to (re)index.
 */
public function update(Model $model)
{
    // Start from a clean slate for this model.
    $this->delete($model);

    $document = new Document();

    // Primary key of the model.
    list($keyName, $keyValue) = $this->config->primaryKeyPair($model);
    $document->addField(Field::keyword($keyName, $keyValue));

    // Class UID, used to identify the model's class.
    list($uidName, $uidValue) = $this->config->classUidPair($model);
    $document->addField(Field::Keyword($uidName, $uidValue));

    // Base fields: values are read from the model itself, indexed but not stored.
    foreach ($this->config->fields($model) as $fieldName => $options) {
        $attribute = trim($fieldName);
        $plainText = strip_tags(trim($model->{$attribute}));

        $field = Field::unStored($attribute, $plainText);
        $field->boost = array_get($options, 'boost');
        $document->addField($field);
    }

    // Optional attributes: values come from the options, indexed but not stored.
    foreach ($this->config->optionalAttributes($model) as $fieldName => $options) {
        $plainText = strip_tags(trim(array_get($options, "value")));

        $field = Field::unStored(trim($fieldName), $plainText);
        $field->boost = array_get($options, "boost");
        $document->addField($field);
    }

    // Document-level boost for the model.
    $document->boost = $this->config->boost($model);

    // Hand the finished document to the index.
    $this->index()->addDocument($document);
}
/**
 * Index the data held in the JSON files.
 *
 * For every "*.json" file in "data/json" (next to this source file) an index
 * named after the file, without its extension, is created under "data/index".
 * Each entry of the decoded JSON becomes one document with a 'url' text field
 * and an unstored 'contents' field. Files that cannot be read or decoded are
 * skipped before their index is created.
 *
 * NOTE(review): the 'url' field is filled from $entry->title - confirm this is
 * intended.
 */
public function index()
{
    $dir = realpath(dirname(__FILE__)) . DIRECTORY_SEPARATOR . "data" . DIRECTORY_SEPARATOR;
    $jsonDir = $dir . "json";
    $indexDir = $dir . "index";

    // Read the JSON files; nothing to do when the directory cannot be listed
    $files = scandir($jsonDir);
    if ($files === false) {
        return;
    }

    foreach ($files as $file) {
        $path = $jsonDir . DIRECTORY_SEPARATOR . $file;

        // Only regular "*.json" files are indexed ('.' and '..' are not files)
        if (!is_file($path) || strtolower(pathinfo($file, PATHINFO_EXTENSION)) !== 'json') {
            continue;
        }

        // Decode before creating the index, so an unreadable or invalid
        // file is skipped without an index being created for it
        $contents = file_get_contents($path);
        $json = ($contents === false) ? null : json_decode($contents);
        if (!is_array($json) && !is_object($json)) {
            continue;
        }

        // File name without the ".json" extension
        $indexName = pathinfo($file, PATHINFO_FILENAME);

        // Create the index
        $index = Lucene\Lucene::create($indexDir . DIRECTORY_SEPARATOR . $indexName);

        // Create one document per entry and define the fields to index
        foreach ($json as $entry) {
            $doc = new Lucene\Document();
            $doc->addField(Lucene\Document\Field::Text('url', $entry->title));
            $doc->addField(Lucene\Document\Field::UnStored('contents', $entry->text));
            $index->addDocument($doc);
        }

        // Commit explicitly once the file's documents have been added
        $index->commit();
    }
}