コード例 #1
  * Object constructor
  * @param string  $data
  * @param boolean $storeContent
  * @throws NotIndexedException
 // Builds this document from raw PDF content: the extracted text becomes the
 // 'body' field and every entry returned by getDetails() becomes its own field.
 // Anything thrown while parsing or adding fields is rethrown as
 // NotIndexedException, with the caught exception passed as the third argument.
 private function __construct($data, $storeContent)
     //TODO check PDF >1.5 metadata extraction
     //do the content extraction
     $parser = new Parser();
     try {
         $pdf = $parser->parseContent($data);
         $body = $pdf->getText();
         // Store contents: Field::Text when $storeContent is truthy, Field::UnStored otherwise
         if ($storeContent) {
             $this->addField(Document\Field::Text('body', $body, 'UTF-8'));
         } else {
             $this->addField(Document\Field::UnStored('body', $body, 'UTF-8'));
         $details = $pdf->getDetails();
         // Store meta data properties
         foreach ($details as $key => $value) {
             // Keys are lower-cased, and 'author' is stored under the name 'creator'
             $key = strtolower($key);
             if ($key === 'author') {
                 $key = 'creator';
             $this->addField(Document\Field::Text($key, $value, 'UTF-8'));
     } catch (\Exception $ex) {
         throw new NotIndexedException(null, null, $ex);
コード例 #2
 // Builds a Lucene document for $obj from three sources: the meta info array
 // (keyword fields), the object's search attributes (text fields) and, for
 // ContentActiveRecord instances, the messages of its comments.
 // NOTE(review): $index is fetched here, but the statement that stores $doc
 // in it is not visible in this excerpt.
 public function add(Searchable $obj)
     // Get the attributes the object exposes for searching
     $attributes = $obj->getSearchAttributes();
     $index = $this->getIndex();
     $doc = new \ZendSearch\Lucene\Document();
     // Add Meta Data fields
     foreach ($this->getMetaInfoArray($obj) as $fieldName => $fieldValue) {
         $doc->addField(\ZendSearch\Lucene\Document\Field::keyword($fieldName, $fieldValue));
     // Add provided search infos
     foreach ($attributes as $key => $val) {
         $doc->addField(\ZendSearch\Lucene\Document\Field::Text($key, $val, 'UTF-8'));
     // Add comments - if record is content
     if ($obj instanceof ContentActiveRecord) {
         // All comment messages are concatenated, space-separated, into one 'comments' field
         $comments = "";
         foreach (Comment::findAll(['object_id' => $obj->getPrimaryKey(), 'object_model' => $obj->className()]) as $comment) {
             $comments .= " " . $comment->message;
         $doc->addField(\ZendSearch\Lucene\Document\Field::Text('comments', $comments, 'UTF-8'));
     // Print a progress dot when running as a console request
     if (\Yii::$app->request->isConsoleRequest) {
         print ".";
コード例 #3
 // Checks that Document::addField() returns a Document, so calls can be chained.
 public function testAddFieldMethodChaining()
     $document = new Document();
     $this->assertTrue($document->addField(Document\Field::Text('title', 'Title')) instanceof Document);
     // Fresh document: three addField() calls chained in a single expression
     $document = new Document();
     $document->addField(Document\Field::Text('title', 'Title'))->addField(Document\Field::Text('annotation', 'Annotation'))->addField(Document\Field::Text('body', 'Document body, document body, document body...'));
コード例 #4
  * @dataProvider searchResultDataProvider
 // Checks that a LuceneResult built from a query hit exposes the expected id,
 // type, path, name, mime type, size, score and modification time.
 // NOTE(review): $doc is populated below, but the statement that hands it to
 // $index or $hit is not visible in this excerpt.
 function testSearchLuceneResultContent($fileId, $name, $path, $size, $score, $mimeType, $modified, $container)
     require_once __DIR__ . '/util/dummyindex.php';
     $index = new DummyIndex();
     $doc = new Document();
     $doc->addField(Document\Field::Keyword('fileId', $fileId));
     // The stored path carries a '/test/files' prefix; the assertion on
     // $searchResult->path further down expects the bare $path.
     $doc->addField(Document\Field::Text('path', '/test/files' . $path, 'UTF-8'));
     $doc->addField(Document\Field::unIndexed('mtime', $modified));
     $doc->addField(Document\Field::unIndexed('size', $size));
     $doc->addField(Document\Field::unIndexed('mimetype', $mimeType));
     // Hand-built query hit pointing at document 0 of the dummy index
     $hit = new QueryHit($index);
     $hit->score = $score;
     $hit->id = 0;
     $hit->document_id = 0;
     $searchResult = new \OCA\Search_Lucene\Search\LuceneResult($hit);
     $this->assertInstanceOf('OCA\\Search_Lucene\\Search\\LuceneResult', $searchResult);
     $this->assertEquals($fileId, $searchResult->id);
     $this->assertEquals('lucene', $searchResult->type);
     $this->assertEquals($path, $searchResult->path);
     $this->assertEquals($name, $searchResult->name);
     $this->assertEquals($mimeType, $searchResult->mime_type);
     $this->assertEquals($size, $searchResult->size);
     $this->assertEquals($score, $searchResult->score);
     $this->assertEquals($modified, $searchResult->modified);
コード例 #5
 // Sets up mock expectations for an index update: the document handed to
 // addDocument() and the MultiTerm handed to find() are each compared with an
 // expected instance built inside the matcher closure.
 // NOTE(review): the closing of the first closure and the call that triggers
 // the update are not visible in this excerpt.
 public function testUpdate()
     $this->connection->shouldReceive('getIndex')->andReturn($luceneIndex = m::mock());
     $luceneIndex->shouldReceive('addDocument')->with(m::on(function ($arg) {
         // Expected document: two keyword fields plus the un-stored fields below
         $doc = new Document();
         $doc->addField(Field::keyword('primary_key', 1));
         $doc->addField(Field::Keyword('class_uid', '12345'));
         // NOTE(review): as shown, $field is built twice but never added to $doc
         $field = Field::unStored('name', 'test name');
         $field->boost = 1;
         $field = Field::unStored('optional_attribute1', 'optional value');
         $field->boost = 1;
         $this->assertEquals($doc, $arg);
         return true;
     $luceneIndex->shouldReceive('find')->with(m::on(function ($arg) {
         // Expected query: both terms are added with the second argument set to true
         $term = new MultiTerm();
         $term->addTerm(new Term(1, 'primary_key'), true);
         $term->addTerm(new Term('12345', 'class_uid'), true);
         $this->assertEquals($term, $arg);
         return true;
     }))->andReturnUsing(function () {
         // find() answers with a single mocked hit whose id is 10
         $hitMock = m::mock();
         $hitMock->id = 10;
         return [$hitMock];
     $index = $this->createIndex();
コード例 #6
  * {@inheritdoc}
 // Builds a Lucene document for a product: its id as an un-indexed
 // 'identifier' plus name, short description and description as text fields.
 // NOTE(review): the $indexName argument is not used in the visible code; the
 // index is always fetched with ProductIndexerInterface::DEFAULT_INDEX_NAME.
 public function addProduct(ProductInterface $product, $indexName = ProductIndexerInterface::DEFAULT_INDEX_NAME)
     $index = $this->searchIndexManager->getIndex(ProductIndexerInterface::DEFAULT_INDEX_NAME);
     $document = new Document();
     $document->addField(Field::unIndexed('identifier', $product->getId()));
     // NOTE(review): the name is read via translate('en'), the two descriptions
     // via translate() without an argument — confirm this difference is intended.
     $document->addField(Field::text('name', $product->translate('en')->getName()));
     $document->addField(Field::text('shortDescription', $product->translate()->getShortDescription()));
     $document->addField(Field::text('description', $product->translate()->getDescription()));
コード例 #7
 // Creates a Lucene document for $entity: an un-indexed 'identifier' field
 // holding the entity id, plus one text field per SearchField found in the
 // fields collection of the entity's context.
 public function createDocument(EntityInterface $entity) : Document
     $document = new Document();
     $context = $this->createContext($entity);
     $fieldsCollection = $context->getFieldsCollection();
     $document->addField(Field::unIndexed('identifier', $entity->getId()));
     // map() is used only for its side effect of adding fields to $document
     $fieldsCollection->map(function (SearchField $field) use($document) {
         $document->addField(Field::text($field->getName(), $field->getValue()));
     return $document;
コード例 #8
  * Object constructor
  * @param string $fileName
  * @param boolean $storeContent
  * @throws ExtensionNotLoadedException
  * @throws RuntimeException
 // Builds this document from an .odt file: headlines and paragraphs are read
 // from content.xml, meta data comes from extractMetaData(), and the file name
 // is used as the title when the meta data has none.
 // Throws ExtensionNotLoadedException when ZipArchive is unavailable and
 // RuntimeException when content.xml cannot be read from the package.
 private function __construct($fileName, $storeContent)
     if (!class_exists('ZipArchive', false)) {
         throw new ExtensionNotLoadedException('Open Document Text processing functionality requires Zip extension to be loaded');
     // Document data holders
     $documentHeadlines = array();
     $documentParagraphs = array();
     // Create the zip package object
     // NOTE(review): the call that opens $fileName is not visible in this excerpt
     $package = new \ZipArchive();
     // Read the document content part (content.xml)
     $content = $package->getFromName('content.xml');
     if ($content === false) {
         throw new RuntimeException('Invalid archive or corrupted .odt file.');
     // Prevent php from loading remote resources
     // NOTE(review): libxml_disable_entity_loader() is deprecated as of PHP 8.0
     $loadEntities = libxml_disable_entity_loader(true);
     $sxe = simplexml_load_string($content, 'SimpleXMLElement', LIBXML_NOBLANKS | LIBXML_COMPACT);
     // Restore entity loader state
     // NOTE(review): the restoring call using $loadEntities is not visible in this excerpt
     // Collect the tag-stripped text of every text:h (headline) element
     foreach ($sxe->xpath('//text:h') as $headline) {
         $h = strip_tags($headline->asXML());
         $documentHeadlines[] = $h;
     // Collect the tag-stripped text of every text:p (paragraph) element
     foreach ($sxe->xpath('//text:p') as $paragraph) {
         $p = strip_tags($paragraph->asXML());
         $documentParagraphs[] = $p;
     // Read core properties
     $coreProperties = $this->extractMetaData($package);
     // Close file
     // Store contents: headlines are joined with a space, paragraphs with no separator
     if ($storeContent) {
         $this->addField(Field::Text('headlines', implode(' ', $documentHeadlines), 'UTF-8'));
         $this->addField(Field::Text('body', implode('', $documentParagraphs), 'UTF-8'));
     } else {
         $this->addField(Field::UnStored('headlines', implode(' ', $documentHeadlines), 'UTF-8'));
         $this->addField(Field::UnStored('body', implode('', $documentParagraphs), 'UTF-8'));
     // Store meta data properties
     foreach ($coreProperties as $key => $value) {
         $this->addField(Field::Text($key, $value, 'UTF-8'));
     // Store title (if not present in meta data)
     if (!isset($coreProperties['title'])) {
         $this->addField(Field::Text('title', $fileName, 'UTF-8'));
コード例 #9
  * @param $data
  * @param SearchIndexInterface $index
  * @return IndexInterface
 // (Re)indexes a group: any existing entry is removed via unindex(), then a
 // document describing the group is built. Returns $this.
 // NOTE(review): the statement that adds $indexDoc to $index is not visible
 // in this excerpt.
 public function index($data, SearchIndexInterface $index)
     $this->unindex($data, $index);
     $indexDoc = new Document();
     $indexDoc->addField(Field::Keyword('group_id', $data->id));
     $indexDoc->addField(Field::UnIndexed('type', "group"));
     $indexDoc->addField(Field::UnIndexed('identifier', $data->url));
     // Both date fields hold the time of indexing (date() without a timestamp)
     $indexDoc->addField(Field::UnIndexed('date_time', date('c')));
     $indexDoc->addField(Field::UnIndexed('date', date('j. M. Y')));
     $indexDoc->addField(Field::Text('title', $data->name_short, 'utf-8'));
     $indexDoc->addField(Field::Text('body', $data->description, 'utf-8'));
     return $this;
コード例 #10
  * @param $data
  * @param SearchIndexInterface $index
  * @return IndexInterface
 // (Re)indexes a news item: any existing entry is removed via unindex(), then
 // a document describing the item is built. Returns $this.
 // NOTE(review): the statement that adds $indexDoc to $index is not visible
 // in this excerpt.
 public function index($data, SearchIndexInterface $index)
     $this->unindex($data, $index);
     $indexDoc = new Document();
     $indexDoc->addField(Field::Keyword('news_id', $data->id));
     $indexDoc->addField(Field::UnIndexed('type', "news"));
     $indexDoc->addField(Field::UnIndexed('identifier', $data->id));
     // Both date fields are formatted from the item's created_date
     $indexDoc->addField(Field::UnIndexed('date_time', $data->created_date->format('c')));
     $indexDoc->addField(Field::UnIndexed('date', $data->created_date->format('j. M. Y')));
     $indexDoc->addField(Field::Text('title', $data->title, 'utf-8'));
     $indexDoc->addField(Field::Text('body', $data->body, 'utf-8'));
     return $this;
コード例 #11
  * Create or update an indexed document
  * @param object $object
 // Creates a Lucene document for $object: its id as a keyword, its class name
 // as an un-indexed field, and one field per property that carries a
 // DocumentField annotation, using the field kind named by the annotation.
 public function index($object)
     // create property accessor
     $accessor = PropertyAccess::createPropertyAccessor();
     // delete existing documents with same id
     // NOTE(review): the loop body performing the deletion is not visible in this excerpt
     foreach ($this->index->find('id:' . $accessor->getValue($object, 'id')) as $hit) {
     // create new Lucene document
     $doc = new Document();
     // add primary key to identify it in the search results
     $doc->addField(Field::keyword('id', $accessor->getValue($object, 'id')));
     // add entity class reference to identify it in the search results
     $doc->addField(Field::unIndexed('entityClass', get_class($object)));
     // analyze each property's annotations to see which ones must be added to the document
     $reflClass = new ReflectionClass($object);
     foreach ($reflClass->getProperties() as $property) {
         $reflProperty = new \ReflectionProperty($object, $property->name);
         $annotation = $this->reader->getPropertyAnnotation($reflProperty, '\\Keratine\\Lucene\\Mapping\\Annotation\\DocumentField');
         if ($annotation) {
             // Read the property value and pass it through ensureString() before indexing
             $value = $accessor->getValue($object, $property->name);
             $value = $this->ensureString($value);
             // use the appropriate indexing strategy for the field
             // NOTE(review): no break statements are visible in this excerpt; as
             // written, a matching case would fall through into the ones below it.
             switch ($annotation->type) {
                 case 'keyword':
                     $doc->addField(Field::keyword($property->name, $value, 'UTF-8'));
                 case 'unIndexed':
                     $doc->addField(Field::unIndexed($property->name, $value, 'UTF-8'));
                 case 'binary':
                     $doc->addField(Field::binary($property->name, $value));
                 case 'text':
                     $doc->addField(Field::text($property->name, $value, 'UTF-8'));
                 case 'unStored':
                     $doc->addField(Field::unStored($property->name, $value, 'UTF-8'));
     // add the document to the index and commit it
コード例 #12
 // Walks every page that has a route, renders its element tree to searchable
 // text and builds a Lucene document (ID, content, description) for it.
 // Progress is echoed as it goes.
 // NOTE(review): $indexedPages is never incremented in the visible code, and
 // the statement adding $luceneDocument to $searchIndexInterface is not visible.
 public function index()
     // REQUEST_URI is blanked for the duration of the run and restored at the end
     $oldReqUri = $_SERVER['REQUEST_URI'];
     $_SERVER['REQUEST_URI'] = '';
     $pageModel = new PageModel($this->indexer->getDB());
     $elementModel = new ElementModel($this->indexer->getDB());
     $searchModel = new SearchModel($this->indexer->getDB());
     // Pages joined with their route; rows without a route are filtered out
     $stmntPages = $this->indexer->getDB()->prepare("\n\t\t\tSELECT p.ID, p.language_codeFK lang, p.title, p.description, r.pattern, p.role\n\t\t\tFROM page p\n\t\t\tLEFT JOIN route r ON r.page_IDFK = p.ID\n\t\t\tWHERE r.ID IS NOT NULL\n\t\t");
     $resPages = $this->indexer->getDB()->select($stmntPages);
     $indexedPages = 0;
     foreach ($resPages as $p) {
         // Pages whose role is not 'page' are reported as skipped
         if ($p->role !== 'page') {
             echo "  Skipped page #" . $p->ID . ": reason -> unusable role: " . $p->role . PHP_EOL;
         // The index is selected by the page's language code
         $searchIndexInterface = $this->indexer->getIndex($p->lang);
         // Index page
         echo "  Indexing page #" . $p->ID . " into index \"" . $p->lang . "\": ";
         $cmsPage = $pageModel->getPageByID($p->ID);
         $elementTree = $elementModel->getElementTree($cmsPage);
         try {
             $searchableContent = $this->renderElementTreeRecursive($elementTree, $cmsPage->getLanguage());
         } catch (\Exception $e) {
             echo " Error -> " . $e->getMessage() . "\n";
         // saveDocument() returns the ID that is stored as the Lucene 'ID' keyword
         $searchDoc = new Document();
         $docID = $searchModel->saveDocument($searchDoc);
         $luceneDocument = new \ZendSearch\Lucene\Document();
         $luceneDocument->addField(Field::keyword('ID', $docID));
         $luceneDocument->addField(Field::unStored('content', $searchableContent));
         $luceneDocument->addField(Field::unStored('description', $p->description));
         echo "done";
         echo "\n";
     $_SERVER['REQUEST_URI'] = $oldReqUri;
     echo "  Total indexed pages: " . $indexedPages . "\n";
コード例 #13
 // Round-trip test for updating an indexed file: a document with users='alice'
 // is looked up by fileId, changed to users='bob', written back through
 // updateFile(), and looked up again to confirm the new value.
 // NOTE(review): several statements (adding $doc to the index, attaching the
 // term queries to the Boolean queries, deleting the old document) are not
 // visible in this excerpt.
 function testUpdate()
     // preparation
     $app = new Application();
     $container = $app->getContainer();
     // get an index
     /** @var Index $index */
     $index = $container->query('Index');
     // add a document
     $doc = new Document();
     $doc->addField(Document\Field::Keyword('fileId', '1'));
     $doc->addField(Document\Field::Text('path', '/somewhere/deep/down/the/rabbit/hole', 'UTF-8'));
     $doc->addField(Document\Field::Text('users', 'alice', 'UTF-8'));
     // search for it
     $idTerm = new Term('1', 'fileId');
     $idQuery = new Query\Term($idTerm);
     $query = new Query\Boolean();
     /** @var QueryHit $hit */
     $hits = $index->find($query);
     // get the document from the query hit
     $foundDoc = $hits[0]->getDocument();
     $this->assertEquals('alice', $foundDoc->getFieldValue('users'));
     // delete the document from the index
     // change the 'users' key of the document
     $foundDoc->addField(Document\Field::Text('users', 'bob', 'UTF-8'));
     $this->assertEquals('bob', $foundDoc->getFieldValue('users'));
     // add the document back to the index
     $index->updateFile($foundDoc, '1');
     // search again with a fresh term and query
     $idTerm2 = new Term('1', 'fileId');
     $idQuery2 = new Query\Term($idTerm2);
     $query2 = new Query\Boolean();
     /** @var QueryHit $hit */
     $hits2 = $index->find($query2);
     // get the document from the query hit
     $foundDoc2 = $hits2[0]->getDocument();
     $this->assertEquals('bob', $foundDoc2->getFieldValue('users'));
コード例 #14
// Builds Lucene text fields for a file: the base name of $documentPath, a
// title taken from the first output line of the xmpParse.py script, and one
// field per remaining output line.
// NOTE(review): the statements adding these fields to $doc / $index are not
// visible in this excerpt.
function wavProcess($index, $doc, $documentPath)
    $fileName = basename($documentPath);
    $fileNameField = \ZendSearch\Lucene\Document\Field::text('filename', $fileName);
    // NOTE(review): $fileName is concatenated into the shell command without
    // escaping (consider escapeshellarg()), and $retcode is never checked.
    exec('python ../xmpParse.py ' . $fileName, $res, $retcode);
    // Title
    // NOTE(review): assumes the script printed at least one line; $res[0] is undefined otherwise
    $title = $res[0];
    $titleField = \ZendSearch\Lucene\Document\Field::text('title', $title);
    // echo "Title : " . $title[0] . PHP_EOL;
    // Subject
    // Every output line after the first is used as both field name and field value
    for ($i = 1; $i < count($res); $i++) {
        $subject = $res[$i];
        $subjectField = \ZendSearch\Lucene\Document\Field::text($subject, $subject);
    // echo "Subject : " . $subject[0] . PHP_EOL;
コード例 #15
 // Creates a new Lucene index at the configured location and walks all
 // uploads, building fields for each one and choosing a document loader by
 // file extension (.xlsx, .docx, or a plain document). Responds with "Index Ok".
 // NOTE(review): the statements that add the fields and the document to the
 // index are not visible in this excerpt.
 public function generateIndexAction()
     $searchIndexLocation = $this->getIndexLocation();
     $index = Lucene\Lucene::create($searchIndexLocation);
     $userTable = $this->getServiceLocator()->get('UserTable');
     $uploadTable = $this->getServiceLocator()->get('UploadTable');
     $allUploads = $uploadTable->fetchAll();
     foreach ($allUploads as $fileUpload) {
         $uploadOwner = $userTable->getById($fileUpload->getUserId());
         // create the lucene fields
         $fileUploadId = Document\Field::unIndexed('upload_id', $fileUpload->getId());
         $label = Document\Field::Text('label', $fileUpload->getLabel());
         $owner = Document\Field::Text('owner', $uploadOwner->getName());
         $uploadPath = $this->getFileUploadLocation();
         $fileName = $fileUpload->getFilename();
         $filePath = $uploadPath . DIRECTORY_SEPARATOR . $fileName;
         // substr_compare() against the tail of the name acts as an "ends with" test
         if (substr_compare($fileName, ".xlsx", strlen($fileName) - strlen(".xlsx"), strlen(".xlsx")) === 0) {
             // Index an Excel spreadsheet
             $indexDoc = Lucene\Document\Xlsx::loadXlsxFile($filePath);
         } else {
             if (substr_compare($fileName, ".docx", strlen($fileName) - strlen(".docx"), strlen(".docx")) === 0) {
                 // Index a Word document
                 $indexDoc = Lucene\Document\Docx::loadDocxFile($filePath);
             } else {
                 $indexDoc = new Lucene\Document();
         // create a new document and add all fields
         // NOTE(review): as shown, this overwrites the document chosen above — confirm against the full source
         $indexDoc = new Lucene\Document();
     $response = $this->getResponse();
     $response->setContent("Index Ok");
     return $response;
コード例 #16
  * Lists all Post models.
  * @return mixed
 // Lists posts and runs a Lucene search in the same request: the posts index
 // is created anew from all Post records, then the 'q' query parameter is
 // split into words and searched as a MultiTerm query. Renders the 'index' view.
 // NOTE(review): the statements adding each $doc to the index and committing
 // are not visible in this excerpt.
 public function actionIndex()
     $searchModel = new PostSearch();
     $dataProvider = $searchModel->search(Yii::$app->request->post());
     //setlocale(LC_ALL, 'en_US.UTF-8');
     setlocale(LC_CTYPE, 'ru_RU.UTF-8');
     // Use the case-insensitive UTF-8 analyzer as the default
     Lucene\Analysis\Analyzer\Analyzer::setDefault(new Lucene\Analysis\Analyzer\Common\Utf8\CaseInsensitive());
     // create blog posts index located in /data/posts_index ,make sure the folder is writable
     $index = Lucene\Lucene::create('data/posts_index');
     $posts = Post::find()->all();
     // iterate through posts and build the index
     foreach ($posts as $p) {
         $doc = new Lucene\Document();
         $doc->addField(Lucene\Document\Field::UnIndexed('entry_id', $p->id));
         $doc->addField(Lucene\Document\Field::Keyword('title', $p->title));
         $doc->addField(Lucene\Document\Field::text('contents', $p->content));
     // commit the index
     //Lucene\Analysis\Analyzer\Analyzer::setDefault(new Lucene\Analysis\Analyzer\Common\Utf8\CaseInsensitive());
     // explode the search query to individual words
     $words = explode(' ', urldecode(Yii::$app->getRequest()->getQueryParam('q')));
     // start a search query and add a term for each word to it
     $query = new Lucene\Search\Query\MultiTerm();
     foreach ($words as $w) {
         // No field name is passed to Term here
         $query->addTerm(new Lucene\Index\Term($w));
     // open and query the index
     $index = Lucene\Lucene::open('data/posts_index');
     $results = $index->find($query);
     // the search results
     return $this->render('index', ['searchModel' => $searchModel, 'dataProvider' => $dataProvider, 'search' => $results, 'query' => $query]);
コード例 #17
  * インデックスファイルを生成
 // Creates the Lucene index under CACHE_DIR and builds, for every listed wiki
 // page, a document with its title, URL and tag-stripped rendered contents.
 // NOTE(review): the bodies of the two skip conditions and the statement that
 // adds $doc to the index are not visible in this excerpt.
 public static function updateIndex()
     // Create the shared Tagger only once
     if (empty(self::$igo)) {
         self::$igo = new Tagger(array('dict_dir' => LIB_DIR . 'ipadic', 'reduce_mode' => true));
     Analyzer::setDefault(new Utf8());
     // Create the index
     $index = Lucene::create(CACHE_DIR . self::INDEX_NAME);
     foreach (Listing::pages() as $page) {
         if (empty($page)) {
         $wiki = Factory::Wiki($page);
         // Skip when there is no read permission
         if (!$wiki->isReadable() || $wiki->isHidden()) {
         			// HTML output
         			// NOTE(review): $html is not read anywhere in the visible code; these
         			// tab-indented lines look like leftover or commented-out code — confirm.
         			$html[] = '<html><head>';
         			$html[] = '<meta http-equiv="Content-type" content="text/html; charset=UTF-8"/>';
         			$html[] = '<title>' . $wiki->title() . '</title>';
         			$html[] = '</head>';
         			$html[] = '<body>' . $wiki->render() . '</body>';
         			$html[] = '</html>';
         $doc = new LuceneDoc();
         $doc->addField(Field::Text('title', $wiki->title()));
         // Store document URL to identify it in the search results
         $doc->addField(Field::Text('url', $wiki->uri()));
         // Index document contents
         //$contents = join(" ", self::$igo->wakati(strip_tags($wiki->render())));
         $contents = strip_tags($wiki->render());
         $doc->addField(Field::UnStored('contents', $contents));
         // Register the document in the index
コード例 #18
  * create document from configured fields within extracted data
  * @param string $url
  * @param array $page
  * @return Document
 // Creates a Lucene document for a crawled page: the URL as a keyword, the
 // list of ancestor paths, boosted title/description/body/image fields with
 // ASCII-transliterated variants, the configured headline fields and,
 // when enabled, page id and route name.
 // NOTE(review): no break statements and no $document->addField() calls for
 // the $field / $fieldTranslit values built below are visible in this excerpt.
 protected function createDocument($url, $page)
     $document = new Document();
     if (!isset($page['status_code'])) {
         // 00 is an octal literal, i.e. integer 0
         $page['status_code'] = 00;
     // The iconv //TRANSLIT calls below run under this locale
     setlocale(LC_ALL, "cs_CZ.UTF-8");
     $document->addField(Field::keyword('url', $url));
     // ancestor URLs to search by URL
     $urlParts = parse_url($url);
     if (isset($urlParts['path']) && $urlParts['path'] && strlen($urlParts['path']) > 1) {
         $uri = $urlParts['path'];
         $uris = array($uri);
         // Cut off the last path segment repeatedly, collecting each shorter path
         do {
             $uri = substr($uri, 0, strrpos($uri, '/'));
             $uris[] = $uri;
         } while (strrpos($uri, '/') > 1);
         $document->addField(Field::text(Page::URIS_KEY, implode(' ', $uris)));
     foreach (array(Page::TITLE_KEY, Page::DESCRIPTION_KEY, Page::BODY_KEY, Page::IMAGE_KEY) as $fieldName) {
         // Missing page entries are indexed as an empty string
         $fieldValue = isset($page[$fieldName]) ? $page[$fieldName] : '';
         switch ($fieldName) {
             case Page::TITLE_KEY:
             case Page::DESCRIPTION_KEY:
             case Page::BODY_KEY:
                 $field = Field::text($fieldName, $fieldValue);
                 // translit
                 $fieldTranslit = Field::text($fieldName . '_translit', str_replace("'", '', iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $fieldValue)));
                 $fieldTranslit->boost = isset($this->parameters[self::BOOST_PARAM][$fieldName]) ? $this->parameters[self::BOOST_PARAM][$fieldName] : 1.25;
             case Page::IMAGE_KEY:
                 $field = Field::unIndexed($fieldName, $fieldValue);
                 $translitValue = str_replace("'", '', iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $fieldValue));
                 $field = Field::text($fieldName, $fieldValue . ($translitValue != $fieldValue ? ' ' . $translitValue : ''));
         // Boost comes from the configured parameters, defaulting to 1.25
         $field->boost = isset($this->parameters[self::BOOST_PARAM][$fieldName]) ? $this->parameters[self::BOOST_PARAM][$fieldName] : 1.25;
     // title tags as configured i.e. h1, h2, ...
     foreach ($this->parameters[self::TITLE_TAGS_PARAM] as $fieldName) {
         $fieldValue = Page::hasHeadlineType($page, $fieldName) ? Page::getHeadline($page, $fieldName) : '';
         $field = Field::text($fieldName, $fieldValue);
         // Default boost here is 1 for the plain field and 1.25 for the transliterated one
         $field->boost = isset($this->parameters[self::BOOST_PARAM][$fieldName]) ? $this->parameters[self::BOOST_PARAM][$fieldName] : 1;
         $fieldTranslit = Field::text($fieldName . '_translit', str_replace("'", '', iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $fieldValue)));
         $fieldTranslit->boost = isset($this->parameters[self::BOOST_PARAM][$fieldName]) ? $this->parameters[self::BOOST_PARAM][$fieldName] : 1.25;
     // page ID if selector defined
     if ($this->parameters[self::PAGE_ID_PARAM]) {
         $fieldValue = isset($page[Page::PAGE_ID_KEY]) ? $page[Page::PAGE_ID_KEY] : '';
         $field = Field::unIndexed(Page::PAGE_ID_KEY, $fieldValue);
     // route name if selector defined
     if ($this->parameters[self::ROUTE_NAME_PARAM]) {
         $fieldValue = isset($page[Page::ROUTE_NAME_KEY]) ? $page[Page::ROUTE_NAME_KEY] : '';
         $field = Field::unIndexed(Page::ROUTE_NAME_KEY, $fieldValue);
     return $document;
コード例 #19
  * Object constructor
  * @param string  $fileName
  * @param boolean $storeContent
  * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException
  * @throws \ZendSearch\Lucene\Exception\RuntimeException
 // Builds this document from a .docx file: the office document part is found
 // through _rels/.rels, the text of its runs is gathered paragraph by
 // paragraph, and file name, body, meta data and title are stored as fields.
 // Throws ExtensionNotLoadedException when ZipArchive is unavailable and
 // RuntimeException when _rels/.rels cannot be read from the package.
 private function __construct($fileName, $storeContent)
     if (!class_exists('ZipArchive', false)) {
         throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
     // Document data holders
     $documentBody = array();
     $coreProperties = array();
     // Open AbstractOpenXML package
     // NOTE(review): the call that opens $fileName is not visible in this excerpt
     $package = new \ZipArchive();
     // Read relations and search for officeDocument
     $relationsXml = $package->getFromName('_rels/.rels');
     if ($relationsXml === false) {
         throw new RuntimeException('Invalid archive or corrupted .docx file.');
     $relations = XMLSecurity::scan($relationsXml);
     foreach ($relations->Relationship as $rel) {
         if ($rel["Type"] == AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
             // Found office document! Read in contents...
             $contents = XMLSecurity::scan($package->getFromName($this->absoluteZipPath(dirname($rel['Target']) . '/' . basename($rel['Target']))));
             $contents->registerXPathNamespace('w', self::SCHEMA_WORDPROCESSINGML);
             $paragraphs = $contents->xpath('//w:body/w:p');
             foreach ($paragraphs as $paragraph) {
                 // Select only the text (w:t) and break (w:br) children of each run
                 $runs = $paragraph->xpath('.//w:r/*[name() = "w:t" or name() = "w:br"]');
                 if ($runs === false) {
                     // Paragraph doesn't contain any text or breaks
                 foreach ($runs as $run) {
                     if ($run->getName() == 'br') {
                         // Break element
                         $documentBody[] = ' ';
                     } else {
                         $documentBody[] = (string) $run;
                 // Add space after each paragraph. So they are not bound together.
                 $documentBody[] = ' ';
     // Read core properties
     $coreProperties = $this->extractMetaData($package);
     // Close file
     // Store filename
     $this->addField(Field::Text('filename', $fileName, 'UTF-8'));
     // Store contents
     if ($storeContent) {
         $this->addField(Field::Text('body', implode('', $documentBody), 'UTF-8'));
     } else {
         $this->addField(Field::UnStored('body', implode('', $documentBody), 'UTF-8'));
     // Store meta data properties
     foreach ($coreProperties as $key => $value) {
         $this->addField(Field::Text($key, $value, 'UTF-8'));
     // Store title (if not present in meta data)
     if (!isset($coreProperties['title'])) {
         $this->addField(Field::Text('title', $fileName, 'UTF-8'));
コード例 #20
  * Add index
  * @param integer $id
  * @param SearchCollection $index
 // Starts a Lucene document for the given id when the 'zend_search' / 'index'
 // configuration entry exists, then iterates over the entries of $index.
 // NOTE(review): the loop body and everything after it are not visible in this excerpt.
 public function add($id, SearchCollection $index)
     if ($this->config()->exists('zend_search', 'index')) {
         $document = new Document();
         $document->addField(Field::keyword('id', $id));
         foreach ($index as $field) {
コード例 #21
  * @group ZF-9680
 // Sets up an index under _index/_files and a document with two keyword
 // fields ('_id' and 'bla').
 // NOTE(review): the statements exercising deletion and the assertions are
 // not visible in this excerpt.
 public function testIsDeletedWithoutExplicitCommit()
     $index = Lucene\Lucene::create(__DIR__ . '/_index/_files');
     $document = new Document();
     $document->addField(Document\Field::Keyword('_id', 'myId'));
     $document->addField(Document\Field::Keyword('bla', 'blubb'));
コード例 #22
 // Builds a Lucene document for this record: primary key, course code,
 // cross-listed codes, instructor and comments.
 // NOTE(review): the statement adding the document to $indexer and the bodies
 // of the $commit / $optimize branches are not visible in this excerpt.
 public function index($indexer, $commit = true, $optimize = true)
     $document = new Document();
     $document->addField(Field::keyword('pk', $this->getId()));
     $document->addField(Field::Text('course', $this->getSubjectcode()));
     // ';', ',' and '|' separators are replaced with spaces before indexing
     $document->addField(Field::Text('cross-listed', str_replace(array(';', ',', '|'), ' ', $this->getCrossListed())));
     $document->addField(Field::Text('instructor', $this->getLegalContentOwner()));
     $document->addField(Field::Unstored('comments', $this->getComments()));
     if ($commit) {
     if ($optimize) {
コード例 #23
  * Object constructor
  * @param string  $data         HTML string (may be HTML fragment, )
  * @param boolean $isFile
  * @param boolean $storeContent
  * @param string  $defaultEncoding   HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
 // Builds this document from HTML given as a string or a file name: the markup
 // is loaded into a DOMDocument, then title, named meta tags and body text are
 // stored as fields, and link targets are collected into $this->_links and
 // $this->_headerLinks.
 // NOTE(review): the initial load of $htmlData into $this->_doc is not visible
 // in this excerpt; as shown, the encoding check runs on a new DOMDocument.
 private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
     $this->_doc = new \DOMDocument();
     $this->_doc->substituteEntities = true;
     // $data is a file name when $isFile is set, otherwise the HTML itself
     if ($isFile) {
         $htmlData = file_get_contents($data);
     } else {
         $htmlData = $data;
     if ($this->_doc->encoding === null) {
         // Document encoding is not recognized
         /** @todo improve HTML vs HTML fragment recognition */
         if (preg_match('/<html>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
             // It's an HTML document
             // Add additional HEAD section and recognize document
             // The markup is split right after the <html> tag; both halves are
             // converted from $defaultEncoding to UTF-8 around an injected charset META
             $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);
             $this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset)) . '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>' . iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset)));
             // Remove additional HEAD section
             // NOTE(review): the removal call on $head is not visible in this excerpt
             $xpath = new \DOMXPath($this->_doc);
             $head = $xpath->query('/html/head')->item(0);
         } else {
             // It's an HTML fragment
             $this->_doc->loadHTML('<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>' . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData) . '</body></html>');
     /** @todo Add correction of wrong HTML encoding recognition processing
      * The case is:
      * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
      * even $this->_doc->encoding demonstrates another recognized encoding
     $xpath = new \DOMXPath($this->_doc);
     $docTitle = '';
     $titleNodes = $xpath->query('/html/head/title');
     foreach ($titleNodes as $titleNode) {
         // title should always have only one entry, but we process all nodeset entries
         $docTitle .= $titleNode->nodeValue . ' ';
     $this->addField(Field::Text('title', $docTitle, 'UTF-8'));
     // Every meta tag with a name attribute becomes a field named after it
     $metaNodes = $xpath->query('/html/head/meta[@name]');
     foreach ($metaNodes as $metaNode) {
         $this->addField(Field::Text($metaNode->getAttribute('name'), $metaNode->getAttribute('content'), 'UTF-8'));
     $docBody = '';
     $bodyNodes = $xpath->query('/html/body');
     foreach ($bodyNodes as $bodyNode) {
         // body should always have only one entry, but we process all nodeset entries
         $this->_retrieveNodeText($bodyNode, $docBody);
     if ($storeContent) {
         $this->addField(Field::Text('body', $docBody, 'UTF-8'));
     } else {
         $this->addField(Field::UnStored('body', $docBody, 'UTF-8'));
     // Collect href targets of <a> elements, leaving out rel="nofollow"
     // links when self::$_excludeNoFollowLinks is set
     $linkNodes = $this->_doc->getElementsByTagName('a');
     foreach ($linkNodes as $linkNode) {
         if (($href = $linkNode->getAttribute('href')) != '' && (!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow')) {
             $this->_links[] = $href;
     // Same rule for <area> elements
     $linkNodes = $this->_doc->getElementsByTagName('area');
     foreach ($linkNodes as $linkNode) {
         if (($href = $linkNode->getAttribute('href')) != '' && (!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow')) {
             $this->_links[] = $href;
     $this->_links = array_unique($this->_links);
     // Header <link> targets are collected separately
     $linkNodes = $xpath->query('/html/head/link');
     foreach ($linkNodes as $linkNode) {
         if (($href = $linkNode->getAttribute('href')) != '') {
             $this->_headerLinks[] = $href;
     $this->_headerLinks = array_unique($this->_headerLinks);
コード例 #24
 // Builds a Lucene document for a communication: its id as a keyword, and
 // title, short description, plain-text body and author name as un-stored fields.
 // NOTE(review): the statements adding the document to $index and committing
 // are not visible in this excerpt.
 public function addAetCommunicationToSearchIndex($index, Communication $aetCommunication)
     // Create a new document
     $document = new Document();
     $document->addField(Field::keyword('dbId', $aetCommunication->getId(), 'utf-8'));
     $document->addField(Field::unStored('title', $aetCommunication->getTitle(), 'utf-8'));
     $document->addField(Field::unStored('shortdesc', $aetCommunication->getShortDesc(), 'utf-8'));
     // Tags are stripped first, then HTML entities are decoded.
     // NOTE(review): only ENT_SUBSTITUTE is passed as flags, so no quote flag is
     // set and quote entities presumably stay encoded — confirm this is intended.
     $document->addField(Field::unStored('body', html_entity_decode(strip_tags($aetCommunication->getBody()), ENT_SUBSTITUTE, 'UTF-8'), 'utf-8'));
     // Author is indexed as "<first name> <last name>"
     $document->addField(Field::unStored('author', $aetCommunication->getUser()->getFirstname() . " " . $aetCommunication->getUser()->getLastname(), 'utf-8'));
     // Add your document to the index
     // Commit your change
コード例 #25
ファイル: Job.php プロジェクト: arossokha/symfonytest
 // Rebuilds this job's entry in the index returned by self::getLuceneIndex(): looks up the
 // existing hits for the job's primary key, then builds a fresh document holding the key and
 // the job's position, company, location and description.
 public function updateLuceneIndex()
     $index = self::getLuceneIndex();
     // remove existing entries
     // The query string is 'pk:' followed by this job's id, i.e. it targets the 'pk' field
     // written further down.
     foreach ($index->find('pk:' . $this->getId()) as $hit) {
     // NOTE(review): the loop body (and its closing brace) is not present in this excerpt;
     // presumably it deletes each hit from $index. Confirm.
     // don't index expired and non-activated jobs
     if ($this->isExpired() || !$this->getIsActivated()) {
     // NOTE(review): the body of this guard is not visible here; presumably an early return.
     $doc = new Document();
     // store job primary key to identify it in the search results
     $doc->addField(Document\Field::Keyword('pk', $this->getId()));
     // index job fields
     // All four are created with Field::UnStored and declared as 'utf-8'.
     $doc->addField(Document\Field::UnStored('position', $this->getPosition(), 'utf-8'));
     $doc->addField(Document\Field::UnStored('company', $this->getCompany(), 'utf-8'));
     $doc->addField(Document\Field::UnStored('location', $this->getLocation(), 'utf-8'));
     $doc->addField(Document\Field::UnStored('description', $this->getDescription(), 'utf-8'));
     // add job to the index
     // NOTE(review): the statement that hands $doc to $index is not visible in this excerpt.
コード例 #26
  * Object constructor
  * @param string  $fileName
  * @param boolean $storeContent
  * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException
  * @throws \ZendSearch\Lucene\Exception\RuntimeException
 // Builds a search document from an .xlsx package: reads the package relations, collects the
 // shared strings and worksheets of the office document, flattens every cell value into
 // $documentBody, and finally adds 'filename', 'body', the core properties and (when the
 // properties carry none) a 'title' field.
 //
 // $fileName     is stored in the 'filename' field and used as the fallback title.
 // $storeContent selects Field::Text (true) or Field::UnStored (false) for the 'body' field.
 //
 // NOTE(review): this excerpt contains no brace-only lines and several statements appear to be
 // missing (see the notes below); compare with the original source before editing.
 private function __construct($fileName, $storeContent)
     // Fail early when the ZipArchive class is not already loaded (autoloading is disabled by
     // the second argument).
     if (!class_exists('ZipArchive', false)) {
         throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
     // Document data holders
     $sharedStrings = array();
     $worksheets = array();
     $documentBody = array();
     $coreProperties = array();
     // Open AbstractOpenXML package
     $package = new \ZipArchive();
     // NOTE(review): no call that opens $fileName on $package is visible in this excerpt.
     // Read relations and search for officeDocument
     $relationsXml = $package->getFromName('_rels/.rels');
     // getFromName() yielding false is treated as an invalid or corrupted package.
     if ($relationsXml === false) {
         throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
     $relations = XmlSecurity::scan($relationsXml);
     foreach ($relations->Relationship as $rel) {
         if ($rel["Type"] == AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
             // Found office document! Read relations for workbook...
             // The relations part is looked up at <dir of target>/_rels/<target file name>.rels
             $workbookRelations = XmlSecurity::scan($package->getFromName($this->absoluteZipPath(dirname($rel["Target"]) . "/_rels/" . basename($rel["Target"]) . ".rels")));
             $workbookRelations->registerXPathNamespace("rel", AbstractOpenXML::SCHEMA_RELATIONSHIP);
             // Read shared strings
             $sharedStringsPath = $workbookRelations->xpath("rel:Relationship[@Type='" . self::SCHEMA_SHAREDSTRINGS . "']");
             // NOTE(review): index 0 is read unconditionally, so this assumes the query above
             // matched at least one relationship.
             $sharedStringsPath = (string) $sharedStringsPath[0]['Target'];
             $xmlStrings = XmlSecurity::scan($package->getFromName($this->absoluteZipPath(dirname($rel["Target"]) . "/" . $sharedStringsPath)));
             if (isset($xmlStrings) && isset($xmlStrings->si)) {
                 foreach ($xmlStrings->si as $val) {
                     // An <si> entry contributes either its <t> child as-is or, when it has
                     // <r> children instead, the result of _parseRichText()
                     if (isset($val->t)) {
                         $sharedStrings[] = (string) $val->t;
                     } elseif (isset($val->r)) {
                         $sharedStrings[] = $this->_parseRichText($val);
             // Loop relations for workbook and extract worksheets...
             foreach ($workbookRelations->Relationship as $workbookRelation) {
                 if ($workbookRelation["Type"] == self::SCHEMA_WORKSHEETRELATION) {
                     // Worksheets are keyed by the relationship Id with the 'rId' text removed
                     $worksheets[str_replace('rId', '', (string) $workbookRelation["Id"])] = XmlSecurity::scan($package->getFromName($this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($workbookRelation["Target"]) . "/" . basename($workbookRelation["Target"]))));
     // Sort worksheets
     // NOTE(review): no sorting statement is visible in this excerpt.
     // Extract contents from worksheets
     foreach ($worksheets as $sheetKey => $worksheet) {
         foreach ($worksheet->sheetData->row as $row) {
             foreach ($row->c as $c) {
                 // Determine data type
                 // The cell's "t" attribute selects the branch of the switch below.
                 $dataType = (string) $c["t"];
                 // NOTE(review): neither break statements nor a default label are visible in
                 // this excerpt; as shown every case would fall through to the next one.
                 switch ($dataType) {
                     case "s":
                         // Value is a shared string
                         // <v> holds the numeric position inside $sharedStrings
                         if ((string) $c->v != '') {
                             $value = $sharedStrings[intval($c->v)];
                         } else {
                             $value = '';
                     case "b":
                         // Value is boolean
                         $value = (string) $c->v;
                         if ($value == '0') {
                             $value = false;
                         } elseif ($value == '1') {
                             $value = true;
                         } else {
                             $value = (bool) $c->v;
                     case "inlineStr":
                         // Value is rich text inline
                         $value = $this->_parseRichText($c->is);
                     case "e":
                         // Value is an error message
                         if ((string) $c->v != '') {
                             $value = (string) $c->v;
                         } else {
                             $value = '';
                         // Value is a string
                         // NOTE(review): presumably this is the default branch of the switch;
                         // its label is not visible here.
                         $value = (string) $c->v;
                         // Check for numeric values
                         if (is_numeric($value) && $dataType != 's') {
                             if ($value == (int) $value) {
                                 $value = (int) $value;
                             } elseif ($value == (double) $value) {
                                 $value = (double) $value;
                             // NOTE(review): the condition below repeats the previous elseif
                             // verbatim, so this branch can never be taken.
                             } elseif ($value == (double) $value) {
                                 $value = (double) $value;
                 // One entry per cell; the entries are joined with spaces further down
                 $documentBody[] = $value;
     // Read core properties
     $coreProperties = $this->extractMetaData($package);
     // Close file
     // NOTE(review): no statement closing $package is visible in this excerpt.
     // Store filename
     $this->addField(Field::Text('filename', $fileName, 'UTF-8'));
     // Store contents
     if ($storeContent) {
         $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
     } else {
         $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
     // Store meta data properties
     foreach ($coreProperties as $key => $value) {
         $this->addField(Field::Text($key, $value, 'UTF-8'));
     // Store title (if not present in meta data)
     if (!isset($coreProperties['title'])) {
         $this->addField(Field::Text('title', $fileName, 'UTF-8'));
コード例 #27
  * Object constructor
  * @param string  $fileName
  * @param boolean $storeContent
  * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException
  * @throws \ZendSearch\Lucene\Exception\RuntimeException
 // Builds a search document from a .pptx package: reads the package relations, loads every
 // slide of the office document together with its notes, collects the text of all a:t
 // elements into $documentBody, and finally adds 'filename', 'body', the core properties and
 // (when the properties carry none) a 'title' field.
 //
 // $fileName     is stored in the 'filename' field and used as the fallback title.
 // $storeContent selects Field::Text (true) or Field::UnStored (false) for the 'body' field.
 //
 // NOTE(review): this excerpt contains no brace-only lines and several statements appear to be
 // missing (see the notes below); compare with the original source before editing.
 private function __construct($fileName, $storeContent)
     // Fail early when the ZipArchive class is not already loaded (autoloading is disabled by
     // the second argument).
     if (!class_exists('ZipArchive', false)) {
         throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
     // Document data holders
     $slides = array();
     $slideNotes = array();
     $documentBody = array();
     $coreProperties = array();
     // Open AbstractOpenXML package
     $package = new \ZipArchive();
     // NOTE(review): no call that opens $fileName on $package is visible in this excerpt.
     // Read relations and search for officeDocument
     $relationsXml = $package->getFromName('_rels/.rels');
     // getFromName() yielding false is treated as an invalid or corrupted package.
     if ($relationsXml === false) {
         throw new RuntimeException('Invalid archive or corrupted .pptx file.');
     // NOTE(review): every XML part in this method is parsed with plain simplexml_load_string();
     // if packages can come from untrusted sources, confirm that entity expansion is handled
     // safely (e.g. by a hardened XML scanner).
     $relations = simplexml_load_string($relationsXml);
     foreach ($relations->Relationship as $rel) {
         if ($rel["Type"] == AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
             // Found office document! Search for slides...
             // The relations part is looked up at <dir of target>/_rels/<target file name>.rels
             $slideRelations = simplexml_load_string($package->getFromName($this->absoluteZipPath(dirname($rel["Target"]) . "/_rels/" . basename($rel["Target"]) . ".rels")));
             foreach ($slideRelations->Relationship as $slideRel) {
                 if ($slideRel["Type"] == self::SCHEMA_SLIDERELATION) {
                     // Found slide!
                     // Slides are keyed by the relationship Id with the 'rId' text removed
                     $slides[str_replace('rId', '', (string) $slideRel["Id"])] = simplexml_load_string($package->getFromName($this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/" . basename($slideRel["Target"]))));
                     // Search for slide notes
                     $slideNotesRelations = simplexml_load_string($package->getFromName($this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/_rels/" . basename($slideRel["Target"]) . ".rels")));
                     foreach ($slideNotesRelations->Relationship as $slideNoteRel) {
                         if ($slideNoteRel["Type"] == self::SCHEMA_SLIDENOTESRELATION) {
                             // Found slide notes!
                             // Stored under the owning slide's key (derived from $slideRel, not
                             // $slideNoteRel) so the lookup by $slideKey below finds them
                             $slideNotes[str_replace('rId', '', (string) $slideRel["Id"])] = simplexml_load_string($package->getFromName($this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/" . dirname($slideNoteRel["Target"]) . "/" . basename($slideNoteRel["Target"]))));
     // Sort slides
     // NOTE(review): no sorting statement is visible in this excerpt.
     // Extract contents from slides
     foreach ($slides as $slideKey => $slide) {
         // Register namespaces
         $slide->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML);
         $slide->registerXPathNamespace("a", self::SCHEMA_DRAWINGML);
         // Fetch all text
         $textElements = $slide->xpath('//a:t');
         foreach ($textElements as $textElement) {
             $documentBody[] = (string) $textElement;
         // Extract contents from slide notes
         // The notes text is appended directly after the text of the slide it belongs to.
         if (isset($slideNotes[$slideKey])) {
             // Fetch slide note
             $slideNote = $slideNotes[$slideKey];
             // Register namespaces
             $slideNote->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML);
             $slideNote->registerXPathNamespace("a", self::SCHEMA_DRAWINGML);
             // Fetch all text
             $textElements = $slideNote->xpath('//a:t');
             foreach ($textElements as $textElement) {
                 $documentBody[] = (string) $textElement;
     // Read core properties
     $coreProperties = $this->extractMetaData($package);
     // Close file
     // NOTE(review): no statement closing $package is visible in this excerpt.
     // Store filename
     $this->addField(Field::Text('filename', $fileName, 'UTF-8'));
     // Store contents
     // The collected text fragments are joined with single spaces.
     if ($storeContent) {
         $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
     } else {
         $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
     // Store meta data properties
     foreach ($coreProperties as $key => $value) {
         $this->addField(Field::Text($key, $value, 'UTF-8'));
     // Store title (if not present in meta data)
     if (!isset($coreProperties['title'])) {
         $this->addField(Field::Text('title', $fileName, 'UTF-8'));
コード例 #28
 // Creates a Lucene index at the location returned by getIndexLocation() and walks over all
 // users from the user table, preparing one set of fields and one document per user.
 public function generateSearchAction()
     $searchIndexLocation = $this->getIndexLocation();
     // NOTE(review): Lucene::create() is called on every invocation; presumably this starts a
     // fresh index at that location. Confirm against the ZendSearch documentation.
     $index = Lucene\Lucene::create($searchIndexLocation);
     // NOTE(review): the meaning of the `false` argument is defined by the user table class,
     // which is not part of this excerpt.
     $allUsers = $this->getUserTable()->fetchAll(false);
     foreach ($allUsers as $user) {
         // 'userId' and 'activated' are built with Field::keyword, the remaining values with
         // Field::text
         $id = Document\Field::keyword('userId', $user->userId);
         $firstName = Document\Field::text('firstName', $user->firstName);
         $lastName = Document\Field::text('lastName', $user->lastName);
         $email = Document\Field::text('email', $user->email);
         $role = Document\Field::text('role', $user->role);
         $activated = Document\Field::keyword('activated', $user->activated);
         $indexDoc = new Lucene\Document();
         // NOTE(review): the statements that attach the fields above to $indexDoc and add the
         // document to $index are not visible in this excerpt.
コード例 #29
  * Update document in index for model
  * @param Model $model
 // Builds the index document for a model: two keyword fields (primary key and class UID), one
 // unStored field per configured base field and optional attribute, and a document-level
 // boost. All names, values and boosts come from $this->config.
 //
 // NOTE(review): several statements appear to be missing from this excerpt (see notes below).
 public function update(Model $model)
     // Remove any existing documents for model.
     // NOTE(review): no removal statement is visible in this excerpt.
     // Create new document for model.
     $doc = new Document();
     // $name/$value are reused below for the class UID pair
     list($name, $value) = $this->config->primaryKeyPair($model);
     // Add private key.
     $doc->addField(Field::keyword($name, $value));
     // Add model's class UID.
     list($name, $value) = $this->config->classUidPair($model);
     // Add class uid for identification of model's class.
     $doc->addField(Field::Keyword($name, $value));
     // Get base fields.
     $fields = $this->config->fields($model);
     // Add fields to document to be indexed (but not stored).
     foreach ($fields as $fieldName => $options) {
         // The trimmed field name is read as a property of the model; HTML tags are stripped
         // from the trimmed value before indexing
         $fieldValue = $model->{trim($fieldName)};
         $field = Field::unStored(trim($fieldName), strip_tags(trim($fieldValue)));
         $field->boost = array_get($options, 'boost');
         // NOTE(review): $field is never handed to $doc in the visible code; presumably an
         // addField() call was dropped here.
     // Get dynamic fields.
     $optionalAttributes = $this->config->optionalAttributes($model);
     // Add optional attributes to document to be indexed (but not stored).
     foreach ($optionalAttributes as $fieldName => $options) {
         // Unlike the base fields, the value comes from the options array ("value" key)
         $fieldValue = array_get($options, "value");
         $field = Field::unStored(trim($fieldName), strip_tags(trim($fieldValue)));
         $field->boost = array_get($options, "boost");
         // NOTE(review): as in the loop above, no addField() call for $field is visible.
     // Set boost for model.
     $doc->boost = $this->config->boost($model);
     // Add document to index.
     // NOTE(review): no statement adding $doc to an index is visible in this excerpt.
コード例 #30
  * Indexa dados nos arquivos de json
 public function index()
     $dir = realpath(dirname(__FILE__)) . DIRECTORY_SEPARATOR . "data" . DIRECTORY_SEPARATOR;
     $jsonDir = $dir . "json";
     $indexDir = $dir . "index";
     // ler aquivos json
     $files = scandir($jsonDir);
     foreach ($files as $file) {
         if ($file == '.' || $file == '..') {
         // Se arquivo existe
         if (is_file($jsonDir . DIRECTORY_SEPARATOR . $file)) {
             $json = json_decode(file_get_contents($jsonDir . DIRECTORY_SEPARATOR . $file));
             $indexName = substr($file, 0, -5);
             // Cria index
             $index = Lucene\Lucene::create($indexDir . DIRECTORY_SEPARATOR . $indexName);
             // Cria documento e define campos para indexar
             foreach ($json as $entry) {
                 $doc = new Lucene\Document();
                 $doc->addField(Lucene\Document\Field::Text('url', $entry->title));
                 $doc->addField(Lucene\Document\Field::UnStored('contents', $entry->text));