Field::UnStored, ZendSearch\Lucene\Document PHPのコード例

コード例 #1

0

ファイルを表示

ファイル: pdf.php プロジェクト: ntvis/search_lucene

 /**
  * Builds the Lucene document from raw PDF data.
  *
  * The extracted text is added as the "body" field and every entry returned
  * by the parser's getDetails() is added as a text field under its
  * lower-cased name ("author" is renamed to "creator").
  *
  * @param string  $data         PDF content handed to the parser
  * @param boolean $storeContent when true the body is added with Field::Text,
  *                              otherwise with Field::UnStored
  * @throws NotIndexedException wrapping any \Exception raised while parsing
  *                             the PDF or adding its fields
  */
 private function __construct($data, $storeContent)
 {
     //TODO check PDF >1.5 metadata extraction
     $pdfParser = new Parser();
     try {
         // Extract the textual content
         $parsedPdf = $pdfParser->parseContent($data);
         $text = $parsedPdf->getText();
         $bodyField = $storeContent
             ? Document\Field::Text('body', $text, 'UTF-8')
             : Document\Field::UnStored('body', $text, 'UTF-8');
         $this->addField($bodyField);
         // Add the meta data properties as text fields
         foreach ($parsedPdf->getDetails() as $name => $detail) {
             $fieldName = strtolower($name);
             if ($fieldName === 'author') {
                 $fieldName = 'creator';
             }
             $this->addField(Document\Field::Text($fieldName, $detail, 'UTF-8'));
         }
     } catch (\Exception $failure) {
         // The original exception is passed along as the "previous" argument
         throw new NotIndexedException(null, null, $failure);
     }
 }

コード例 #2

0

ファイルを表示

ファイル: FieldTest.php プロジェクト: tonylow/skillslink

 /**
  * Checks the flags and values of a field created by Field::UnStored():
  * indexed and tokenized, but neither stored nor binary, with the default
  * boost of 1 and the default encoding 'UTF-8'.
  */
 public function testUnStored()
 {
     $field = Document\Field::UnStored('field', 'value');
     // PHPUnit expects assertEquals($expected, $actual); keeping that order
     // makes failure messages report "expected"/"actual" the right way round.
     $this->assertEquals(1, $field->boost);
     $this->assertEquals('UTF-8', $field->encoding);
     $this->assertEquals(false, $field->isBinary);
     $this->assertEquals(true, $field->isIndexed);
     $this->assertEquals(false, $field->isStored);
     $this->assertEquals(true, $field->isTokenized);
     $this->assertEquals('field', $field->name);
     $this->assertEquals('value', $field->value);
 }

コード例 #3

0

ファイルを表示

ファイル: odt.php プロジェクト: ntvis/search_lucene

 /**
  * Builds the Lucene document from an OpenDocument Text (.odt) file.
  *
  * Reads content.xml from the archive, collects the text of all text:h
  * (headline) and text:p (paragraph) elements and adds them as the
  * "headlines" and "body" fields, followed by the meta data returned by
  * extractMetaData(). When the meta data has no "title", the file name is
  * used as the title.
  *
  * @param string $fileName      path of the .odt file to open
  * @param boolean $storeContent when true headlines/body are added with
  *                              Field::Text, otherwise with Field::UnStored
  * @throws ExtensionNotLoadedException when the ZipArchive class is not loaded
  * @throws RuntimeException when the archive cannot be opened, has no
  *                          content.xml, or content.xml is not parsable XML
  */
 private function __construct($fileName, $storeContent)
 {
     if (!class_exists('ZipArchive', false)) {
         throw new ExtensionNotLoadedException('Open Document Text processing functionality requires Zip extension to be loaded');
     }
     // Document data holders
     $documentHeadlines = array();
     $documentParagraphs = array();
     // Open the ODT package. open() returns true on success and an error code
     // otherwise; using an unopened archive fails later in less obvious ways.
     $package = new \ZipArchive();
     if ($package->open($fileName) !== true) {
         throw new RuntimeException('Invalid archive or corrupted .odt file.');
     }
     // Read the document content
     $content = $package->getFromName('content.xml');
     if ($content === false) {
         $package->close();
         throw new RuntimeException('Invalid archive or corrupted .odt file.');
     }
     // Prevent php from loading remote resources
     // NOTE(review): libxml_disable_entity_loader() is deprecated as of PHP 8.0.
     $loadEntities = libxml_disable_entity_loader(true);
     $sxe = simplexml_load_string($content, 'SimpleXMLElement', LIBXML_NOBLANKS | LIBXML_COMPACT);
     // Restore entity loader state
     libxml_disable_entity_loader($loadEntities);
     if ($sxe === false) {
         // content.xml is not well-formed; calling xpath() on false would be fatal
         $package->close();
         throw new RuntimeException('Invalid archive or corrupted .odt file.');
     }
     // xpath() does not return an array on failure; fall back to an empty list
     foreach ($sxe->xpath('//text:h') ?: array() as $headline) {
         $documentHeadlines[] = strip_tags($headline->asXML());
     }
     foreach ($sxe->xpath('//text:p') ?: array() as $paragraph) {
         $documentParagraphs[] = strip_tags($paragraph->asXML());
     }
     // Read core properties
     $coreProperties = $this->extractMetaData($package);
     // Close file
     $package->close();
     // Store contents
     if ($storeContent) {
         $this->addField(Field::Text('headlines', implode(' ', $documentHeadlines), 'UTF-8'));
         $this->addField(Field::Text('body', implode('', $documentParagraphs), 'UTF-8'));
     } else {
         $this->addField(Field::UnStored('headlines', implode(' ', $documentHeadlines), 'UTF-8'));
         $this->addField(Field::UnStored('body', implode('', $documentParagraphs), 'UTF-8'));
     }
     // Store meta data properties
     foreach ($coreProperties as $key => $value) {
         $this->addField(Field::Text($key, $value, 'UTF-8'));
     }
     // Store title (if not present in meta data)
     if (!isset($coreProperties['title'])) {
         $this->addField(Field::Text('title', $fileName, 'UTF-8'));
     }
 }

コード例 #4

0

ファイルを表示

ファイル: SearchLucene.php プロジェクト: logue/pukiwiki_adv

 /**
  * Generates the search index files.
  *
  * Creates the index at CACHE_DIR . self::INDEX_NAME and adds one document
  * per page returned by Listing::pages(), skipping empty entries and pages
  * that are not readable or are hidden. Each document carries the "title"
  * and "url" text fields and the tag-stripped rendered page as the
  * "contents" field (Field::UnStored). The index is optimized at the end.
  *
  * The tagger in self::$igo is initialised here when empty, but this method
  * does not otherwise use it.
  */
 public static function updateIndex()
 {
     if (empty(self::$igo)) {
         $taggerOptions = array('dict_dir' => LIB_DIR . 'ipadic', 'reduce_mode' => true);
         self::$igo = new Tagger($taggerOptions);
     }
     Analyzer::setDefault(new Utf8());
     // Create the index
     $luceneIndex = Lucene::create(CACHE_DIR . self::INDEX_NAME);
     foreach (Listing::pages() as $pageName) {
         if (empty($pageName)) {
             continue;
         }
         $wikiPage = Factory::Wiki($pageName);
         // Only pages that are readable and not hidden are indexed
         $isIndexable = $wikiPage->isReadable() && !$wikiPage->isHidden();
         if (!$isIndexable) {
             continue;
         }
         $document = new LuceneDoc();
         $document->addField(Field::Text('title', $wikiPage->title()));
         // The URL identifies the document in the search results
         $document->addField(Field::Text('url', $wikiPage->uri()));
         // Index the rendered page with its markup removed
         $plainText = strip_tags($wikiPage->render());
         $document->addField(Field::UnStored('contents', $plainText));
         // Register the document in the index
         $luceneIndex->addDocument($document);
     }
     $luceneIndex->optimize();
 }

コード例 #5

0

ファイルを表示

ファイル: HTML.php プロジェクト: tonylow/skillslink

 /**
  * Object constructor
  *
  * Loads the HTML into a DOMDocument, adds the "title", named meta tags and
  * "body" as document fields, and collects link targets: href values of
  * a/area elements into $this->_links and href values of head link elements
  * into $this->_headerLinks (both de-duplicated).
  *
  * @param string  $data         HTML string (may be an HTML fragment), or the path
  *                              of a file to read when $isFile is true
  * @param boolean $isFile       when true, $data is read with file_get_contents()
  * @param boolean $storeContent when true the body is added with Field::Text,
  *                              otherwise with Field::UnStored
  * @param string  $defaultEncoding   HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
  */
 private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
 {
     $this->_doc = new \DOMDocument();
     $this->_doc->substituteEntities = true;
     if ($isFile) {
         // NOTE(review): file_get_contents() returns false on failure; that case
         // is not handled here.
         $htmlData = file_get_contents($data);
     } else {
         $htmlData = $data;
     }
     // NOTE(review): ErrorHandler::start()/stop() presumably capture the warnings
     // loadHTML() emits for malformed markup - confirm against ErrorHandler.
     ErrorHandler::start(E_WARNING);
     $this->_doc->loadHTML($htmlData);
     ErrorHandler::stop();
     if ($this->_doc->encoding === null) {
         // Document encoding is not recognized
         /** @todo improve HTML vs HTML fragment recognition */
         // Only a bare "<html>" tag (case-insensitive, no attributes) matches here;
         // anything else is handled as a fragment.
         if (preg_match('/<html>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
             // It's an HTML document
             // Add additional HEAD section and recognize document
             // Offset just past the matched "<html>" tag
             $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);
             ErrorHandler::start(E_WARNING);
             // Re-load with both halves converted from $defaultEncoding to UTF-8
             // and a UTF-8 Content-type meta tag injected right after "<html>".
             $this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset)) . '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>' . iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset)));
             ErrorHandler::stop();
             // Remove additional HEAD section
             $xpath = new \DOMXPath($this->_doc);
             // NOTE(review): item(0) is used without a null check.
             $head = $xpath->query('/html/head')->item(0);
             $head->parentNode->removeChild($head);
         } else {
             // It's an HTML fragment
             // Wrap the converted fragment in a minimal UTF-8 document
             ErrorHandler::start(E_WARNING);
             $this->_doc->loadHTML('<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>' . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData) . '</body></html>');
             ErrorHandler::stop();
         }
     }
     /** @todo Add correction of wrong HTML encoding recognition processing
      * The case is:
      * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
      * even $this->_doc->encoding demonstrates another recognized encoding
      */
     $xpath = new \DOMXPath($this->_doc);
     $docTitle = '';
     $titleNodes = $xpath->query('/html/head/title');
     foreach ($titleNodes as $titleNode) {
         // title should always have only one entry, but we process all nodeset entries
         $docTitle .= $titleNode->nodeValue . ' ';
     }
     $this->addField(Field::Text('title', $docTitle, 'UTF-8'));
     // Every <meta name="..."> in the head becomes a text field named after it
     $metaNodes = $xpath->query('/html/head/meta[@name]');
     foreach ($metaNodes as $metaNode) {
         $this->addField(Field::Text($metaNode->getAttribute('name'), $metaNode->getAttribute('content'), 'UTF-8'));
     }
     $docBody = '';
     $bodyNodes = $xpath->query('/html/body');
     foreach ($bodyNodes as $bodyNode) {
         // body should always have only one entry, but we process all nodeset entries
         // NOTE(review): _retrieveNodeText() presumably appends the node text to
         // $docBody by reference - confirm against its definition.
         $this->_retrieveNodeText($bodyNode, $docBody);
     }
     if ($storeContent) {
         $this->addField(Field::Text('body', $docBody, 'UTF-8'));
     } else {
         $this->addField(Field::UnStored('body', $docBody, 'UTF-8'));
     }
     // Collect non-empty hrefs of <a> elements; rel="nofollow" links are skipped
     // when self::$_excludeNoFollowLinks is set.
     $linkNodes = $this->_doc->getElementsByTagName('a');
     foreach ($linkNodes as $linkNode) {
         if (($href = $linkNode->getAttribute('href')) != '' && (!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow')) {
             $this->_links[] = $href;
         }
     }
     // Same rule for <area> elements
     $linkNodes = $this->_doc->getElementsByTagName('area');
     foreach ($linkNodes as $linkNode) {
         if (($href = $linkNode->getAttribute('href')) != '' && (!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow')) {
             $this->_links[] = $href;
         }
     }
     $this->_links = array_unique($this->_links);
     // Collect non-empty hrefs of <link> elements in the head
     $linkNodes = $xpath->query('/html/head/link');
     foreach ($linkNodes as $linkNode) {
         if (($href = $linkNode->getAttribute('href')) != '') {
             $this->_headerLinks[] = $href;
         }
     }
     $this->_headerLinks = array_unique($this->_headerLinks);
 }

コード例 #6

0

ファイルを表示

ファイル: Pptx.php プロジェクト: jclausen/ZendSearch

 /**
  * Builds the Lucene document from a PowerPoint (.pptx) file.
  *
  * Follows the package relationships to the office document, loads every
  * slide and (when present) its notes, and concatenates the text of all
  * a:t elements in slide order into the "body" field. The file name, the
  * meta data returned by extractMetaData() and a "title" (the file name when
  * the meta data has none) are added as text fields.
  *
  * @param string  $fileName     path of the .pptx file to open
  * @param boolean $storeContent when true the body is added with Field::Text,
  *                              otherwise with Field::UnStored
  * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException when the ZipArchive class is not loaded
  * @throws \ZendSearch\Lucene\Exception\RuntimeException when the archive cannot be opened or
  *         its root relationships part is missing or not parsable
  */
 private function __construct($fileName, $storeContent)
 {
     if (!class_exists('ZipArchive', false)) {
         throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
     }
     // Document data holders
     $slides = array();
     $slideNotes = array();
     $documentBody = array();
     // Open the package. open() returns true on success and an error code
     // otherwise, so the result has to be checked before reading entries.
     $package = new \ZipArchive();
     if ($package->open($fileName) !== true) {
         throw new RuntimeException('Invalid archive or corrupted .pptx file.');
     }
     // Loads one XML part of the package; false when the entry is missing or
     // cannot be parsed, so callers can skip it instead of dereferencing false.
     $loadXml = function ($path) use ($package) {
         $xml = $package->getFromName($path);
         return $xml === false ? false : simplexml_load_string($xml);
     };
     // Read relations and search for officeDocument
     $relations = $loadXml('_rels/.rels');
     if ($relations === false) {
         $package->close();
         throw new RuntimeException('Invalid archive or corrupted .pptx file.');
     }
     foreach ($relations->Relationship as $rel) {
         if ($rel["Type"] == AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
             // Found office document! Search for slides...
             $documentDir = dirname($rel["Target"]);
             $slideRelations = $loadXml($this->absoluteZipPath($documentDir . "/_rels/" . basename($rel["Target"]) . ".rels"));
             if ($slideRelations === false) {
                 // No readable relations for the presentation: nothing to extract
                 break;
             }
             foreach ($slideRelations->Relationship as $slideRel) {
                 if ($slideRel["Type"] == self::SCHEMA_SLIDERELATION) {
                     // Found slide! Slides are keyed by the numeric part of their relationship id
                     $slideId = str_replace('rId', '', (string) $slideRel["Id"]);
                     $slideDir = $documentDir . "/" . dirname($slideRel["Target"]);
                     $slide = $loadXml($this->absoluteZipPath($slideDir . "/" . basename($slideRel["Target"])));
                     if ($slide === false) {
                         // Unreadable slide part: skip it rather than fail on it later
                         continue;
                     }
                     $slides[$slideId] = $slide;
                     // Search for slide notes
                     $slideNotesRelations = $loadXml($this->absoluteZipPath($slideDir . "/_rels/" . basename($slideRel["Target"]) . ".rels"));
                     if ($slideNotesRelations === false) {
                         continue;
                     }
                     foreach ($slideNotesRelations->Relationship as $slideNoteRel) {
                         if ($slideNoteRel["Type"] == self::SCHEMA_SLIDENOTESRELATION) {
                             // Found slide notes! Only the first notes relation is used
                             $notes = $loadXml($this->absoluteZipPath($slideDir . "/" . dirname($slideNoteRel["Target"]) . "/" . basename($slideNoteRel["Target"])));
                             if ($notes !== false) {
                                 $slideNotes[$slideId] = $notes;
                             }
                             break;
                         }
                     }
                 }
             }
             break;
         }
     }
     // Sort slides
     ksort($slides);
     ksort($slideNotes);
     // Extract contents from slides
     foreach ($slides as $slideKey => $slide) {
         // Register namespaces
         $slide->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML);
         $slide->registerXPathNamespace("a", self::SCHEMA_DRAWINGML);
         // Fetch all text
         $textElements = $slide->xpath('//a:t');
         foreach ($textElements as $textElement) {
             $documentBody[] = (string) $textElement;
         }
         // Extract contents from slide notes
         if (isset($slideNotes[$slideKey])) {
             // Fetch slide note
             $slideNote = $slideNotes[$slideKey];
             // Register namespaces
             $slideNote->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML);
             $slideNote->registerXPathNamespace("a", self::SCHEMA_DRAWINGML);
             // Fetch all text
             $textElements = $slideNote->xpath('//a:t');
             foreach ($textElements as $textElement) {
                 $documentBody[] = (string) $textElement;
             }
         }
     }
     // Read core properties
     $coreProperties = $this->extractMetaData($package);
     // Close file
     $package->close();
     // Store filename
     $this->addField(Field::Text('filename', $fileName, 'UTF-8'));
     // Store contents
     if ($storeContent) {
         $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
     } else {
         $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
     }
     // Store meta data properties
     foreach ($coreProperties as $key => $value) {
         $this->addField(Field::Text($key, $value, 'UTF-8'));
     }
     // Store title (if not present in meta data)
     if (!isset($coreProperties['title'])) {
         $this->addField(Field::Text('title', $fileName, 'UTF-8'));
     }
 }

コード例 #7

0

ファイルを表示

ファイル: Xlsx.php プロジェクト: ntvis/search_lucene

 /**
  * Builds the Lucene document from an Excel (.xlsx) file.
  *
  * Follows the package relationships to the workbook, loads the shared
  * strings table and all worksheets, and adds the value of every cell (in
  * worksheet key order) to the "body" field, joined by spaces. The file
  * name, the meta data returned by extractMetaData() and a "title" (the file
  * name when the meta data has none) are added as text fields.
  *
  * @param string  $fileName     path of the .xlsx file to open
  * @param boolean $storeContent when true the body is added with Field::Text,
  *                              otherwise with Field::UnStored
  * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException when the ZipArchive class is not loaded
  * @throws \ZendSearch\Lucene\Exception\RuntimeException when the archive cannot be opened or a
  *         required relationships part is missing or not parsable
  */
 private function __construct($fileName, $storeContent)
 {
     if (!class_exists('ZipArchive', false)) {
         throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
     }
     // Document data holders
     $sharedStrings = array();
     $worksheets = array();
     $documentBody = array();
     // Open the package. open() returns true on success and an error code
     // otherwise, so the result has to be checked before reading entries.
     $package = new \ZipArchive();
     if ($package->open($fileName) !== true) {
         throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
     }
     // Read relations and search for officeDocument
     $relationsXml = $package->getFromName('_rels/.rels');
     if ($relationsXml === false) {
         $package->close();
         throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
     }
     $relations = XmlSecurity::scan($relationsXml);
     if ($relations === false) {
         $package->close();
         throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
     }
     foreach ($relations->Relationship as $rel) {
         if ($rel["Type"] == AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
             // Found office document! Read relations for workbook...
             $workbookDir = dirname($rel["Target"]);
             $workbookRelationsXml = $package->getFromName($this->absoluteZipPath($workbookDir . "/_rels/" . basename($rel["Target"]) . ".rels"));
             $workbookRelations = $workbookRelationsXml === false ? false : XmlSecurity::scan($workbookRelationsXml);
             if ($workbookRelations === false) {
                 // Without this guard the method call below would be made on false
                 $package->close();
                 throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
             }
             $workbookRelations->registerXPathNamespace("rel", AbstractOpenXML::SCHEMA_RELATIONSHIP);
             // Read shared strings. The relationship is looked up first, because
             // indexing [0] on an empty xpath result is an error.
             $sharedStringsRelation = $workbookRelations->xpath("rel:Relationship[@Type='" . self::SCHEMA_SHAREDSTRINGS . "']");
             if (!empty($sharedStringsRelation)) {
                 $sharedStringsPath = (string) $sharedStringsRelation[0]['Target'];
                 $sharedStringsXml = $package->getFromName($this->absoluteZipPath($workbookDir . "/" . $sharedStringsPath));
                 $xmlStrings = $sharedStringsXml === false ? false : XmlSecurity::scan($sharedStringsXml);
                 if ($xmlStrings !== false && isset($xmlStrings->si)) {
                     foreach ($xmlStrings->si as $val) {
                         if (isset($val->t)) {
                             $sharedStrings[] = (string) $val->t;
                         } elseif (isset($val->r)) {
                             $sharedStrings[] = $this->_parseRichText($val);
                         } else {
                             // Cells refer to shared strings by position, so an
                             // entry without text must still occupy its slot.
                             $sharedStrings[] = '';
                         }
                     }
                 }
             }
             // Loop relations for workbook and extract worksheets...
             foreach ($workbookRelations->Relationship as $workbookRelation) {
                 if ($workbookRelation["Type"] == self::SCHEMA_WORKSHEETRELATION) {
                     $worksheetXml = $package->getFromName($this->absoluteZipPath($workbookDir . "/" . dirname($workbookRelation["Target"]) . "/" . basename($workbookRelation["Target"])));
                     $worksheet = $worksheetXml === false ? false : XmlSecurity::scan($worksheetXml);
                     if ($worksheet !== false) {
                         // Worksheets are keyed by the numeric part of their relationship id
                         $worksheets[str_replace('rId', '', (string) $workbookRelation["Id"])] = $worksheet;
                     }
                 }
             }
             break;
         }
     }
     // Sort worksheets
     ksort($worksheets);
     // Extract contents from worksheets
     foreach ($worksheets as $worksheet) {
         foreach ($worksheet->sheetData->row as $row) {
             foreach ($row->c as $c) {
                 // Determine data type
                 $dataType = (string) $c["t"];
                 switch ($dataType) {
                     case "s":
                         // Value is a shared string, referenced by its index
                         $value = '';
                         if ((string) $c->v != '') {
                             $stringIndex = (int) (string) $c->v;
                             if (isset($sharedStrings[$stringIndex])) {
                                 $value = $sharedStrings[$stringIndex];
                             }
                         }
                         break;
                     case "b":
                         // Value is boolean
                         $value = (string) $c->v;
                         if ($value == '0') {
                             $value = false;
                         } elseif ($value == '1') {
                             $value = true;
                         } else {
                             $value = (bool) $c->v;
                         }
                         break;
                     case "inlineStr":
                         // Value is rich text inline
                         $value = $this->_parseRichText($c->is);
                         break;
                     case "e":
                         // Value is an error message
                         $value = (string) $c->v;
                         break;
                     default:
                         // Value is a string
                         $value = (string) $c->v;
                         // Check for numeric values ("s" cells never reach this branch)
                         if (is_numeric($value)) {
                             if ($value == (int) $value) {
                                 $value = (int) $value;
                             } elseif ($value == (float) $value) {
                                 $value = (float) $value;
                             }
                         }
                 }
                 $documentBody[] = $value;
             }
         }
     }
     // Read core properties
     $coreProperties = $this->extractMetaData($package);
     // Close file
     $package->close();
     // Store filename
     $this->addField(Field::Text('filename', $fileName, 'UTF-8'));
     // Store contents
     if ($storeContent) {
         $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
     } else {
         $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
     }
     // Store meta data properties
     foreach ($coreProperties as $key => $value) {
         $this->addField(Field::Text($key, $value, 'UTF-8'));
     }
     // Store title (if not present in meta data)
     if (!isset($coreProperties['title'])) {
         $this->addField(Field::Text('title', $fileName, 'UTF-8'));
     }
 }

コード例 #8

0

ファイルを表示

ファイル: Docx.php プロジェクト: ntvis/search_lucene

 /**
  * Builds the Lucene document from a Word (.docx) file.
  *
  * Follows the package relationships to the office document and concatenates
  * the text runs (w:t) of every body paragraph into the "body" field; line
  * breaks (w:br) and paragraph ends become single spaces. The file name, the
  * meta data returned by extractMetaData() and a "title" (the file name when
  * the meta data has none) are added as text fields.
  *
  * @param string  $fileName     path of the .docx file to open
  * @param boolean $storeContent when true the body is added with Field::Text,
  *                              otherwise with Field::UnStored
  * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException when the ZipArchive class is not loaded
  * @throws \ZendSearch\Lucene\Exception\RuntimeException when the archive cannot be opened or a
  *         required part is missing or not parsable
  */
 private function __construct($fileName, $storeContent)
 {
     if (!class_exists('ZipArchive', false)) {
         throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
     }
     // Document data holders
     $documentBody = array();
     // Open the package. open() returns true on success and an error code
     // otherwise, so the result has to be checked before reading entries.
     $package = new \ZipArchive();
     if ($package->open($fileName) !== true) {
         throw new RuntimeException('Invalid archive or corrupted .docx file.');
     }
     // Read relations and search for officeDocument
     $relationsXml = $package->getFromName('_rels/.rels');
     if ($relationsXml === false) {
         $package->close();
         throw new RuntimeException('Invalid archive or corrupted .docx file.');
     }
     $relations = XMLSecurity::scan($relationsXml);
     if ($relations === false) {
         $package->close();
         throw new RuntimeException('Invalid archive or corrupted .docx file.');
     }
     foreach ($relations->Relationship as $rel) {
         if ($rel["Type"] == AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
             // Found office document! Read in contents...
             $contentsXml = $package->getFromName($this->absoluteZipPath(dirname($rel['Target']) . '/' . basename($rel['Target'])));
             $contents = $contentsXml === false ? false : XMLSecurity::scan($contentsXml);
             if ($contents === false) {
                 // Without this guard the method call below would be made on false
                 $package->close();
                 throw new RuntimeException('Invalid archive or corrupted .docx file.');
             }
             $contents->registerXPathNamespace('w', self::SCHEMA_WORDPROCESSINGML);
             $paragraphs = $contents->xpath('//w:body/w:p');
             foreach ($paragraphs as $paragraph) {
                 $runs = $paragraph->xpath('.//w:r/*[name() = "w:t" or name() = "w:br"]');
                 if ($runs === false) {
                     // Paragraph doesn't contain any text or breaks
                     continue;
                 }
                 foreach ($runs as $run) {
                     if ($run->getName() == 'br') {
                         // Break element
                         $documentBody[] = ' ';
                     } else {
                         $documentBody[] = (string) $run;
                     }
                 }
                 // Add space after each paragraph. So they are not bound together.
                 $documentBody[] = ' ';
             }
             break;
         }
     }
     // Read core properties
     $coreProperties = $this->extractMetaData($package);
     // Close file
     $package->close();
     // Store filename
     $this->addField(Field::Text('filename', $fileName, 'UTF-8'));
     // Store contents
     if ($storeContent) {
         $this->addField(Field::Text('body', implode('', $documentBody), 'UTF-8'));
     } else {
         $this->addField(Field::UnStored('body', implode('', $documentBody), 'UTF-8'));
     }
     // Store meta data properties
     foreach ($coreProperties as $key => $value) {
         $this->addField(Field::Text($key, $value, 'UTF-8'));
     }
     // Store title (if not present in meta data)
     if (!isset($coreProperties['title'])) {
         $this->addField(Field::Text('title', $fileName, 'UTF-8'));
     }
 }

コード例 #9

0

ファイルを表示

ファイル: Job.php プロジェクト: arossokha/symfonytest

 /**
  * Replaces this job's entry in the Lucene index.
  *
  * Every hit for "pk:<id>" is deleted first. Expired or non-activated jobs
  * are then left out of the index; all other jobs are re-added with their
  * primary key as a keyword field and position, company, location and
  * description as Field::UnStored fields, after which the index is
  * committed.
  *
  * @ORM\PostPersist
  */
 public function updateLuceneIndex()
 {
     $luceneIndex = self::getLuceneIndex();
     // Drop whatever is currently indexed for this job
     $staleHits = $luceneIndex->find('pk:' . $this->getId());
     foreach ($staleHits as $staleHit) {
         $luceneIndex->delete($staleHit->id);
     }
     // Only jobs that are both unexpired and activated are indexed
     $isIndexable = !$this->isExpired() && $this->getIsActivated();
     if (!$isIndexable) {
         return;
     }
     $document = new Document();
     // The primary key identifies the job in the search results
     $document->addField(Document\Field::Keyword('pk', $this->getId()));
     // Field name => getter providing its value
     $unstoredFields = array(
         'position' => 'getPosition',
         'company' => 'getCompany',
         'location' => 'getLocation',
         'description' => 'getDescription',
     );
     foreach ($unstoredFields as $fieldName => $getter) {
         $document->addField(Document\Field::UnStored($fieldName, $this->{$getter}(), 'utf-8'));
     }
     // Add the job to the index
     $luceneIndex->addDocument($document);
     $luceneIndex->commit();
 }

コード例 #10

0

ファイルを表示

ファイル: indexer.php プロジェクト: ntvis/search_lucene

 /**
  * Indexes a single file.
  *
  * For files on local storage the content is indexed according to mime type
  * (HTML, any text/* type, TeX, PDF) or file extension (docx, xlsx, pptx,
  * odt, ods). For files on non-local storage only the fileId, path, mtime,
  * size and mimetype fields are indexed.
  *
  * @param File $file the file to be indexed
  * @param bool $commit passed through to the index's updateFile()
  *
  * @return bool true whenever the method returns; unsupported local files raise an exception instead
  * @throws NotIndexedException when a local file has an unsupported type
  */
 public function indexFile(File $file, $commit = true)
 {
     // The document type is chosen by mime type first, then by file extension
     $mimeType = $file->getMimeType();
     $extension = strtolower(pathinfo($file->getName(), PATHINFO_EXTENSION));
     // Start from a plain lucene document
     $luceneDoc = new Document();
     // Content is indexed for local files only
     $fileStorage = $file->getStorage();
     if ($fileStorage->isLocal()) {
         $localPath = $fileStorage->getLocalFile($file->getInternalPath());
         // Prefer the specialised lucene document types where available
         if ('text/html' === $mimeType) {
             //TODO could be indexed, even if not local
             $luceneDoc = HTML::loadHTML($file->getContent());
         } elseif ('text/' === substr($mimeType, 0, 5) || 'application/x-tex' === $mimeType) {
             $text = $file->getContent();
             if ($text != '') {
                 $luceneDoc->addField(Document\Field::UnStored('body', $text));
             }
         } elseif ('application/pdf' === $mimeType) {
             $luceneDoc = Pdf::loadPdf($file->getContent());
         } elseif ($extension === 'docx') {
             // the zend classes only understand docx and not doc files
             $luceneDoc = Document\Docx::loadDocxFile($localPath);
         } elseif ($extension === 'xlsx') {
             $luceneDoc = Document\Xlsx::loadXlsxFile($localPath);
         } elseif ($extension === 'pptx') {
             $luceneDoc = Document\Pptx::loadPptxFile($localPath);
         } elseif ($extension === 'odt') {
             $luceneDoc = Odt::loadOdtFile($localPath);
         } elseif ($extension === 'ods') {
             $luceneDoc = Ods::loadOdsFile($localPath);
         } else {
             throw new NotIndexedException();
         }
     }
     // The filecache id is the unique id used for lookups when deleting
     $luceneDoc->addField(Document\Field::Keyword('fileId', $file->getId()));
     // The document path is shown in the search results
     $luceneDoc->addField(Document\Field::Text('path', $file->getPath(), 'UTF-8'));
     $luceneDoc->addField(Document\Field::unIndexed('mtime', $file->getMTime()));
     $luceneDoc->addField(Document\Field::unIndexed('size', $file->getSize()));
     $luceneDoc->addField(Document\Field::unIndexed('mimetype', $mimeType));
     $this->index->updateFile($luceneDoc, $file->getId(), $commit);
     return true;
 }

コード例 #11

-1

ファイルを表示

ファイル: Indexer.php プロジェクト: highestgoodlikewater/InformationRetrieval

 /**
  * Indexes the data found in the JSON files.
  *
  * Every regular file in "data/json" (next to this source file) is decoded
  * and a Lucene index named after the file (its name minus the last five
  * characters) is created under "data/index". Each decoded entry becomes one
  * document with a "url" text field taken from the entry's title and a
  * "contents" field (Field::UnStored) taken from the entry's text.
  *
  * Files that cannot be read or decoded are skipped before the index is
  * created, as are entries lacking a title or text.
  */
 public function index()
 {
     $dir = realpath(dirname(__FILE__)) . DIRECTORY_SEPARATOR . "data" . DIRECTORY_SEPARATOR;
     $jsonDir = $dir . "json";
     $indexDir = $dir . "index";
     // Read the JSON files; scandir() returns false when the directory is unreadable
     $files = scandir($jsonDir);
     if ($files === false) {
         return;
     }
     foreach ($files as $file) {
         if ($file == '.' || $file == '..') {
             continue;
         }
         $jsonPath = $jsonDir . DIRECTORY_SEPARATOR . $file;
         // Only regular files are processed
         if (!is_file($jsonPath)) {
             continue;
         }
         $rawJson = file_get_contents($jsonPath);
         $json = $rawJson === false ? null : json_decode($rawJson);
         // json_decode() returns null for invalid JSON and may return a scalar;
         // neither can be iterated as a list of entries.
         if (!is_array($json) && !is_object($json)) {
             continue;
         }
         // NOTE(review): dropping five characters assumes a ".json" extension.
         $indexName = substr($file, 0, -5);
         // Create the index
         $index = Lucene\Lucene::create($indexDir . DIRECTORY_SEPARATOR . $indexName);
         // Create one document per entry and define the fields to index
         foreach ($json as $entry) {
             if (!is_object($entry) || !isset($entry->title, $entry->text)) {
                 continue;
             }
             $doc = new Lucene\Document();
             $doc->addField(Lucene\Document\Field::Text('url', $entry->title));
             $doc->addField(Lucene\Document\Field::UnStored('contents', $entry->text));
             $index->addDocument($doc);
         }
     }
 }

PHP ZendSearch\Lucene\Document Field::UnStoredの例