/**
 * Extracts the body text of a Microsoft Word DOCX document.
 *
 * An empty string is returned when the zip extension is not loaded, when
 * $filename does not exist, or when loading the document throws.
 *
 * @param String $filename Full filesystem path to the file to process.
 * @return String Text extracted from the file, or '' on failure.
 */
public static function extract($filename)
{
    // Both preconditions must hold before the loader is attempted.
    $canLoad = extension_loaded('zip') && file_exists($filename);
    if (!$canLoad) {
        return '';
    }

    try {
        // Second argument true: same flag the original passed to the loader.
        $document = Zend_Search_Lucene_Document_Docx::loadDocxFile($filename, true);
    } catch (Exception $error) {
        return '';
    }

    return $document->body;
}
/**
 * Builds the Lucene index from the files found in the input directory.
 *
 * CLI usage: php index.php docx index
 *
 * The index directory (APP_PATH/INDEX_DIR) and the input directory
 * (APP_PATH/INPUT_DIR) are created when they do not exist. Every regular
 * file in the input directory is loaded as a DOCX document and added to the
 * index obtained from self::create(). Progress is echoed to standard output.
 *
 * @return void
 */
public function index()
{
    $indexDir = APP_PATH . '/' . self::INDEX_DIR;
    is_dir($indexDir) || mkdir($indexDir, 0777, true);
    $index = self::create($indexDir);

    $inputDir = APP_PATH . '/' . self::INPUT_DIR;
    is_dir($inputDir) || mkdir($inputDir, 0777, true);

    echo sprintf("Create index for %s \n\n", $inputDir);

    foreach (new DirectoryIterator($inputDir) as $fileInfo) {
        // Skips "." / ".." as well as sub-directories, which the DOCX
        // loader cannot open.
        if (!$fileInfo->isFile()) {
            continue;
        }

        echo sprintf("File : %s \n", $fileInfo->getFilename());

        // getPathname() always yields directory + separator + file name, so
        // the result no longer depends on INPUT_DIR ending with a slash.
        $doc = Zend_Search_Lucene_Document_Docx::loadDocxFile($fileInfo->getPathname());
        $index->addDocument($doc);
    }

    echo "\n###Done###\n";
}
/**
 * Verifies that a DOCX file is loaded into a Docx Lucene document with the
 * expected title, description and a non-empty body, and that loading a
 * missing file raises a "is not readable" document exception.
 *
 * The test is skipped when the ZipArchive class is unavailable.
 */
public function testDocx()
{
    if (!class_exists('ZipArchive')) {
        $this->markTestSkipped('ZipArchive class (Zip extension) is not loaded');
    }

    $docxDocument = Zend_Search_Lucene_Document_Docx::loadDocxFile(
        dirname(__FILE__) . '/_openXmlDocuments/test.docx',
        true
    );

    $this->assertTrue($docxDocument instanceof Zend_Search_Lucene_Document_Docx);

    // Expected value goes first so that failure messages read correctly.
    $this->assertEquals('Test document', $docxDocument->getFieldValue('title'));
    $this->assertEquals(
        'This is a test document which can be used to demonstrate something.',
        $docxDocument->getFieldValue('description')
    );
    $this->assertTrue($docxDocument->getFieldValue('body') != '');

    try {
        Zend_Search_Lucene_Document_Docx::loadDocxFile(
            dirname(__FILE__) . '/_openXmlDocuments/dummy.docx',
            true
        );

        $this->fail('File not readable exception is expected.');
    } catch (Zend_Search_Lucene_Document_Exception $e) {
        if (strpos($e->getMessage(), 'is not readable') === false) {
            // Passthrough exception
            throw $e;
        }
    }
}
/**
 * Index a file.
 *
 * Builds a Lucene document for $path (with extracted content when a local
 * copy of the file is available and its type is supported) and hands it to
 * Lucene::updateFile().
 *
 * @author Jörn Dreyer <*****@*****.**>
 *
 * @param string      $path the path of the file
 * @param string|null $user user whose "/<user>/files" view is used; when null
 *                          the current filesystem view and the user returned
 *                          by \OCP\User::getUser() are used
 *
 * @return bool true when the file was indexed or no longer exists; false when
 *              the path is invalid or empty, the view cannot be resolved, or
 *              no mimetype is known for the file
 */
public static function indexFile($path = '', $user = null)
{
    // Previously a bare "return;" (null); the documented contract is bool.
    if (!Filesystem::isValidPath($path)) {
        return false;
    }

    if ($path === '') {
        //ignore the empty path element
        return false;
    }

    if (is_null($user)) {
        $view = Filesystem::getView();
        $user = \OCP\User::getUser();
    } else {
        $view = new \OC\Files\View('/' . $user . '/files');
    }

    if (!$view) {
        Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
        return false;
    }

    if (!$view->file_exists($path)) {
        Util::writeLog('search_lucene', 'file vanished, ignoring', Util::DEBUG);
        return true;
    }

    $root = $view->getRoot();
    $pk = md5($root . $path);

    // the cache already knows mime and other basic stuff
    $data = $view->getFileInfo($path);

    if (!isset($data['mimetype'])) {
        Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
        return false;
    }

    $mimeType = $data['mimetype'];

    // initialize plain lucene document
    $doc = new \Zend_Search_Lucene_Document();

    // index content for local files only
    $localFile = $view->getLocalFile($path);

    if ($localFile) {
        // The office formats are told apart by file extension rather than
        // mimetype: the zend classes only understand docx and not doc files.
        // FIXME distinguish doc and docx, xls and xlsx, ppt and pptx, in oc core mimetype helper ...
        $lowerName = strtolower($data['name']);

        //try to use special lucene document types
        if ('text/plain' === $mimeType) {
            $body = $view->file_get_contents($path);

            if ($body != '') {
                $doc->addField(\Zend_Search_Lucene_Field::UnStored('body', $body));
            }
        } elseif ('text/html' === $mimeType) {
            //TODO could be indexed, even if not local
            $doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
        } elseif ('application/pdf' === $mimeType) {
            $doc = Pdf::loadPdf($view->file_get_contents($path));
        } elseif (substr($lowerName, -5) === '.docx') {
            $doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($localFile);
        } elseif (substr($lowerName, -5) === '.xlsx') {
            $doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($localFile);
        } elseif (substr($lowerName, -5) === '.pptx') {
            $doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($localFile);
        } elseif (substr($lowerName, -4) === '.odt') {
            $doc = Odt::loadOdtFile($localFile);
        } elseif (substr($lowerName, -4) === '.ods') {
            $doc = Ods::loadOdsFile($localFile);
        }
    }

    // Store filecache id as unique id to lookup by when deleting
    $doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));

    // Store filename
    $doc->addField(\Zend_Search_Lucene_Field::Text('filename', $data['name'], 'UTF-8'));

    // Store document path to identify it in the search results
    $doc->addField(\Zend_Search_Lucene_Field::Text('path', $path, 'UTF-8'));

    $doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
    $doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimeType));

    Lucene::updateFile($doc, $path, $user);

    return true;
}
/**
 * Loads the DocX file at $path as a Lucene document and adds a "filename"
 * text field holding the file's base name.
 *
 * @param string $path Path to the DocX document.
 * @return Zend_Search_Lucene_Document
 */
function createDocXDocument($path)
{
    $document = Zend_Search_Lucene_Document_Docx::loadDocxFile($path);

    // Only the last path component is recorded, not the full path.
    $filenameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $document->addField($filenameField);

    return $document;
}
/**
 * Builds the Lucene document(s) describing a node and adds them to $index.
 *
 * A "shared" document carrying the node's url, path, basename, size,
 * modification date, mime, serialized metadata and indexable meta fields is
 * always created. When the node declares user-scoped indexable meta keys and
 * a user is logged in, an additional "user" scoped document is added first.
 * When content parsing is enabled and the node is not larger than
 * PARSE_CONTENT_MAX_SIZE, HTML/DOCX/PPTX/XLSX files are loaded through the
 * matching Zend document class and the result of extractIndexableContent()
 * is stored as the "body" field.
 *
 * @param AJXP_Node $ajxpNode
 * @param Zend_Search_Lucene_Interface $index
 * @throws Exception when the document loader yields null
 * @return Zend_Search_Lucene_Document the shared document that was indexed
 */
public function createIndexedDocument($ajxpNode, &$index)
{
    if (!empty($this->metaFields)) {
        $ajxpNode->loadNodeInfo(false, false, "all");
    } else {
        $ajxpNode->loadNodeInfo();
    }

    $ext = strtolower(pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION));

    $parseContent = $this->indexContent;
    if ($parseContent && $ajxpNode->bytesize > $this->getFilteredOption("PARSE_CONTENT_MAX_SIZE")) {
        $parseContent = false;
    }

    if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_HTML")))) {
        $doc = @Zend_Search_Lucene_Document_Html::loadHTMLFile($ajxpNode->getUrl());
    } elseif ($parseContent && $ext == "docx" && class_exists("Zend_Search_Lucene_Document_Docx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Docx::loadDocxFile($realFile);
    } elseif ($parseContent && $ext == "pptx" && class_exists("Zend_Search_Lucene_Document_Pptx")) {
        // This branch used to test for "docx", which made it unreachable
        // for presentation files.
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Pptx::loadPptxFile($realFile);
    } elseif ($parseContent && $ext == "xlsx" && class_exists("Zend_Search_Lucene_Document_Xlsx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($realFile);
    } else {
        $doc = new Zend_Search_Lucene_Document();
    }

    if ($doc == null) {
        throw new Exception("Could not load document");
    }

    // The encoding belongs to the field factory (as done for $privateDoc
    // below); it used to be passed as an ignored second argument of addField().
    $encoding = SystemTextEncoding::getEncoding();

    $doc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Text("basename", basename($ajxpNode->getPath()), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_node", "yes", $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "shared"));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_modiftime", date("Ymd", $ajxpNode->ajxp_modiftime)));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_bytesize", $ajxpNode->bytesize));

    // Fall back to the file extension when the node carries no mime value.
    $ajxpMime = $ajxpNode->ajxp_mime;
    if (empty($ajxpMime)) {
        $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION)));
    } else {
        $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", $ajxpNode->ajxp_mime));
    }

    // Store a cached copy of the metadata
    $serializedMeta = base64_encode(serialize($ajxpNode->metadata));
    $doc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));

    if (isset($ajxpNode->indexableMetaKeys["shared"])) {
        foreach ($ajxpNode->indexableMetaKeys["shared"] as $sharedField) {
            if ($ajxpNode->{$sharedField}) {
                $doc->addField(Zend_Search_Lucene_Field::Keyword($sharedField, $ajxpNode->{$sharedField}));
            }
        }
    }

    foreach ($this->metaFields as $field) {
        if ($ajxpNode->{$field} != null) {
            $doc->addField(Zend_Search_Lucene_Field::Text("ajxp_meta_{$field}", $ajxpNode->{$field}, $encoding));
        }
    }

    if (isset($ajxpNode->indexableMetaKeys["user"])
        && count($ajxpNode->indexableMetaKeys["user"])
        && AuthService::usersEnabled()
        && AuthService::getLoggedUser() != null
    ) {
        // Separate document holding the meta values scoped to the logged user.
        $privateDoc = new Zend_Search_Lucene_Document();
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "user"));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_user", AuthService::getLoggedUser()->getId()));

        foreach ($ajxpNode->indexableMetaKeys["user"] as $userField) {
            if ($ajxpNode->{$userField}) {
                $privateDoc->addField(Zend_Search_Lucene_Field::Keyword($userField, $ajxpNode->{$userField}));
            }
        }

        $privateDoc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
        $index->addDocument($privateDoc);
    }

    if ($parseContent) {
        $body = $this->extractIndexableContent($ajxpNode);
        if (!empty($body)) {
            $doc->addField(Zend_Search_Lucene_Field::unStored("body", $body));
        }
    }

    $index->addDocument($doc);

    return $doc;
}
/**
 * Builds the Lucene document(s) describing a node and adds them to $index.
 *
 * A "shared" document carrying the node's url, path, basename, size,
 * modification date, mime, serialized metadata and indexable meta fields is
 * always created. When the node declares user-scoped indexable meta keys and
 * a user is logged in, an additional "user" scoped document is added first.
 *
 * When content parsing is enabled and the node is not larger than
 * PARSE_CONTENT_MAX_SIZE, the "body" field is filled from:
 *  - the raw file contents for extensions listed in PARSE_CONTENT_TXT,
 *  - the output of the UNOCONV command for doc/odt/xls/ods files,
 *  - the output of the PDFTOTEXT command for pdf files.
 * HTML/DOCX/PPTX/XLSX files are loaded through the matching Zend document
 * class.
 *
 * @param AJXP_Node $ajxpNode
 * @param Zend_Search_Lucene_Interface $index
 * @throws Exception when the document loader yields null
 * @return Zend_Search_Lucene_Document the shared document that was indexed
 */
public function createIndexedDocument($ajxpNode, &$index)
{
    $ajxpNode->loadNodeInfo();

    $ext = strtolower(pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION));

    $parseContent = $this->indexContent;
    if ($parseContent && $ajxpNode->bytesize > $this->getFilteredOption("PARSE_CONTENT_MAX_SIZE")) {
        $parseContent = false;
    }

    if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_HTML")))) {
        $doc = @Zend_Search_Lucene_Document_Html::loadHTMLFile($ajxpNode->getUrl());
    } elseif ($parseContent && $ext == "docx" && class_exists("Zend_Search_Lucene_Document_Docx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Docx::loadDocxFile($realFile);
    } elseif ($parseContent && $ext == "pptx" && class_exists("Zend_Search_Lucene_Document_Pptx")) {
        // This branch used to test for "docx", which made it unreachable
        // for presentation files.
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Pptx::loadPptxFile($realFile);
    } elseif ($parseContent && $ext == "xlsx" && class_exists("Zend_Search_Lucene_Document_Xlsx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($realFile);
    } else {
        $doc = new Zend_Search_Lucene_Document();
    }

    if ($doc == null) {
        throw new Exception("Could not load document");
    }

    // The encoding belongs to the field factory (as done for $privateDoc
    // below); it used to be passed as an ignored second argument of addField().
    $encoding = SystemTextEncoding::getEncoding();

    $doc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Text("basename", basename($ajxpNode->getPath()), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_node", "yes", $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "shared"));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_modiftime", date("Ymd", $ajxpNode->ajxp_modiftime)));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_bytesize", $ajxpNode->bytesize));

    // Fall back to the file extension when the node carries no mime value.
    $ajxpMime = $ajxpNode->ajxp_mime;
    if (empty($ajxpMime)) {
        $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION)));
    } else {
        $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", $ajxpNode->ajxp_mime));
    }

    // Store a cached copy of the metadata
    $serializedMeta = base64_encode(serialize($ajxpNode->metadata));
    $doc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));

    if (isset($ajxpNode->indexableMetaKeys["shared"])) {
        foreach ($ajxpNode->indexableMetaKeys["shared"] as $sharedField) {
            if ($ajxpNode->{$sharedField}) {
                $doc->addField(Zend_Search_Lucene_Field::Keyword($sharedField, $ajxpNode->{$sharedField}));
            }
        }
    }

    foreach ($this->metaFields as $field) {
        if ($ajxpNode->{$field} != null) {
            $doc->addField(Zend_Search_Lucene_Field::Text("ajxp_meta_{$field}", $ajxpNode->{$field}, $encoding));
        }
    }

    if (isset($ajxpNode->indexableMetaKeys["user"])
        && count($ajxpNode->indexableMetaKeys["user"])
        && AuthService::usersEnabled()
        && AuthService::getLoggedUser() != null
    ) {
        // Separate document holding the meta values scoped to the logged user.
        $privateDoc = new Zend_Search_Lucene_Document();
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "user"));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_user", AuthService::getLoggedUser()->getId()));

        foreach ($ajxpNode->indexableMetaKeys["user"] as $userField) {
            if ($ajxpNode->{$userField}) {
                $privateDoc->addField(Zend_Search_Lucene_Field::Keyword($userField, $ajxpNode->{$userField}));
            }
        }

        $privateDoc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
        $index->addDocument($privateDoc);
    }

    // Plain-text formats: index the raw file contents.
    if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_TXT")))) {
        $doc->addField(Zend_Search_Lucene_Field::unStored("body", file_get_contents($ajxpNode->getUrl())));
    }

    // Office formats: convert through the configured unoconv command.
    $unoconv = $this->getFilteredOption("UNOCONV");
    $pipe = false;
    if ($parseContent && !empty($unoconv) && in_array($ext, array("doc", "odt", "xls", "ods"))) {
        $targetExt = "txt";
        if (in_array($ext, array("xls", "ods"))) {
            $targetExt = "csv";
        } elseif (in_array($ext, array("odp", "ppt"))) {
            // NOTE(review): unreachable as written - the enclosing condition
            // only admits doc/odt/xls/ods. Confirm whether odp/ppt were meant
            // to be accepted before enabling this path.
            $targetExt = "pdf";
            $pipe = true;
        }

        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $unoconv = "HOME=" . AJXP_Utils::getAjxpTmpDir() . " " . $unoconv . " --stdout -f {$targetExt} " . escapeshellarg($realFile);

        if ($pipe) {
            $newTarget = str_replace(".{$ext}", ".pdf", $realFile);
            // The redirect target is derived from a file name, so it must be
            // escaped before being handed to the shell.
            $unoconv .= " > " . escapeshellarg($newTarget);
            register_shutdown_function("unlink", $newTarget);
        }

        $output = array();
        exec($unoconv, $output, $return);

        if (!$pipe) {
            $out = implode("\n", $output);
            $enc = 'ISO-8859-1';
            $asciiString = iconv($enc, 'ASCII//TRANSLIT//IGNORE', $out);
            $doc->addField(Zend_Search_Lucene_Field::unStored("body", $asciiString));
        } else {
            // The intermediate PDF is picked up by the pdftotext step below.
            $ext = "pdf";
        }
    }

    // PDF files: extract text through the configured pdftotext command.
    $pdftotext = $this->getFilteredOption("PDFTOTEXT");
    if ($parseContent && !empty($pdftotext) && in_array($ext, array("pdf"))) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        if ($pipe && isset($newTarget) && is_file($newTarget)) {
            $realFile = $newTarget;
        }

        // Trailing " -" is appended after the input file, as in the original command.
        $cmd = $pdftotext . " " . escapeshellarg($realFile) . " -";
        $output = array();
        exec($cmd, $output, $return);

        $out = implode("\n", $output);
        $enc = 'UTF8';
        $asciiString = iconv($enc, 'ASCII//TRANSLIT//IGNORE', $out);
        $doc->addField(Zend_Search_Lucene_Field::unStored("body", $asciiString));
    }

    $index->addDocument($doc);

    return $doc;
}
/**
 * Returns the text-extraction callback registered for a MIME type.
 *
 * Supported types are plain text and the OpenXML word-processing,
 * presentation and spreadsheet formats. Each callback takes a
 * FileGallery_Wrapper and returns the extracted text.
 *
 * @param string $type MIME type to look up.
 * @return Closure|null The extraction callback, or null for any other type.
 */
function get_native_handler($type)
{
    $handler = null;

    switch ($type) {
        case 'text/plain':
            $handler = function (FileGallery_Wrapper $source) {
                return $source->getContents();
            };
            break;

        case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
            $handler = function (FileGallery_Wrapper $source) {
                $loaded = Zend_Search_Lucene_Document_Docx::loadDocxFile($source->getReadableFile(), true);
                $bodyField = $loaded->getField('body');

                return $bodyField->getUtf8Value();
            };
            break;

        case 'application/vnd.openxmlformats-officedocument.presentationml.presentation':
            $handler = function (FileGallery_Wrapper $source) {
                $loaded = Zend_Search_Lucene_Document_Pptx::loadPptxFile($source->getReadableFile(), true);
                $bodyField = $loaded->getField('body');

                return $bodyField->getUtf8Value();
            };
            break;

        case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
            $handler = function (FileGallery_Wrapper $source) {
                $loaded = Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($source->getReadableFile(), true);
                $bodyField = $loaded->getField('body');

                return $bodyField->getUtf8Value();
            };
            break;
    }

    return $handler;
}
/**
 * Classifies a zip-based file as a Word, Excel or PowerPoint XML document.
 *
 * The archive's file list is scanned in order; the first entry whose name
 * starts with "word/", "xl/" or "ppt/" decides the type. For that type the
 * document is loaded, its metadata fields are stored in $this->opa_metadata
 * and its body text in $this->handle['content']. Any exception raised while
 * loading or reading fields is ignored and the type is still returned.
 *
 * @param string $ps_filepath The path to the file to analyze
 * @param string $ps_sig The signature (leading bytes) of the file; only the first two are inspected
 * @return string WORD, EXCEL or PPT depending on the detected type, or boolean false when the file is not such a document
 */
private function isWordExcelorPPTXMLdoc($ps_filepath, $ps_sig)
{
    // Anything that does not start with the "PK" marker is rejected outright.
    if (substr($ps_sig, 0, 2) != 'PK') {
        return false;
    }

    $o_unzip = new UnZipFile($ps_filepath);
    $va_list = $o_unzip->getFileList();
    if (!is_array($va_list)) {
        return false;
    }

    // entry-name prefix => (type key, loader class, loader method, metadata fields)
    $va_formats = array(
        'word/' => array('WORD', 'Zend_Search_Lucene_Document_Docx', 'loadDocxFile', array('title', 'subject', 'creator', 'created', 'modified')),
        'xl/' => array('EXCEL', 'Zend_Search_Lucene_Document_Xlsx', 'loadXlsxFile', array('title', 'creator', 'created', 'modified')),
        'ppt/' => array('PPT', 'Zend_Search_Lucene_Document_Pptx', 'loadPptxFile', array('title', 'creator', 'created', 'modified')),
    );

    foreach (array_keys($va_list) as $vs_file) {
        foreach ($va_formats as $vs_prefix => $va_format) {
            if (substr($vs_file, 0, strlen($vs_prefix)) != $vs_prefix) {
                continue;
            }

            list($vs_type, $vs_class, $vs_method, $va_fields) = $va_format;

            try {
                $o_doc = $vs_class::$vs_method($ps_filepath);

                // Collect every field first so that opa_metadata is only
                // replaced when all of them could be read.
                $va_values = array();
                foreach ($va_fields as $vs_field) {
                    $va_values[$vs_field] = $o_doc->getFieldUtf8Value($vs_field);
                }

                $this->opa_metadata = array($vs_type => $va_values);
                $this->handle['content'] = $o_doc->getFieldUtf8Value('body');
            } catch (Exception $e) {
                // noop
            }

            return $vs_type;
        }
    }

    return false;
}
/**
 * Extract the text of a document or page and add it to the Lucene index.
 *
 * For the file-based types the document at $pathFile is loaded, its
 * modification time becomes $indexValues['Key'] and its body becomes
 * $indexValues['Contents']. For the "page" type the HTML already present in
 * $indexValues['Contents'] is parsed instead. The contents are then
 * normalised (accents removed, duplicate words removed, cleaned, stop words
 * of $locale stripped) before the document is added. Unknown types leave the
 * index untouched.
 *
 * @param \Zend_Search_Lucene_Proxy $Index The Lucene index object.
 * @param string $type ['html', 'docx', 'xsls' (legacy spelling) or 'xlsx', 'pptx', 'page']
 * @param array  $indexValues Values handed to defaultAddFields(); 'Contents' is required for the "page" type.
 * @param string $locale Locale used for the stop-word lookup and setlocale().
 * @param object $obj Not used by this method.
 * @param string $pathFile The path to the document for the file-based types.
 *
 * @return \Zend_Search_Lucene_Proxy
 * @access public
 * @static
 * @author Etienne de Longeaux <*****@*****.**>
 * @since 2012-06-11
 */
public static function index(\Zend_Search_Lucene_Proxy $Index, $type, $indexValues = null, $locale = '', $obj = null, $pathFile = '')
{
    // ignore invalid characters for lucene text search
    \Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding('utf-8');
    \Zend_Search_Lucene_Analysis_Analyzer::setDefault(new \Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive());

    self::$_index = $Index;
    self::$_doc = null;

    // True when a case below loaded its document from $pathFile.
    $loaded = false;
    $fromFile = true;

    switch ($type) {
        case "html":
            self::$_doc = \Zend_Search_Lucene_Document_Html::loadHtmlFile($pathFile, false);
            $loaded = true;
            break;
        case "docx":
            self::$_doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($pathFile, false);
            $loaded = true;
            break;
        case "xsls": // legacy misspelling, kept so existing callers keep working
        case "xlsx":
            self::$_doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($pathFile, false);
            $loaded = true;
            break;
        case "pptx":
            self::$_doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($pathFile, false);
            $loaded = true;
            break;
        case "page":
            // we create a new instance of Zend_Search_Lucene_Document
            self::$_doc = \Zend_Search_Lucene_Document_Html::loadHTML($indexValues['Contents'], false);
            $loaded = true;
            $fromFile = false;
            break;
    }

    if ($loaded) {
        if ($fromFile) {
            $indexValues['Key'] = filemtime($pathFile);
        }
        $indexValues['Contents'] = self::$_doc->getFieldUtf8Value('body');
    }

    if (self::$_doc instanceof \Zend_Search_Lucene_Document) {
        // Remove all accents
        $indexValues['Contents'] = \Sfynx\ToolBundle\Util\PiStringManager::minusculesSansAccents($indexValues['Contents']);
        // Remove all duplicate words
        $indexValues['Contents'] = \Sfynx\ToolBundle\Util\PiStringManager::uniqueWord($indexValues['Contents']);
        // clean the content
        $indexValues['Contents'] = \Sfynx\ToolBundle\Util\PiStringManager::cleanContent($indexValues['Contents']);

        // Delete all stop words
        $stopWord = \Sfynx\ToolBundle\Util\PiStringManager::stopWord(strtolower($locale));
        if ($stopWord) {
            $wordsIndex = explode(' ', $indexValues['Contents']);
            $diff = array_diff($wordsIndex, $stopWord);
            $indexValues['Contents'] = implode(' ', $diff);
        }

        // If the document creation was successful then add it to our index.
        try {
            setlocale(LC_ALL, $locale);
            self::defaultAddFields($indexValues);
            self::addDocument();
        } catch (\Exception $e) {
            // Retry once under the fr_FR locale; a second failure is
            // deliberately ignored so that indexing stays best-effort.
            setlocale(LC_ALL, 'fr_FR');
            self::defaultAddFields($indexValues);
            try {
                self::addDocument();
            } catch (\Exception $e) {
            }
        }
    }

    // Return the Lucene index object.
    return self::$_index;
}