/**
 * Verifies the public properties of a field created through the
 * Zend_Search_Lucene_Field::Binary() factory.
 *
 * Every assertion passes the expected value first and the actual value
 * second, which is the order PHPUnit uses when it words a failure message.
 */
public function testBinary()
{
    $field = Zend_Search_Lucene_Field::Binary('field', 'value');

    $this->assertEquals(1, $field->boost);
    $this->assertEquals('', $field->encoding);
    $this->assertEquals(true, $field->isBinary);
    $this->assertEquals(false, $field->isIndexed);
    $this->assertEquals(true, $field->isStored);
    $this->assertEquals(false, $field->isTokenized);
    $this->assertEquals('field', $field->name);
    $this->assertEquals('value', $field->value);
}
/**
 * Populates this Lucene document from a module-provided search record.
 *
 * All textual fields are declared as UTF-8. The fields are added in a fixed
 * order: identification/ownership keywords, the context id, the searchable
 * texts, url and date, the serialized per-module payload, the library path
 * and finally any module-specific extra keywords.
 *
 * @param object     $doc               record exposing docid, documenttype, itemtype,
 *                                      contextid, title, author, contents, url and date
 * @param mixed      $data              per-module payload, stored serialized in a binary field
 * @param mixed      $course_id         course the information belongs to
 * @param mixed      $group_id          group of the originator
 * @param mixed      $user_id           originator, if any
 * @param string     $path              where to find the library calls used for access checks
 * @param array|null $additional_keyset optional name => value pairs indexed as extra keywords
 */
public function __construct(&$doc, &$data, $course_id, $group_id, $user_id, $path, $additional_keyset = null)
{
    $encoding = 'UTF-8';

    // Identification and ownership: document id, the element type that
    // manages it, the module-specific sub type, then course, group and user.
    $keywords = array(
        'docid'     => $doc->docid,
        'doctype'   => $doc->documenttype,
        'itemtype'  => $doc->itemtype,
        'course_id' => $course_id,
        'group_id'  => $group_id,
        'user_id'   => $user_id,
    );
    foreach ($keywords as $field_name => $field_value) {
        $this->addField(Zend_Search_Lucene_Field::Keyword($field_name, $field_value, $encoding));
    }

    // Context in which the information is produced/attached. Keeping it on the
    // record speeds up the access check, since that context is stable.
    $this->addField(Zend_Search_Lucene_Field::UnIndexed('context_id', $doc->contextid, $encoding));

    // The document data itself.
    $texts = array(
        'title'  => $doc->title,
        'author' => $doc->author,
    );
    foreach ($texts as $field_name => $field_value) {
        $this->addField(Zend_Search_Lucene_Field::Text($field_name, $field_value, $encoding));
    }

    $this->addField(Zend_Search_Lucene_Field::UnStored('contents', $doc->contents, $encoding));

    $stored_only = array(
        'url'  => $doc->url,
        'date' => $doc->date,
    );
    foreach ($stored_only as $field_name => $field_value) {
        $this->addField(Zend_Search_Lucene_Field::UnIndexed($field_name, $field_value, $encoding));
    }

    // Extra data supplied on a per-module basis.
    $this->addField(Zend_Search_Lucene_Field::Binary('data', serialize($data)));

    // The path tells the document where to find the library calls that check
    // access to a module or block content; the Lucene record only has to bring
    // back enough consistent information for that check to be performed.
    $this->addField(Zend_Search_Lucene_Field::UnIndexed('path', $path, $encoding));

    // No 'capabilities' field (capability set required for viewing) is added
    // here; that field is currently disabled.

    // Optional key set letting a module ask for extensible, criteria based
    // search depending on its internal needs.
    if (empty($additional_keyset)) {
        return;
    }
    foreach ($additional_keyset as $keyname => $keyvalue) {
        $this->addField(Zend_Search_Lucene_Field::Keyword($keyname, $keyvalue, $encoding));
    }
}
/**
 * Index one model.
 *
 * Three bookkeeping fields are always written ('type', 'identifier' and
 * 'uid'), followed by one field for every indexable attribute the model
 * declares. Attributes whose type matches none of the Searchable constants
 * handled below are left out of the document.
 *
 * @param   Searchable  $item        model implementing the Searchable interface
 * @param   bool        $create_new  when TRUE the index is not opened first - only used while the index is rebuilt
 * @return  Search      this instance, for method chaining
 * @throws  Kohana_User_Exception  when $item does not implement Searchable
 */
public function add($item, $create_new = FALSE)
{
    // Refuse anything that does not honour the Searchable contract.
    if (!($item instanceof Searchable)) {
        throw new Kohana_User_Exception('Invalid Object', 'Object must implement Searchable Interface');
    }

    if (!$create_new) {
        $this->open_index();
    }

    $doc = new Zend_Search_Lucene_Document();

    // Attributes the model wants indexed.
    $fields = $item->get_indexable_fields();

    // Object type: lets results be grouped or searched by type.
    $type_field = Zend_Search_Lucene_Field::Keyword('type', $item->get_type());
    $doc->addField($type_field);

    // Object id, stored but neither indexed nor tokenized. It is named
    // 'identifier' because Lucene uses an 'id' attribute internally.
    $id_field = Zend_Search_Lucene_Field::UnIndexed('identifier', $item->get_identifier());
    $doc->addField($id_field);

    // Type plus identifier: a unique key for later retrieval, e.g. deletion.
    $uid_field = Zend_Search_Lucene_Field::Keyword('uid', $item->get_unique_identifier());
    $doc->addField($uid_field);

    foreach ($fields as $field) {
        // Read the attribute value from the model.
        $value = $item->__get($field->name);

        // Decode HTML entities when the field asks for it.
        if ($field->html_decode) {
            $value = htmlspecialchars_decode($value);
        }

        // Build the Lucene field matching the declared type.
        $lucene_field = NULL;
        switch ($field->type) {
            case Searchable::KEYWORD:
                $lucene_field = Zend_Search_Lucene_Field::Keyword($field->name, $value);
                break;
            case Searchable::UNINDEXED:
                $lucene_field = Zend_Search_Lucene_Field::UnIndexed($field->name, $value);
                break;
            case Searchable::BINARY:
                $lucene_field = Zend_Search_Lucene_Field::Binary($field->name, $value);
                break;
            case Searchable::TEXT:
                $lucene_field = Zend_Search_Lucene_Field::Text($field->name, $value);
                break;
            case Searchable::UNSTORED:
                $lucene_field = Zend_Search_Lucene_Field::UnStored($field->name, $value);
                break;
        }

        if ($lucene_field !== NULL) {
            $doc->addField($lucene_field);
        }
    }

    $this->index->addDocument($doc);

    // Returned so calls can be chained.
    return $this;
}
/**
 * Builds the Lucene document describing a node and adds it to the index.
 *
 * A "shared" scope document is always created. When the node declares
 * user-scoped indexable metadata and a user is logged in, a second "user"
 * scope document is added to the index as well.
 *
 * Fixes compared with the previous revision:
 *  - the PowerPoint branch tested for the "docx" extension, so .pptx files
 *    were never parsed; it now tests for "pptx";
 *  - the system encoding was passed to addField() (which ignores it) instead
 *    of to the field factory, so node_url, node_path, basename, ajxp_node and
 *    ajxp_meta_* were created without an encoding, unlike the same fields of
 *    the private document.
 *
 * @param AJXP_Node $ajxpNode
 * @param Zend_Search_Lucene_Interface $index
 * @throws Exception when the content loader returns no document
 * @return Zend_Search_Lucene_Document the shared scope document
 */
public function createIndexedDocument($ajxpNode, &$index)
{
    // Metadata is only loaded in full ("all") when meta fields must be indexed.
    if (!empty($this->metaFields)) {
        $ajxpNode->loadNodeInfo(false, false, "all");
    } else {
        $ajxpNode->loadNodeInfo();
    }
    $ext = strtolower(pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION));

    // Content parsing is skipped for files above the configured size.
    $parseContent = $this->indexContent;
    if ($parseContent && $ajxpNode->bytesize > $this->getFilteredOption("PARSE_CONTENT_MAX_SIZE")) {
        $parseContent = false;
    }

    // Pick a format-aware loader when one is available, else an empty document.
    if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_HTML")))) {
        $doc = @Zend_Search_Lucene_Document_Html::loadHTMLFile($ajxpNode->getUrl());
    } elseif ($parseContent && $ext == "docx" && class_exists("Zend_Search_Lucene_Document_Docx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Docx::loadDocxFile($realFile);
    } elseif ($parseContent && $ext == "pptx" && class_exists("Zend_Search_Lucene_Document_Pptx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Pptx::loadPptxFile($realFile);
    } elseif ($parseContent && $ext == "xlsx" && class_exists("Zend_Search_Lucene_Document_Xlsx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($realFile);
    } else {
        $doc = new Zend_Search_Lucene_Document();
    }
    if ($doc == null) {
        throw new Exception("Could not load document");
    }

    $encoding = SystemTextEncoding::getEncoding();

    // Node location. "/" is swapped for a placeholder in node_path.
    $doc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Text("basename", basename($ajxpNode->getPath()), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_node", "yes", $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "shared"));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_modiftime", date("Ymd", $ajxpNode->ajxp_modiftime)));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_bytesize", $ajxpNode->bytesize));

    // Fall back to the (case-preserved) file extension when no mime is set.
    $ajxpMime = $ajxpNode->ajxp_mime;
    if (empty($ajxpMime)) {
        $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION)));
    } else {
        $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", $ajxpMime));
    }

    // Store a cached copy of the metadata
    $serializedMeta = base64_encode(serialize($ajxpNode->metadata));
    $doc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));

    // Shared indexable metadata: one keyword per non-empty key.
    if (isset($ajxpNode->indexableMetaKeys["shared"])) {
        foreach ($ajxpNode->indexableMetaKeys["shared"] as $sharedField) {
            if ($ajxpNode->{$sharedField}) {
                $doc->addField(Zend_Search_Lucene_Field::Keyword($sharedField, $ajxpNode->{$sharedField}));
            }
        }
    }

    // Configured meta fields are indexed as tokenized text.
    foreach ($this->metaFields as $field) {
        if ($ajxpNode->{$field} != null) {
            $doc->addField(Zend_Search_Lucene_Field::Text("ajxp_meta_{$field}", $ajxpNode->{$field}, $encoding));
        }
    }

    // User-scoped metadata goes into a separate document tagged with the user id.
    if (isset($ajxpNode->indexableMetaKeys["user"]) && count($ajxpNode->indexableMetaKeys["user"])
        && AuthService::usersEnabled() && AuthService::getLoggedUser() != null) {
        $privateDoc = new Zend_Search_Lucene_Document();
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "user"));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_user", AuthService::getLoggedUser()->getId()));
        foreach ($ajxpNode->indexableMetaKeys["user"] as $userField) {
            if ($ajxpNode->{$userField}) {
                $privateDoc->addField(Zend_Search_Lucene_Field::Keyword($userField, $ajxpNode->{$userField}));
            }
        }
        $privateDoc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
        $index->addDocument($privateDoc);
    }

    // Full-text body, indexed but not stored.
    if ($parseContent) {
        $body = $this->extractIndexableContent($ajxpNode);
        if (!empty($body)) {
            $doc->addField(Zend_Search_Lucene_Field::UnStored("body", $body));
        }
    }

    $index->addDocument($doc);
    return $doc;
}
/**
 * Builds the Lucene document describing a node and adds it to the index.
 *
 * A "shared" scope document is always created. When the node declares
 * user-scoped indexable metadata and a user is logged in, a second "user"
 * scope document is added to the index as well. When content parsing is
 * enabled, a "body" field is filled from plain text files, from the output
 * of unoconv (doc/odt/xls/ods) or from the output of pdftotext (pdf).
 *
 * Fixes compared with the previous revision:
 *  - the PowerPoint branch tested for the "docx" extension, so .pptx files
 *    were never parsed; it now tests for "pptx";
 *  - the system encoding was passed to addField() (which ignores it) instead
 *    of to the field factory, so node_url, node_path, basename, ajxp_node and
 *    ajxp_meta_* were created without an encoding, unlike the same fields of
 *    the private document;
 *  - the shell redirect target of the unoconv pipe is now escaped.
 *
 * @param AJXP_Node $ajxpNode
 * @param Zend_Search_Lucene_Interface $index
 * @throws Exception when the content loader returns no document
 * @return Zend_Search_Lucene_Document the shared scope document
 */
public function createIndexedDocument($ajxpNode, &$index)
{
    $ajxpNode->loadNodeInfo();
    $ext = strtolower(pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION));

    // Content parsing is skipped for files above the configured size.
    $parseContent = $this->indexContent;
    if ($parseContent && $ajxpNode->bytesize > $this->getFilteredOption("PARSE_CONTENT_MAX_SIZE")) {
        $parseContent = false;
    }

    // Pick a format-aware loader when one is available, else an empty document.
    if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_HTML")))) {
        $doc = @Zend_Search_Lucene_Document_Html::loadHTMLFile($ajxpNode->getUrl());
    } elseif ($parseContent && $ext == "docx" && class_exists("Zend_Search_Lucene_Document_Docx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Docx::loadDocxFile($realFile);
    } elseif ($parseContent && $ext == "pptx" && class_exists("Zend_Search_Lucene_Document_Pptx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Pptx::loadPptxFile($realFile);
    } elseif ($parseContent && $ext == "xlsx" && class_exists("Zend_Search_Lucene_Document_Xlsx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($realFile);
    } else {
        $doc = new Zend_Search_Lucene_Document();
    }
    if ($doc == null) {
        throw new Exception("Could not load document");
    }

    $encoding = SystemTextEncoding::getEncoding();

    // Node location. "/" is swapped for a placeholder in node_path.
    $doc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Text("basename", basename($ajxpNode->getPath()), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_node", "yes", $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "shared"));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_modiftime", date("Ymd", $ajxpNode->ajxp_modiftime)));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_bytesize", $ajxpNode->bytesize));

    // Fall back to the (case-preserved) file extension when no mime is set.
    $ajxpMime = $ajxpNode->ajxp_mime;
    if (empty($ajxpMime)) {
        $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION)));
    } else {
        $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", $ajxpMime));
    }

    // Store a cached copy of the metadata
    $serializedMeta = base64_encode(serialize($ajxpNode->metadata));
    $doc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));

    // Shared indexable metadata: one keyword per non-empty key.
    if (isset($ajxpNode->indexableMetaKeys["shared"])) {
        foreach ($ajxpNode->indexableMetaKeys["shared"] as $sharedField) {
            if ($ajxpNode->{$sharedField}) {
                $doc->addField(Zend_Search_Lucene_Field::Keyword($sharedField, $ajxpNode->{$sharedField}));
            }
        }
    }

    // Configured meta fields are indexed as tokenized text.
    foreach ($this->metaFields as $field) {
        if ($ajxpNode->{$field} != null) {
            $doc->addField(Zend_Search_Lucene_Field::Text("ajxp_meta_{$field}", $ajxpNode->{$field}, $encoding));
        }
    }

    // User-scoped metadata goes into a separate document tagged with the user id.
    if (isset($ajxpNode->indexableMetaKeys["user"]) && count($ajxpNode->indexableMetaKeys["user"])
        && AuthService::usersEnabled() && AuthService::getLoggedUser() != null) {
        $privateDoc = new Zend_Search_Lucene_Document();
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "user"));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_user", AuthService::getLoggedUser()->getId()));
        foreach ($ajxpNode->indexableMetaKeys["user"] as $userField) {
            if ($ajxpNode->{$userField}) {
                $privateDoc->addField(Zend_Search_Lucene_Field::Keyword($userField, $ajxpNode->{$userField}));
            }
        }
        $privateDoc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
        $index->addDocument($privateDoc);
    }

    // Plain text files: the raw file content becomes the body.
    if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_TXT")))) {
        $doc->addField(Zend_Search_Lucene_Field::UnStored("body", file_get_contents($ajxpNode->getUrl())));
    }

    // Office documents: converted through the configured unoconv command.
    $unoconv = $this->getFilteredOption("UNOCONV");
    $pipe = false;
    if ($parseContent && !empty($unoconv) && in_array($ext, array("doc", "odt", "xls", "ods"))) {
        $targetExt = "txt";
        if (in_array($ext, array("xls", "ods"))) {
            $targetExt = "csv";
        } else {
            // NOTE(review): this branch can never run, because the enclosing
            // condition only admits doc/odt/xls/ods. It is left disabled on
            // purpose: as written it would create a sibling ".pdf" next to the
            // source file and unlink it at shutdown, which could clobber an
            // existing file of that name. Decide whether presentations should
            // be supported before widening the extension list above.
            if (in_array($ext, array("odp", "ppt"))) {
                $targetExt = "pdf";
                $pipe = true;
            }
        }
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $unoconv = "HOME=" . AJXP_Utils::getAjxpTmpDir() . " " . $unoconv . " --stdout -f {$targetExt} " . escapeshellarg($realFile);
        if ($pipe) {
            $newTarget = str_replace(".{$ext}", ".pdf", $realFile);
            // The target is a filesystem path and must be quoted for the shell.
            $unoconv .= " > " . escapeshellarg($newTarget);
            register_shutdown_function("unlink", $newTarget);
        }
        $output = array();
        exec($unoconv, $output, $return);
        if (!$pipe) {
            // unoconv output is read as ISO-8859-1 and reduced to ASCII.
            $out = implode("\n", $output);
            $enc = 'ISO-8859-1';
            $asciiString = iconv($enc, 'ASCII//TRANSLIT//IGNORE', $out);
            $doc->addField(Zend_Search_Lucene_Field::UnStored("body", $asciiString));
        } else {
            // Hand the generated PDF over to the pdftotext step below.
            $ext = "pdf";
        }
    }

    // PDF files: text extracted through the configured pdftotext command.
    $pdftotext = $this->getFilteredOption("PDFTOTEXT");
    if ($parseContent && !empty($pdftotext) && in_array($ext, array("pdf"))) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        if ($pipe && isset($newTarget) && is_file($newTarget)) {
            $realFile = $newTarget;
        }
        // Trailing "-" is passed to pdftotext as its output argument; the
        // command's captured output is what gets indexed.
        $cmd = $pdftotext . " " . escapeshellarg($realFile) . " -";
        $output = array();
        exec($cmd, $output, $return);
        $out = implode("\n", $output);
        $enc = 'UTF8';
        $asciiString = iconv($enc, 'ASCII//TRANSLIT//IGNORE', $out);
        $doc->addField(Zend_Search_Lucene_Field::UnStored("body", $asciiString));
    }

    $index->addDocument($doc);
    return $doc;
}
/**
 * Factory producing the Lucene object that matches a field type name.
 *
 * The type name is matched case-insensitively. 'binary' and 'index term' are
 * built without an encoding; 'keyword', 'unindexed', 'text' and 'unstored'
 * receive the 'encoding' parameter of the search instance.
 *
 * @param string $field    The type of field
 * @param string $name     The name to give the field
 * @param string $contents The contents for the field to have
 * @return mixed The requested field, or an index term for 'index term'
 * @throws sfLuceneIndexerException when the type name is not recognised
 */
protected function getLuceneField($field, $name, $contents)
{
    $kind = strtolower($field);

    // Types that do not take an encoding.
    if ($kind === 'binary') {
        return Zend_Search_Lucene_Field::Binary($name, $contents);
    }
    if ($kind === 'index term') {
        return new Zend_Search_Lucene_Index_Term($contents, $name);
    }

    // Type name => Zend_Search_Lucene_Field factory taking an encoding.
    $encodedFactories = array(
        'keyword'   => 'Keyword',
        'unindexed' => 'UnIndexed',
        'text'      => 'Text',
        'unstored'  => 'UnStored',
    );

    if (!isset($encodedFactories[$kind])) {
        throw new sfLuceneIndexerException(sprintf('Unknown field "%s" in factory', $field));
    }

    // Looked up only now, so unknown or encoding-less types never touch the search instance.
    $encoding = $this->getSearch()->getParameter('encoding');

    return call_user_func(
        array('Zend_Search_Lucene_Field', $encodedFactories[$kind]),
        $name,
        $contents,
        $encoding
    );
}