/**
 * Extract data from a PDF document and add it to the Lucene index.
 *
 * The PDF is loaded with Zend_Pdf, selected metadata properties are copied
 * into an index-value array, the rendered document is converted to text by
 * App_Search_Helper_PdfParser::pdf2txt(), and the resulting
 * App_Search_Lucene_Document is added to the given index.
 *
 * @param string                   $pdfPath     The path to the PDF document.
 * @param Zend_Search_Lucene_Proxy $luceneIndex The Lucene index object.
 * @return Zend_Search_Lucene_Proxy The index object that was passed in.
 */
public static function index($pdfPath, $luceneIndex)
{
    // Load the PDF document.
    $pdf = Zend_Pdf::load($pdfPath);
    $key = md5($pdfPath);

    /**
     * Set up the array that contains the document index data.
     * The Filename will be used to retrieve the document if it is found in
     * the search results.
     * The Key will be used to uniquely identify the document so we can
     * delete it from the search index.
     */
    $indexValues = array(
        'Filename'     => $pdfPath,
        'Key'          => $key,
        'Title'        => '',
        'Author'       => '',
        'Subject'      => '',
        'Keywords'     => '',
        'Creator'      => '',
        'Producer'     => '',
        'CreationDate' => '',
        'ModDate'      => '',
        'Contents'     => '',
    );

    // Metadata entries that are copied into the index unchanged.
    $copiedVerbatim = array('Title', 'Subject', 'Author', 'Keywords', 'Date');

    // Go through each metadata item and add it to the index array.
    foreach ($pdf->properties as $meta => $metaValue) {
        if (in_array($meta, $copiedVerbatim, true)) {
            $indexValues[$meta] = $metaValue;
            continue;
        }

        if ($meta !== 'CreationDate') {
            continue;
        }

        // Convert the date from the PDF format D:20090731160351+01'00'.
        // Only the calendar day (Ymd) is indexed, so the trailing time-zone
        // offset is deliberately ignored. Trailing components may be
        // missing from the string and then fall back to their defaults.
        $matched = preg_match(
            '/^(?:D:)?(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/',
            (string) $metaValue,
            $parts
        );
        if (!$matched) {
            // Unparseable date: leave CreationDate empty.
            continue;
        }

        // Unmatched trailing groups are absent from $parts; fill them in.
        $parts += array(2 => '01', 3 => '01', 4 => '00', 5 => '00', 6 => '00');

        // mktime() takes exactly six arguments; the former seventh
        // (is_dst) argument no longer exists in PHP 7+.
        $timestamp = mktime(
            (int) $parts[4],
            (int) $parts[5],
            (int) $parts[6],
            (int) $parts[2],
            (int) $parts[3],
            (int) $parts[1]
        );
        if ($timestamp !== false) {
            $indexValues['CreationDate'] = date('Ymd', $timestamp);
        }
    }

    /**
     * Parse the contents of the PDF document and pass the text to the
     * Contents item in the $indexValues array.
     */
    $pdfParse = new App_Search_Helper_PdfParser();
    $indexValues['Contents'] = $pdfParse->pdf2txt($pdf->render());

    // Create the document from the collected values and add it to the index.
    // (`new` either yields an object or throws, so no false-check is needed.)
    $doc = new App_Search_Lucene_Document($indexValues);
    $luceneIndex->addDocument($doc);

    // Return the Lucene index object.
    return $luceneIndex;
}
/**
 * Object constructor.
 *
 * Tries to wrap an existing index located at self::INDEX_DIR; when that
 * raises a Zend_Search_Lucene_Exception, a new index is created at the same
 * location instead. Afterwards the query parser's default encoding is set
 * to UTF-8.
 */
public function __construct()
{
    try {
        // Second constructor argument false: open, do not create.
        $existingIndex = new Zend_Search_Lucene(self::INDEX_DIR, false);
        parent::__construct($existingIndex);
    } catch (Zend_Search_Lucene_Exception $openFailure) {
        // Second constructor argument true: create the index.
        $freshIndex = new Zend_Search_Lucene(self::INDEX_DIR, true);
        parent::__construct($freshIndex);
    }

    Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding("UTF-8");
}
/**
 * Add the current document (self::$_doc) to the index (self::$_index).
 *
 * Any documents already indexed under the same Key value are deleted first,
 * so the index ends up with a single entry per Key. When self::$_doc is not
 * a Zend_Search_Lucene_Document the method does nothing: the index is left
 * untouched rather than having entries deleted with nothing to replace them.
 *
 * @return void
 * @access public
 * @static
 *
 * @author Etienne de Longeaux <*****@*****.**>
 * @since 2012-06-11
 */
public static function addDocument()
{
    // Validate before reading ->Key; there is nothing to index otherwise.
    if (!(self::$_doc instanceof \Zend_Search_Lucene_Document)) {
        return;
    }

    // Search for documents with the same Key value.
    $term = new \Zend_Search_Lucene_Index_Term(self::$_doc->Key, 'Key');
    $docIds = self::$_index->termDocs($term);

    // Delete any documents found.
    foreach ($docIds as $id) {
        self::$_index->delete($id);
    }

    self::$_index->addDocument(self::$_doc);
}
/**
 * Run a query against the given Lucene index, optionally sorted.
 *
 * Recognised keys of $where_criteria, as read by this method:
 *  - 'query': the query passed to $lucene->find().
 *  - 'limit': array whose first element is the result-set limit; when it is
 *             missing, 0 is used.
 *  - 'sort' : optional list of extra arguments appended to the find() call
 *             (field name, sort type, sort order, ...).
 *
 * @param Zend_Search_Lucene_Proxy $lucene         the index to search
 * @param array                    $where_criteria the search criteria
 * @return mixed whatever $lucene->find() returns for the query
 *
 * @since 5-24-11
 */
protected function find(Zend_Search_Lucene_Proxy $lucene, array $where_criteria)
{
    $limit = isset($where_criteria['limit'][0]) ? $where_criteria['limit'][0] : 0;
    Zend_Search_Lucene::setResultSetLimit($limit);

    if (empty($where_criteria['sort'])) {
        return $lucene->find($where_criteria['query']);
    }

    // http://framework.zend.com/manual/en/zend.search.lucene.searching.html#zend.search.lucene.searching.sorting
    // Re-index the sort arguments so they are always passed positionally.
    $args = array_values((array) $where_criteria['sort']);
    array_unshift($args, $where_criteria['query']);

    // Return the sorted result; previously it was computed and then
    // discarded in favour of a second, unsorted find() call.
    return call_user_func_array(array($lucene, 'find'), $args);
}
/**
 * Object constructor.
 *
 * Hands the index to the parent constructor, then stores the module name
 * (cast to string) on the instance.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @param string $module the Drupal module managing the index
 */
public function __construct(Zend_Search_Lucene_Interface $index, $module)
{
    parent::__construct($index);

    $moduleName = (string) $module;
    $this->_module = $moduleName;
}
/**
 * Destructor.
 *
 * Calls removeReference() on the held index, provided it implements
 * \Zend_Search_Lucene_Interface; otherwise nothing happens.
 *
 * @return void
 */
public function __destruct()
{
    // Nothing to release when no valid index is attached.
    if (!($this->_index instanceof \Zend_Search_Lucene_Interface)) {
        return;
    }

    $this->_index->removeReference();
}
/**
 * Search all pages that match the query.
 *
 * <code>
 *   //$query = '(pi AND groupe AND partner*) OR pi-groupe';
 *   $query   = " travers projet ference coin";
 *   $options = array(
 *       'searchBool'         => true,
 *       'searchBoolType'     => 'AND',
 *       'searchByMotif'      => true,
 *       'setMinPrefixLength' => 0,
 *       'getResultSetLimit'  => 0,
 *       'searchFields'       => array(
 *           0 => array('sortField' => 'Contents', 'sortType' => SORT_STRING,  'sortOrder' => SORT_ASC),
 *           1 => array('sortField' => 'Key',      'sortType' => SORT_NUMERIC, 'sortOrder' => SORT_DESC)
 *       ),
 *   );
 *   $result = $this->container->get('pi_app_admin.manager.search_lucene')->searchPagesByQuery($query, $options);
 * </code>
 *
 * Processing steps:
 *  1. Scalar option values are normalised: true / "true" become 1, "false"
 *     becomes 0 and digit-only strings become integers.
 *  2. The query is passed through the string manager's
 *     minusculesSansAccents() helper.
 *  3. With 'searchBool' enabled the query is split on spaces, each term is
 *     optionally suffixed with '*' ('searchByMotif') and the terms are
 *     joined with the 'searchBoolType' operator (OR / AND). Any other
 *     operator value joins the terms with a plain space.
 *  4. The query is executed, with the sort arguments from 'searchFields'
 *     when that option is an array, and every hit is mapped to an array
 *     with the keys Key, Route, Title, Keywords, ModDate, MaxResultByWord.
 *
 * @link http://framework.zend.com/manual/fr/zend.search.lucene.searching.html
 * @link http://framework.zend.com/manual/fr/learning.lucene.queries.html
 * @link http://framework.zend.com/manual/1.12/fr/zend.search.lucene.query-api.html
 * @param string     $query   The search query
 * @param array|null $options Options of the search query (see example above)
 * @param string     $locale  Locale passed to setlocale(); when empty, the
 *                            locale of the current request is used
 * @return array|null List of hit data; null when the query is empty or
 *                    nothing was found; an empty array when an exception
 *                    occurred during the search.
 * @access public
 *
 * @author Etienne de Longeaux <*****@*****.**>
 * @since 2012-06-11
 */
public function searchPagesByQuery($query = "Key:*", $options = null, $locale = '')
{
    try {
        // Normalise option values with strict comparisons so that e.g. the
        // integer 0 is never mistaken for "true" and only digit-only
        // strings are converted to integers.
        if (isset($options) && is_array($options) && count($options) >= 1) {
            foreach ($options as $name => $value) {
                if ($value === true || $value === 'true') {
                    $options[$name] = 1;
                } elseif ($value === 'false') {
                    $options[$name] = 0;
                } elseif (is_string($value) && preg_match('/^\s*[0-9]+\s*$/', $value)) {
                    $options[$name] = intval($value);
                }
            }
        }

        if (empty($query)) {
            return null;
        }
        $query = $this->container->get('sfynx.tool.string_manager')->minusculesSansAccents($query);

        if (empty($locale)) {
            $locale = $this->container->get('request')->getLocale();
        }

        $options_default = array(
            'searchBool'            => true,
            'searchBoolType'        => 'OR',
            'searchByMotif'         => true,
            'setMinPrefixLength'    => 0,
            'getResultSetLimit'     => 0,
            'searchFields'          => '*',
            'searchMaxResultByWord' => 5,
        );
        if (is_array($options)) {
            $options = array_merge($options_default, $options);
        } else {
            $options = $options_default;
        }

        // Build the query string that is actually sent to Lucene.
        if ($options['searchBool']) {
            $q_string = $this->container->get('sfynx.tool.string_manager')->cleanWhitespace($query);
            $q_array  = explode(' ', $q_string);
            if ($options['searchByMotif']) {
                $q_array = array_map(function ($value) {
                    return $value . '*';
                }, array_values($q_array));
            }
            switch ($options['searchBoolType']) {
                case 'OR':
                    $new_query = implode(' OR ', $q_array);
                    break;
                case 'AND':
                    $new_query = implode(' AND ', $q_array);
                    break;
                default:
                    // Unknown operator: join with a plain space instead of
                    // leaving $new_query undefined.
                    $new_query = implode(' ', $q_array);
                    break;
            }
        } else {
            $new_query = $query;
        }

        // Open the index.
        self::open($this->_indexPath);
        // Set minimum prefix length.
        \Zend_Search_Lucene_Search_Query_Wildcard::setMinPrefixLength($options['setMinPrefixLength']);
        // Set result set limit.
        \Zend_Search_Lucene::setResultSetLimit($options['getResultSetLimit']);

        // Resolves a sort flag to an integer. Accepts an integer, a
        // digit-only string or the name of a SORT_* constant; empty or
        // unrecognised values yield the default.
        $toSortFlag = function ($flag, $default) {
            if (empty($flag)) {
                return $default;
            }
            if (is_int($flag)) {
                return $flag;
            }
            if (is_string($flag) && preg_match('/^[0-9]+$/', $flag)) {
                return (int) $flag;
            }
            if (is_string($flag) && preg_match('/^SORT_[A-Z_]+$/', $flag) && defined($flag)) {
                return (int) constant($flag);
            }

            return $default;
        };

        // Assemble the find() arguments: the query first, then one
        // (field, type, order) triple per sort field. The arguments are
        // passed as real values; no PHP code is generated or eval()'d.
        // NOTE(review): $query has already been passed through
        // minusculesSansAccents() at this point, so the comparison with
        // "Key:*" may never match if that helper lower-cases its input --
        // confirm against the string manager.
        $findArgs = array($new_query);
        if (is_array($options['searchFields']) && $query != "Key:*") {
            foreach ($options['searchFields'] as $valuesField) {
                if (!is_array($valuesField)
                    || !isset($valuesField['sortField'])
                    || $valuesField['sortField'] === ''
                ) {
                    // Skip malformed sort definitions.
                    continue;
                }
                $findArgs[] = (string) $valuesField['sortField'];
                $findArgs[] = $toSortFlag(
                    isset($valuesField['sortType']) ? $valuesField['sortType'] : null,
                    SORT_STRING
                );
                $findArgs[] = $toSortFlag(
                    isset($valuesField['sortOrder']) ? $valuesField['sortOrder'] : null,
                    SORT_ASC
                );
            }
        }

        // Performs the query against the index; on failure it is retried
        // once with the fr_FR locale.
        $finder = array(self::$_index, 'find');
        try {
            setlocale(LC_ALL, $locale);
            $hits = call_user_func_array($finder, $findArgs);
        } catch (\Exception $e) {
            setlocale(LC_ALL, 'fr_FR');
            $hits = call_user_func_array($finder, $findArgs);
        }

        // Map every hit to a plain array; fields missing from the document
        // are reported as empty strings.
        $result_search = null;
        if (isset($hits) && is_array($hits)) {
            foreach ($hits as $hit) {
                $document = $hit->getDocument();
                $field    = $document->getFieldNames();

                $data = array();
                $data['Key']   = in_array('Key', $field, true) ? $document->Key : "";
                $data['Route'] = in_array('Route', $field, true) ? $document->Route : "";
                // NOTE(review): utf8_decode() is deprecated as of PHP 8.2;
                // plan a replacement before upgrading.
                $data['Title']    = in_array('Title', $field, true) ? utf8_decode($document->Title) : "";
                $data['Keywords'] = in_array('Keywords', $field, true) ? utf8_decode($document->Keywords) : "";
                $data['ModDate']  = in_array('ModDate', $field, true) ? $document->ModDate : "";
                $data['MaxResultByWord'] = $options['searchMaxResultByWord'];

                $result_search[] = $data;
            }
        }

        return $result_search;
    } catch (\Exception $e) {
        return array();
    }
}