Example #1
0
 /**
  * Extract data from a PDF document and add this to the Lucene index.
  *
  * The PDF meta data (Title, Subject, Author, Keywords, CreationDate, Date)
  * and the text returned by App_Search_Helper_PdfParser::pdf2txt() are
  * collected into one array, wrapped in an App_Search_Lucene_Document and
  * added to the given index.
  *
  * @param string $pdfPath                       The path to the PDF document.
  * @param Zend_Search_Lucene_Proxy $luceneIndex The Lucene index object.
  * @return Zend_Search_Lucene_Proxy The index object that was passed in.
  */
 public static function index($pdfPath, $luceneIndex)
 {
     // Load the PDF document.
     $pdf = Zend_Pdf::load($pdfPath);
     $key = md5($pdfPath);
     /**
      * Set up array to contain the document index data.
      * The Filename will be used to retrieve the document if it is found in
      * the search results.
      * The Key will be used to uniquely identify the document so we can
      * delete it from the search index.
      */
     $indexValues = array(
         'Filename'     => $pdfPath,
         'Key'          => $key,
         'Title'        => '',
         'Author'       => '',
         'Subject'      => '',
         'Keywords'     => '',
         'Creator'      => '',
         'Producer'     => '',
         'CreationDate' => '',
         'ModDate'      => '',
         'Contents'     => '',
     );
     // Meta data items that are copied into the index array unchanged.
     $copiedAsIs = array('Title', 'Subject', 'Author', 'Keywords', 'Date');
     // Go through each meta data item and add to index array.
     foreach ($pdf->properties as $meta => $metaValue) {
         if (in_array($meta, $copiedAsIs, true)) {
             $indexValues[$meta] = $metaValue;
         } elseif ($meta === 'CreationDate') {
             // Convert date from the PDF format of D:20090731160351+01'00'
             // to Ymd. Everything after the year is optional; missing parts
             // fall back to the first month/day and midnight.
             $datePattern = '/^(?:D:)?(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/';
             if (preg_match($datePattern, (string) $metaValue, $parts)) {
                 // Trailing optional groups that did not match are absent
                 // from $parts, so fill them in with defaults.
                 $parts += array(2 => '01', 3 => '01', 4 => '00', 5 => '00', 6 => '00');
                 // mktime() takes six arguments; the former seventh (is_dst)
                 // was removed in PHP 7 and must not be passed.
                 $timestamp = mktime(
                     (int) $parts[4],
                     (int) $parts[5],
                     (int) $parts[6],
                     (int) $parts[2],
                     (int) $parts[3],
                     (int) $parts[1]
                 );
                 $indexValues['CreationDate'] = date('Ymd', $timestamp);
             }
             // An unparsable date leaves CreationDate as the empty string.
         }
     }
     /**
      * Parse the contents of the PDF document and pass the text to the
      * contents item in the $indexValues array.
      */
     $pdfParse = new App_Search_Helper_PdfParser();
     $indexValues['Contents'] = $pdfParse->pdf2txt($pdf->render());
     // Create the document using the values and add it to our index.
     // `new` yields an object or throws, so no success check is needed.
     $doc = new App_Search_Lucene_Document($indexValues);
     $luceneIndex->addDocument($doc);
     // Return the Lucene index object.
     return $luceneIndex;
 }
Example #2
0
 /**
  * Wrap the Lucene index stored in self::INDEX_DIR.
  *
  * The index is first opened as an existing index; if that raises a
  * Zend_Search_Lucene_Exception it is created instead. The query parser's
  * default encoding is then set to UTF-8.
  */
 public function __construct()
 {
     try {
         // Second argument false: open an existing index.
         $existingIndex = new Zend_Search_Lucene(self::INDEX_DIR, false);
         parent::__construct($existingIndex);
     } catch (Zend_Search_Lucene_Exception $openFailure) {
         // Second argument true: create the index.
         $freshIndex = new Zend_Search_Lucene(self::INDEX_DIR, true);
         parent::__construct($freshIndex);
     }

     Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding("UTF-8");
 }
Example #3
0
 /**
  * Add the staged document (self::$_doc) to the index (self::$_index).
  *
  * Any document already indexed under the same Key value is deleted first.
  * Nothing happens when self::$_doc is not a Zend_Search_Lucene_Document.
  *
  * @return void
  * @access    public
  * @static
  *
  * @author Etienne de Longeaux <*****@*****.**>
  * @since 2012-06-11
  */
 public static function addDocument()
 {
     // Check the type first: reading ->Key from something that is not a
     // document is an error, and old entries must not be deleted when no
     // replacement is going to be added.
     if (!(self::$_doc instanceof \Zend_Search_Lucene_Document)) {
         return;
     }
     // Search for documents with the same Key value.
     $term = new \Zend_Search_Lucene_Index_Term(self::$_doc->Key, 'Key');
     $docIds = self::$_index->termDocs($term);
     // Delete any documents found.
     foreach ($docIds as $id) {
         self::$_index->delete($id);
     }
     self::$_index->addDocument(self::$_doc);
 }
Example #4
0
 /**
  * Run a query against the index, optionally sorted.
  *
  * Recognised keys of $where_criteria:
  *  - 'query': the query handed to $lucene->find().
  *  - 'limit': array whose first element is the result set limit; when
  *             absent the limit is set to 0.
  *  - 'sort':  optional list of extra find() arguments (sort field, sort
  *             type, sort order, ...) appended after the query.
  *
  * Note that the result set limit is a static setting on
  * Zend_Search_Lucene and stays in effect after this call.
  *
  * @param Zend_Search_Lucene_Proxy $lucene         the index to search
  * @param array                    $where_criteria the criteria described above
  * @return array the hits returned by $lucene->find()
  *
  * @since  5-24-11
  */
 protected function find(Zend_Search_Lucene_Proxy $lucene, array $where_criteria)
 {
     $limit = isset($where_criteria['limit'][0]) ? (int) $where_criteria['limit'][0] : 0;
     Zend_Search_Lucene::setResultSetLimit($limit);

     if (empty($where_criteria['sort'])) {
         return $lucene->find($where_criteria['query']);
     }

     // http://framework.zend.com/manual/en/zend.search.lucene.searching.html#zend.search.lucene.searching.sorting
     // array_values() keeps the arguments positional even if the caller
     // used string keys (those would become named arguments on PHP 8).
     $args = array_values((array) $where_criteria['sort']);
     array_unshift($args, $where_criteria['query']);

     // Return the sorted hits; previously they were computed and then
     // thrown away in favour of a second, unsorted find().
     return call_user_func_array(array($lucene, 'find'), $args);
 }
Example #5
0
 /**
  * Build the object around an index and record the module that owns it.
  *
  * @param Zend_Search_Lucene_Interface $index  index passed on to the parent constructor
  * @param string                       $module the Drupal module managing the index
  */
 public function __construct(Zend_Search_Lucene_Interface $index, $module)
 {
     parent::__construct($index);

     // Stored as a string regardless of what the caller passed.
     $this->_module = strval($module);
 }
 /**
  * Destructor: drops this object's reference on the wrapped index.
  *
  * Does nothing when $this->_index is not a Zend_Search_Lucene_Interface.
  *
  * @return void
  */
 public function __destruct()
 {
     $isLuceneIndex = $this->_index instanceof \Zend_Search_Lucene_Interface;
     if (!$isLuceneIndex) {
         return;
     }

     $this->_index->removeReference();
 }
 /**
  * Search all pages that match the query.
  *
  * <code>
  *  //$query = '(pi AND groupe AND partner*) OR pi-groupe';
  *    $query   = " travers projet ference coin";
  *    $options = array(
  *        'searchBool'         => true,
  *        'searchBoolType'     => 'AND',
  *        'searchByMotif'     => true,
  *        'setMinPrefixLength'=> 0,
  *        'getResultSetLimit' => 0,
  *        'searchFields'         => array(
  *                                    0=> array('sortField'=>'Contents', 'sortType'=> SORT_STRING, 'sortOrder' => SORT_ASC),
  *                                    1=> array('sortField'=>'Key', 'sortType'=> SORT_NUMERIC, 'sortOrder' => SORT_DESC)
  *                                ),
  *    );
  *    $result = $this->container->get('pi_app_admin.manager.search_lucene')->searchPagesByQuery($query, $options);
  * </code>
  *
  * Option values given as the strings "true" / "false" or as digit-only
  * strings are converted to 1 / 0 / integers before use. Missing options
  * fall back to: searchBool=true, searchBoolType='OR', searchByMotif=true,
  * setMinPrefixLength=0, getResultSetLimit=0, searchFields='*',
  * searchMaxResultByWord=5. A searchBoolType other than 'AND' is treated
  * as 'OR'.
  *
  * @link http://framework.zend.com/manual/fr/zend.search.lucene.searching.html
  * @link http://framework.zend.com/manual/fr/learning.lucene.queries.html
  * @link http://framework.zend.com/manual/1.12/fr/zend.search.lucene.query-api.html
  * @param string     $query   The search query index file
  * @param array|null $options Options of the search query of the index file
  * @param string     $locale  Locale given to setlocale() before searching;
  *                            when empty it is read from the 'request' service
  * @return array|null One array per hit with the keys Key, Route, Title,
  *                    Keywords, ModDate and MaxResultByWord. Null when the
  *                    query is empty or nothing matched; an empty array
  *                    when an exception was raised while searching.
  * @access    public
  *
  * @author Etienne de Longeaux <*****@*****.**>
  * @since 2012-06-11
  */
 public function searchPagesByQuery($query = "Key:*", $options = null, $locale = '')
 {
     try {
         // Normalise stringly-typed option values. Strict comparisons are
         // used on purpose: with loose ones the integer 0 equals "true" on
         // PHP < 8 and would be turned into 1.
         if (is_array($options)) {
             foreach ($options as $name => $value) {
                 if ($value === 'true') {
                     $options[$name] = 1;
                 } elseif ($value === 'false') {
                     $options[$name] = 0;
                 } elseif (is_string($value) && preg_match('/^[0-9]+\z/', $value)) {
                     $options[$name] = (int) $value;
                 }
             }
         }
         if (empty($query)) {
             return null;
         }
         $stringManager = $this->container->get('sfynx.tool.string_manager');
         $query = $stringManager->minusculesSansAccents($query);
         if (empty($locale)) {
             $locale = $this->container->get('request')->getLocale();
         }
         $options_default = array(
             'searchBool'            => true,
             'searchBoolType'        => 'OR',
             'searchByMotif'         => true,
             'setMinPrefixLength'    => 0,
             'getResultSetLimit'     => 0,
             'searchFields'          => '*',
             'searchMaxResultByWord' => 5,
         );
         $options = is_array($options) ? array_merge($options_default, $options) : $options_default;

         // Build the query string that is actually sent to the index.
         if ($options['searchBool']) {
             $words = explode(' ', $stringManager->cleanWhitespace($query));
             if ($options['searchByMotif']) {
                 // Turn every word into a prefix (wildcard) search.
                 foreach ($words as $position => $word) {
                     $words[$position] = $word . '*';
                 }
             }
             // Anything other than 'AND' falls back to 'OR' so that
             // $new_query is always defined.
             $glue = ($options['searchBoolType'] === 'AND') ? ' AND ' : ' OR ';
             $new_query = implode($glue, $words);
         } else {
             $new_query = $query;
         }

         // Open the index.
         self::open($this->_indexPath);
         // Set minimum prefix length.
         \Zend_Search_Lucene_Search_Query_Wildcard::setMinPrefixLength($options['setMinPrefixLength']);
         // Set result set limit.
         \Zend_Search_Lucene::setResultSetLimit($options['getResultSetLimit']);

         // Assemble the find() arguments: the query followed by one
         // (field, type, order) triple per requested sort field. The call
         // is made with call_user_func_array() rather than eval() so that
         // caller-supplied field names can never be executed as PHP code.
         // NOTE(review): $query has already been through
         // minusculesSansAccents(), which presumably lower-cases it, so the
         // comparison with "Key:*" may never be equal - confirm.
         $findArgs = array($new_query);
         if (is_array($options['searchFields']) && $query !== "Key:*") {
             $sortTypes = array(
                 'SORT_REGULAR' => SORT_REGULAR,
                 'SORT_NUMERIC' => SORT_NUMERIC,
                 'SORT_STRING'  => SORT_STRING,
             );
             $sortOrders = array('SORT_ASC' => SORT_ASC, 'SORT_DESC' => SORT_DESC);
             // Accepts the integer flag, its constant name or a digit-only
             // string (all of which the former eval() resolved as well).
             $resolveFlag = function ($raw, array $named, $fallback) {
                 if (empty($raw)) {
                     return $fallback;
                 }
                 if (is_string($raw)) {
                     if (isset($named[$raw])) {
                         return $named[$raw];
                     }
                     if (preg_match('/^[0-9]+\z/', $raw)) {
                         return (int) $raw;
                     }
                 }
                 return $raw;
             };
             foreach ($options['searchFields'] as $valuesField) {
                 // Entries without a usable sort field are skipped.
                 if (!is_array($valuesField) || empty($valuesField['sortField'])) {
                     continue;
                 }
                 $findArgs[] = (string) $valuesField['sortField'];
                 $findArgs[] = $resolveFlag(
                     isset($valuesField['sortType']) ? $valuesField['sortType'] : null,
                     $sortTypes,
                     SORT_STRING
                 );
                 $findArgs[] = $resolveFlag(
                     isset($valuesField['sortOrder']) ? $valuesField['sortOrder'] : null,
                     $sortOrders,
                     SORT_ASC
                 );
             }
         }

         // Performs a query against the index. If the search raises an
         // exception under the requested locale it is retried once under
         // fr_FR; a second failure reaches the outer catch.
         try {
             setlocale(LC_ALL, $locale);
             $hits = call_user_func_array(array(self::$_index, 'find'), $findArgs);
         } catch (\Exception $e) {
             setlocale(LC_ALL, 'fr_FR');
             $hits = call_user_func_array(array(self::$_index, 'find'), $findArgs);
         }

         $result_search = null;
         if (isset($hits) && is_array($hits)) {
             foreach ($hits as $hit) {
                 $document = $hit->getDocument();
                 $fieldNames = $document->getFieldNames();
                 $data = array();
                 // Fields missing from the document are reported as ''.
                 foreach (array('Key', 'Route', 'Title', 'Keywords', 'ModDate') as $fieldName) {
                     $data[$fieldName] = in_array($fieldName, $fieldNames, true) ? $document->{$fieldName} : "";
                 }
                 // Title and Keywords are converted from UTF-8 to ISO-8859-1.
                 // NOTE(review): utf8_decode() is deprecated as of PHP 8.2;
                 // replace it once the target encoding of callers is settled.
                 $data['Title'] = utf8_decode($data['Title']);
                 $data['Keywords'] = utf8_decode($data['Keywords']);
                 $data['MaxResultByWord'] = $options['searchMaxResultByWord'];
                 $result_search[] = $data;
             }
         }
         return $result_search;
     } catch (\Exception $e) {
         // Best effort: any failure while searching yields an empty result.
         return array();
     }
 }