Exemple #1
0
 /**
  * Reindex PDF media by content for in-PDF search.
  *
  * Walks PDF media in object representations and/or in media metadata
  * attributes, extracts the text locations of each PDF via Media and rewrites
  * the corresponding entries through MediaContentLocationIndexer.
  *
  * Options read from $po_opts via getOption():
  *   - kinds:    comma-separated list of what to reindex; recognised values are
  *               "all", "ca_object_representations" and "ca_attributes".
  *               Defaults to "all" when empty.
  *   - start_id: lowest representation_id to process (representations only).
  *   - end_id:   highest representation_id to process (representations only).
  *   - id:       single representation_id; overrides start_id/end_id.
  *   - ids:      comma-separated representation_ids; takes precedence over the
  *               start/end range when it contains at least one positive id.
  *
  * @param object $po_opts Options object exposing getOption(). The default of
  *                        null is kept for interface compatibility, but the
  *                        body calls getOption() on it unconditionally.
  * @return bool False when PDFMiner is not installed, true otherwise.
  */
 public static function reindex_pdfs($po_opts = null)
 {
     require_once __CA_LIB_DIR__ . "/core/Db.php";
     require_once __CA_MODELS_DIR__ . "/ca_object_representations.php";
     if (!caPDFMinerInstalled()) {
         CLIUtils::addError(_t("Can't reindex PDFs: PDFMiner is not installed."));
         return false;
     }
     $o_db = new Db();
     $t_rep = new ca_object_representations();
     $t_rep->setMode(ACCESS_WRITE);
     $va_versions = array("original");
     $va_kinds = ($vs_kinds = $po_opts->getOption("kinds")) ? explode(",", $vs_kinds) : array();
     if (!is_array($va_kinds) || !sizeof($va_kinds)) {
         $va_kinds = array('all');
     }
     $va_kinds = array_map('strtolower', $va_kinds);

     // ---- Pass 1: PDFs stored as object representations ----
     if (in_array('all', $va_kinds) || in_array('ca_object_representations', $va_kinds)) {
         // A zero/absent option is normalised to null, meaning "no bound"
         if (!($vn_start = (int) $po_opts->getOption('start_id'))) {
             $vn_start = null;
         }
         if (!($vn_end = (int) $po_opts->getOption('end_id'))) {
             $vn_end = null;
         }
         // A single id collapses the range to exactly that row
         if ($vn_id = (int) $po_opts->getOption('id')) {
             $vn_start = $vn_id;
             $vn_end = $vn_id;
         }
         // Explicit id list; non-positive entries are discarded
         $va_ids = array();
         if ($vs_ids = (string) $po_opts->getOption('ids')) {
             if (sizeof($va_tmp = explode(",", $vs_ids))) {
                 foreach ($va_tmp as $vn_id) {
                     if ((int) $vn_id > 0) {
                         $va_ids[] = (int) $vn_id;
                     }
                 }
             }
         }
         $vs_sql_where = null;
         $va_params = array();
         if (sizeof($va_ids)) {
             // The id list is bound as a single (array) parameter for the IN clause
             $vs_sql_where = "WHERE representation_id IN (?)";
             $va_params[] = $va_ids;
         } else {
             // Either a valid closed range (start <= end) or an open-ended range (start only)
             if ($vn_start > 0 && $vn_end > 0 && $vn_start <= $vn_end || $vn_start > 0 && $vn_end == null) {
                 $vs_sql_where = "WHERE representation_id >= ?";
                 $va_params[] = $vn_start;
                 if ($vn_end) {
                     $vs_sql_where .= " AND representation_id <= ?";
                     $va_params[] = $vn_end;
                 }
             }
         }
         // Always restrict to PDFs, whether or not an id filter was built
         if ($vs_sql_where) {
             $vs_sql_where .= " AND mimetype = 'application/pdf'";
         } else {
             $vs_sql_where = " WHERE mimetype = 'application/pdf'";
         }
         $qr_reps = $o_db->query("\n\t\t\t\t\tSELECT * \n\t\t\t\t\tFROM ca_object_representations \n\t\t\t\t\t{$vs_sql_where}\n\t\t\t\t\tORDER BY representation_id\n\t\t\t\t", $va_params);
         print CLIProgressBar::start($qr_reps->numRows(), _t('Reindexing PDF representations'));
         $vn_rep_table_num = $t_rep->tableNum();
         while ($qr_reps->nextRow()) {
             $va_media_info = $qr_reps->getMediaInfo('media');
             $vs_original_filename = $va_media_info['ORIGINAL_FILENAME'];
             print CLIProgressBar::next(1, _t("Reindexing PDF %1", $vs_original_filename ? $vs_original_filename . " (" . $qr_reps->get('representation_id') . ")" : $qr_reps->get('representation_id')));
             $t_rep->load($qr_reps->get('representation_id'));
             $vn_rep_id = $t_rep->getPrimaryKey();
             $m = new Media();
             if ($m->read($vs_path = $t_rep->getMediaPath('media', 'original')) && is_array($va_locs = $m->getExtractedTextLocations())) {
                 // Drop existing entries for this representation before writing the new ones
                 MediaContentLocationIndexer::clear($vn_rep_table_num, $vn_rep_id);
                 foreach ($va_locs as $vs_content => $va_loc_list) {
                     foreach ($va_loc_list as $va_loc) {
                         MediaContentLocationIndexer::index($vn_rep_table_num, $vn_rep_id, $vs_content, $va_loc['p'], $va_loc['x1'], $va_loc['y1'], $va_loc['x2'], $va_loc['y2']);
                     }
                 }
                 MediaContentLocationIndexer::write();
             } else {
                 //CLIUtils::addError(_t("[Warning] No content to reindex for PDF representation: %1", $vs_path));
             }
         }
         print CLIProgressBar::finish();
     }

     // ---- Pass 2: PDFs stored in media metadata attributes ----
     if (in_array('all', $va_kinds) || in_array('ca_attributes', $va_kinds)) {
         // get all Media elements
         $va_elements = ca_metadata_elements::getElementsAsList(false, null, null, true, false, true, array(16));
         // 16=media
         // NOTE(review): the element id list is passed directly as the parameter list here,
         // whereas the representation query above wraps its id list in an outer array.
         // Confirm Db::query() binds this to the single IN (?) placeholder as intended.
         $qr_c = $o_db->query("\n\t\t\t\t\tSELECT count(*) c \n\t\t\t\t\tFROM ca_attribute_values\n\t\t\t\t\tWHERE\n\t\t\t\t\t\telement_id in (?)\n\t\t\t\t", caExtractValuesFromArrayList($va_elements, 'element_id', array('preserveKeys' => false)));
         if ($qr_c->nextRow()) {
             $vn_count = $qr_c->get('c');
         } else {
             $vn_count = 0;
         }
         $t_attr_val = new ca_attribute_values();
         $vn_attr_table_num = $t_attr_val->tableNum();
         // The total counts every value of the media elements; only PDF values advance
         // the bar below, so it may finish short of the total.
         print CLIProgressBar::start($vn_count, _t('Reindexing metadata attribute media'));
         foreach ($va_elements as $vs_element_code => $va_element_info) {
             $qr_vals = $o_db->query("SELECT value_id FROM ca_attribute_values WHERE element_id = ?", (int) $va_element_info['element_id']);
             $va_vals = $qr_vals->getAllFieldValues('value_id');
             foreach ($va_vals as $vn_value_id) {
                 $t_attr_val = new ca_attribute_values($vn_value_id);
                 if ($t_attr_val->getPrimaryKey()) {
                     $t_attr_val->setMode(ACCESS_WRITE);
                     $t_attr_val->useBlobAsMediaField(true);
                     $va_media_info = $t_attr_val->getMediaInfo('value_blob');
                     $vs_original_filename = $va_media_info['ORIGINAL_FILENAME'];
                     // Skip values without media info or whose media is not a PDF
                     if (!is_array($va_media_info) || $va_media_info['MIMETYPE'] !== 'application/pdf') {
                         continue;
                     }
                     print CLIProgressBar::next(1, _t("Reindexing %1", $vs_original_filename ? $vs_original_filename . " ({$vn_value_id})" : $vn_value_id));
                     $m = new Media();
                     if ($m->read($vs_path = $t_attr_val->getMediaPath('value_blob', 'original')) && is_array($va_locs = $m->getExtractedTextLocations())) {
                         // Clear by the value's own id (the row being reindexed), matching the
                         // id used by index() below; passing the table number as the row id
                         // would leave this value's stale entries in place.
                         MediaContentLocationIndexer::clear($vn_attr_table_num, $vn_value_id);
                         foreach ($va_locs as $vs_content => $va_loc_list) {
                             foreach ($va_loc_list as $va_loc) {
                                 MediaContentLocationIndexer::index($vn_attr_table_num, $vn_value_id, $vs_content, $va_loc['p'], $va_loc['x1'], $va_loc['y1'], $va_loc['x2'], $va_loc['y2']);
                             }
                         }
                         MediaContentLocationIndexer::write();
                     } else {
                         //CLIUtils::addError(_t("[Warning] No content to reindex for PDF in metadata attribute: %1", $vs_path));
                     }
                 }
             }
         }
         print CLIProgressBar::finish();
     }
     return true;
 }
Exemple #2
0
 /**
  * Read a PDF file and extract its text content.
  *
  * When PDFMiner is available, pdf2txt.py is run twice: once to get the plain
  * text (stored under the 'content' key of the handle) and once to get XML
  * output, from which per-word page/bounding-box locations are built (stored
  * under 'content_by_location'). Otherwise, if pdftotext is available, only
  * the plain text is extracted.
  *
  * @param string $ps_filepath Path to the PDF file to read.
  * @return bool False (after posting error 1650) when the file does not exist
  *              or is not recognised as a PDF; true otherwise.
  */
 public function read($ps_filepath)
 {
     if (is_array($this->handle) && $this->handle["filepath"] == $ps_filepath) {
         // noop
     } else {
         if (!file_exists($ps_filepath)) {
             $this->postError(1650, _t("File %1 does not exist", $ps_filepath), "WLPlugPDFWand->read()");
             $this->handle = "";
             $this->filepath = "";
             return false;
         }
         if (!$this->divineFileFormat($ps_filepath)) {
             $this->postError(1650, _t("File %1 is not a PDF", $ps_filepath), "WLPlugPDFWand->read()");
             $this->handle = "";
             $this->filepath = "";
             return false;
         }
     }
     $this->filepath = $ps_filepath;
     // Try to extract positions of text using PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html)
     if (caPDFMinerInstalled($this->ops_pdfminer_path)) {
         // Try to extract text
         $vs_tmp_filename = tempnam('/tmp', 'CA_PDF_TEXT');
         exec($this->ops_pdfminer_path . '/pdf2txt.py -t text ' . caEscapeShellArg($ps_filepath) . ' > ' . caEscapeShellArg($vs_tmp_filename));
         $vs_extracted_text = file_get_contents($vs_tmp_filename);
         $this->handle['content'] = $this->ohandle['content'] = $vs_extracted_text;
         @unlink($vs_tmp_filename);

         // Second run: XML output carrying a bounding box for each piece of text
         $vs_tmp_filename = tempnam('/tmp', 'CA_PDF_TEXT_LOCATIONS');
         exec($this->ops_pdfminer_path . '/pdf2txt.py -t xml ' . caEscapeShellArg($ps_filepath) . ' > ' . caEscapeShellArg($vs_tmp_filename));

         // Stays null when the XML cannot be opened, so the stored value is explicitly
         // "no locations" rather than coming from an undefined variable.
         $va_locations = null;
         $xml = new XMLReader();
         if ($xml->open($vs_tmp_filename)) {
             // Structure of locations array is [<word>][] = array(page, x1, y1, x2, y2)
             $va_locations = array();
             $vn_current_page = null;
             $vs_text_line_content = '';
             $vs_page_content = '';
             // Bounding boxes keyed by the character offset (within the current line)
             // at which each piece of text starts
             $va_text_line_locs = array();
             $vb_in_text_element = false;
             $va_current_text_loc = null;
             $vs_indexing_regex = $this->opo_search_config->get('indexing_tokenizer_regex');
             while (@$xml->read()) {
                 switch ($xml->name) {
                     case 'page':
                         // new page
                         if ($xml->nodeType == XMLReader::END_ELEMENT) {
                             //$va_locations['__pages__'][$vn_current_page] = $vs_page_content;
                             $vs_page_content = '';
                             // "continue" inside a switch acts as "break" in PHP (and warns
                             // since 7.3); nothing follows the switch in the loop body, so
                             // "break" is the equivalent, warning-free form.
                             break;
                         }
                         $vs_text_line_content = '';
                         $vn_current_page = (int) $xml->getAttribute('id');
                         break;
                     case 'textline':
                         if ($xml->nodeType == XMLReader::END_ELEMENT) {
                             // end of line: split the accumulated line into words and record
                             // each word's box, spanning its first to its last character
                             $vn_start = $vn_end = null;
                             $vs_acc = '';
                             $vn_line_length = mb_strlen($vs_text_line_content);
                             for ($vn_i = 0; $vn_i < $vn_line_length; $vn_i++) {
                                 // Offsets are character offsets (the loop bound and the keys of
                                 // $va_text_line_locs come from mb_strlen), so the character must
                                 // be fetched with mb_substr, not by byte index.
                                 $vs_char = mb_substr($vs_text_line_content, $vn_i, 1);
                                 if (preg_match("![{$vs_indexing_regex}]!u", $vs_char)) {
                                     // word boundary
                                     if ($vs_acc !== '') {
                                         $vs_acc = mb_strtolower($vs_acc);
                                         $va_start = $va_text_line_locs[$vn_start];
                                         $va_end = $va_text_line_locs[$vn_end];
                                         $va_locations[$vs_acc][] = array('p' => $vn_current_page, 'x1' => $va_start['x1'], 'y1' => $va_start['y1'], 'x2' => $va_end['x2'], 'y2' => $va_end['y2']);
                                     }
                                     $vn_start = $vn_end = null;
                                     $vs_acc = '';
                                 } else {
                                     if (is_null($vn_start)) {
                                         $vn_start = $vn_i;
                                     }
                                     $vn_end = $vn_i;
                                     $vs_acc .= $vs_char;
                                 }
                             }
                         } else {
                             // new line of text
                             $vs_page_content .= $vs_text_line_content;
                             $vs_text_line_content = '';
                             $va_text_line_locs = array();
                         }
                         break;
                     case 'textbox':
                         if ($xml->nodeType == XMLReader::END_ELEMENT) {
                             $vs_page_content .= "\n";
                         }
                         break;
                     case 'text':
                         // Remember the bounding box while inside a <text> element
                         if ($vb_in_text_element = $xml->nodeType == XMLReader::ELEMENT) {
                             $va_tmp = explode(",", (string) $xml->getAttribute('bbox'));
                             $va_current_text_loc = array('x1' => $va_tmp[0], 'y1' => $va_tmp[1], 'x2' => $va_tmp[2], 'y2' => $va_tmp[3]);
                         } else {
                             $va_current_text_loc = null;
                         }
                         break;
                     case '#text':
                         // bit of text to record (usually a single character)
                         if ($vb_in_text_element) {
                             $va_current_text_loc['chars'] = mb_strlen((string) $xml->value);
                             $va_text_line_locs[mb_strlen($vs_text_line_content)] = $va_current_text_loc;
                             $vs_text_line_content .= (string) $xml->value;
                         }
                         break;
                 }
             }
             // Release the reader before the temporary file is removed
             $xml->close();
         }
         $this->handle['content_by_location'] = $this->ohandle['content_by_location'] = $va_locations;
         @unlink($vs_tmp_filename);
     } else {
         // Try to extract text
         if (caMediaPluginPdftotextInstalled($this->ops_pdftotext_path)) {
             $vs_tmp_filename = tempnam('/tmp', 'CA_PDF_TEXT');
             exec($this->ops_pdftotext_path . ' -q -enc UTF-8 ' . caEscapeShellArg($ps_filepath) . ' ' . caEscapeShellArg($vs_tmp_filename));
             $vs_extracted_text = file_get_contents($vs_tmp_filename);
             $this->handle['content'] = $this->ohandle['content'] = $vs_extracted_text;
             @unlink($vs_tmp_filename);
         }
     }
     return true;
 }