/**
 * Reindex PDF media by content for in-PDF search
 *
 * Extracts text locations from the "original" version of every PDF object representation
 * and/or PDF media attribute value and writes them through MediaContentLocationIndexer.
 *
 * Options read from $po_opts via getOption():
 *   kinds     Comma-separated list of 'all', 'ca_object_representations' and/or 'ca_attributes'.
 *             Defaults to 'all' when empty.
 *   start_id  Lowest representation_id to process (representations only).
 *   end_id    Highest representation_id to process (representations only).
 *   id        Single representation_id; overrides start_id/end_id.
 *   ids       Comma-separated representation_ids; takes precedence over id/start_id/end_id.
 *
 * @param object $po_opts Option container exposing getOption() (presumably Zend_Console_Getopt - verify against caller)
 * @return bool False when PDFMiner is not installed, true otherwise
 */
public static function reindex_pdfs($po_opts = null) {
	require_once __CA_LIB_DIR__ . "/core/Db.php";
	require_once __CA_MODELS_DIR__ . "/ca_object_representations.php";

	if (!caPDFMinerInstalled()) {
		CLIUtils::addError(_t("Can't reindex PDFs: PDFMiner is not installed."));
		return false;
	}

	$o_db = new Db();
	$t_rep = new ca_object_representations();
	$t_rep->setMode(ACCESS_WRITE);

	$va_kinds = ($vs_kinds = $po_opts->getOption("kinds")) ? explode(",", $vs_kinds) : array();
	if (!is_array($va_kinds) || !sizeof($va_kinds)) {
		$va_kinds = array('all');
	}
	$va_kinds = array_map('strtolower', $va_kinds);

	if (in_array('all', $va_kinds) || in_array('ca_object_representations', $va_kinds)) {
		if (!($vn_start = (int) $po_opts->getOption('start_id'))) { $vn_start = null; }
		if (!($vn_end = (int) $po_opts->getOption('end_id'))) { $vn_end = null; }

		// A single id is expressed as a one-element range
		if ($vn_id = (int) $po_opts->getOption('id')) {
			$vn_start = $vn_id;
			$vn_end = $vn_id;
		}

		// Explicit id list; non-positive and non-numeric entries are dropped
		$va_ids = array();
		if ($vs_ids = (string) $po_opts->getOption('ids')) {
			if (sizeof($va_tmp = explode(",", $vs_ids))) {
				foreach ($va_tmp as $vn_id) {
					if ((int) $vn_id > 0) {
						$va_ids[] = (int) $vn_id;
					}
				}
			}
		}

		$vs_sql_where = null;
		$va_params = array();

		if (sizeof($va_ids)) {
			$vs_sql_where = "WHERE representation_id IN (?)";
			$va_params[] = $va_ids;
		} else {
			if ($vn_start > 0 && $vn_end > 0 && $vn_start <= $vn_end || $vn_start > 0 && $vn_end == null) {
				$vs_sql_where = "WHERE representation_id >= ?";
				$va_params[] = $vn_start;
				if ($vn_end) {
					$vs_sql_where .= " AND representation_id <= ?";
					$va_params[] = $vn_end;
				}
			}
		}

		// Only PDFs are of interest, whatever id restriction applies
		if ($vs_sql_where) {
			$vs_sql_where .= " AND mimetype = 'application/pdf'";
		} else {
			$vs_sql_where = " WHERE mimetype = 'application/pdf'";
		}

		$qr_reps = $o_db->query("\n\t\t\t\t\tSELECT * \n\t\t\t\t\tFROM ca_object_representations \n\t\t\t\t\t{$vs_sql_where}\n\t\t\t\t\tORDER BY representation_id\n\t\t\t\t", $va_params);

		print CLIProgressBar::start($qr_reps->numRows(), _t('Reindexing PDF representations'));

		$vn_rep_table_num = $t_rep->tableNum();
		while ($qr_reps->nextRow()) {
			$va_media_info = $qr_reps->getMediaInfo('media');
			// Media info may be missing; only read the filename when it is an array
			$vs_original_filename = (is_array($va_media_info) && isset($va_media_info['ORIGINAL_FILENAME'])) ? $va_media_info['ORIGINAL_FILENAME'] : null;

			print CLIProgressBar::next(1, _t("Reindexing PDF %1", $vs_original_filename ? $vs_original_filename . " (" . $qr_reps->get('representation_id') . ")" : $qr_reps->get('representation_id')));

			$t_rep->load($qr_reps->get('representation_id'));
			$vn_rep_id = $t_rep->getPrimaryKey();

			$m = new Media();
			if ($m->read($vs_path = $t_rep->getMediaPath('media', 'original')) && is_array($va_locs = $m->getExtractedTextLocations())) {
				// Drop existing entries for this representation before writing fresh ones
				MediaContentLocationIndexer::clear($vn_rep_table_num, $vn_rep_id);
				foreach ($va_locs as $vs_content => $va_loc_list) {
					foreach ($va_loc_list as $va_loc) {
						MediaContentLocationIndexer::index($vn_rep_table_num, $vn_rep_id, $vs_content, $va_loc['p'], $va_loc['x1'], $va_loc['y1'], $va_loc['x2'], $va_loc['y2']);
					}
				}
				MediaContentLocationIndexer::write();
			} else {
				//CLIUtils::addError(_t("[Warning] No content to reindex for PDF representation: %1", $vs_path));
			}
		}
		print CLIProgressBar::finish();
	}

	if (in_array('all', $va_kinds) || in_array('ca_attributes', $va_kinds)) {
		// get all Media elements
		$va_elements = ca_metadata_elements::getElementsAsList(false, null, null, true, false, true, array(16)); // 16=media
		$va_element_ids = is_array($va_elements) ? caExtractValuesFromArrayList($va_elements, 'element_id', array('preserveKeys' => false)) : null;

		// Without media elements there is nothing to count or index, and "IN (?)" would get an empty list
		if (is_array($va_element_ids) && sizeof($va_element_ids)) {
			// The id list is wrapped so the whole list binds to the single "?" placeholder,
			// as check_media_fixity() does for the same query
			$qr_c = $o_db->query("\n\t\t\t\t\tSELECT count(*) c \n\t\t\t\t\tFROM ca_attribute_values\n\t\t\t\t\tWHERE\n\t\t\t\t\t\telement_id in (?)\n\t\t\t\t", array($va_element_ids));
			if ($qr_c->nextRow()) {
				$vn_count = $qr_c->get('c');
			} else {
				$vn_count = 0;
			}

			$t_attr_val = new ca_attribute_values();
			$vn_attr_table_num = $t_attr_val->tableNum();

			print CLIProgressBar::start($vn_count, _t('Reindexing metadata attribute media'));
			foreach ($va_elements as $vs_element_code => $va_element_info) {
				$qr_vals = $o_db->query("SELECT value_id FROM ca_attribute_values WHERE element_id = ?", (int) $va_element_info['element_id']);
				$va_vals = $qr_vals->getAllFieldValues('value_id');

				foreach ($va_vals as $vn_value_id) {
					$t_attr_val = new ca_attribute_values($vn_value_id);
					if (!$t_attr_val->getPrimaryKey()) { continue; }

					$t_attr_val->setMode(ACCESS_WRITE);
					$t_attr_val->useBlobAsMediaField(true);

					$va_media_info = $t_attr_val->getMediaInfo('value_blob');
					// Values that are not PDFs are skipped without advancing the progress bar
					if (!is_array($va_media_info) || $va_media_info['MIMETYPE'] !== 'application/pdf') { continue; }
					$vs_original_filename = isset($va_media_info['ORIGINAL_FILENAME']) ? $va_media_info['ORIGINAL_FILENAME'] : null;

					print CLIProgressBar::next(1, _t("Reindexing %1", $vs_original_filename ? $vs_original_filename . " ({$vn_value_id})" : $vn_value_id));

					$m = new Media();
					if ($m->read($vs_path = $t_attr_val->getMediaPath('value_blob', 'original')) && is_array($va_locs = $m->getExtractedTextLocations())) {
						// Clear by value_id: entries are written below under (table num, value_id),
						// so that is the key that has to be cleared
						MediaContentLocationIndexer::clear($vn_attr_table_num, $vn_value_id);
						foreach ($va_locs as $vs_content => $va_loc_list) {
							foreach ($va_loc_list as $va_loc) {
								MediaContentLocationIndexer::index($vn_attr_table_num, $vn_value_id, $vs_content, $va_loc['p'], $va_loc['x1'], $va_loc['y1'], $va_loc['x2'], $va_loc['y2']);
							}
						}
						MediaContentLocationIndexer::write();
					} else {
						//CLIUtils::addError(_t("[Warning] No content to reindex for PDF in metadata attribute: %1", $vs_path));
					}
				}
			}
			print CLIProgressBar::finish();
		}
	}

	return true;
}
/**
 * Verify stored media and files against the MD5 checksums recorded in the database.
 *
 * Three passes are made: object representation media (every version), media attribute
 * values (every version) and file attribute values. Each mismatch is posted through
 * CLIUtils::addError() and appended to a report, which is written to disk when the
 * 'file' option is set.
 *
 * Options read from $po_opts via getOption():
 *   file    Path the report is written to; no report file is written when empty.
 *   format  'text', 'tab' or 'csv'; anything else falls back to 'text'.
 *
 * @param object $po_opts Option container exposing getOption() (presumably Zend_Console_Getopt - verify against caller)
 * @return bool Always true
 */
public static function check_media_fixity($po_opts = null) {
	require_once __CA_LIB_DIR__ . "/core/Db.php";
	require_once __CA_MODELS_DIR__ . "/ca_object_representations.php";

	// The path is used verbatim: lower-casing it would target a different file on case-sensitive filesystems
	$ps_file_path = (string) $po_opts->getOption('file');
	$ps_format = strtolower((string) $po_opts->getOption('format'));
	if (!in_array($ps_format, array('text', 'tab', 'csv'))) {
		$ps_format = 'text';
	}
	$vs_delimiter = ($ps_format == 'tab') ? "\t" : ",";

	$o_db = new Db();
	$o_dm = Datamodel::load();
	$t_rep = new ca_object_representations();

	$vs_report_output = join($vs_delimiter, array(_t('Type'), _t('Error'), _t('Name'), _t('ID'), _t('Version'), _t('File path'), _t('Expected MD5'), _t('Actual MD5'))) . "\n";

	// Verify object representations
	$qr_reps = $o_db->query("SELECT representation_id, idno, media FROM ca_object_representations WHERE deleted = 0");
	print CLIProgressBar::start($vn_rep_count = $qr_reps->numRows(), _t('Checking object representations')) . "\n";

	$vn_errors = 0;
	while ($qr_reps->nextRow()) {
		$vn_representation_id = $qr_reps->get('representation_id');
		print CLIProgressBar::next(1, _t("Checking representation media %1", $vn_representation_id));

		$va_media_versions = $qr_reps->getMediaVersions('media');
		foreach ($va_media_versions as $vs_version) {
			$vs_path = $qr_reps->getMediaPath('media', $vs_version);
			$vs_database_md5 = $qr_reps->getMediaInfo('media', $vs_version, 'MD5');
			$vs_file_md5 = md5_file($vs_path);

			if ($vs_database_md5 !== $vs_file_md5) {
				$t_rep->load($vn_representation_id);
				$vs_name = $t_rep->get("ca_objects.preferred_labels.name") . " (" . $t_rep->get("ca_objects.idno") . ")";

				$vs_message = _t("[Object representation][MD5 mismatch] %1; version %2 [%3]", $vs_name . "; representation_id={$vn_representation_id}", $vs_version, $vs_path);
				switch ($ps_format) {
					case 'text':
					default:
						$vs_report_output .= "{$vs_message}\n";
						break;
					case 'tab':
					case 'csv':
						$va_log = array(_t('Object representation'), _t("MD5 mismatch"), caEscapeForDelimitedOutput($vs_name), $vn_representation_id, $vs_version, $vs_path, $vs_database_md5, $vs_file_md5);
						$vs_report_output .= join($vs_delimiter, $va_log) . "\n";
						break;
				}

				CLIUtils::addError($vs_message);
				$vn_errors++;
			}
		}
	}
	print CLIProgressBar::finish();
	CLIUtils::addMessage(_t('%1 errors for %2 representations', $vn_errors, $vn_rep_count));

	// get all Media elements
	$va_elements = ca_metadata_elements::getElementsAsList(false, null, null, true, false, true, array(16)); // 16=media
	if (is_array($va_elements) && sizeof($va_elements)) {
		if (is_array($va_element_ids = caExtractValuesFromArrayList($va_elements, 'element_id', array('preserveKeys' => false))) && sizeof($va_element_ids)) {
			$qr_c = $o_db->query("\n\t\t\t\t\t\tSELECT count(*) c\n\t\t\t\t\t\tFROM ca_attribute_values\n\t\t\t\t\t\tWHERE\n\t\t\t\t\t\t\telement_id in (?)\n\t\t\t\t\t", array($va_element_ids));
			if ($qr_c->nextRow()) {
				$vn_count = $qr_c->get('c');
			} else {
				$vn_count = 0;
			}

			print CLIProgressBar::start($vn_count, _t('Checking attribute media'));

			$vn_errors = 0;
			foreach ($va_elements as $vs_element_code => $va_element_info) {
				$qr_vals = $o_db->query("SELECT value_id FROM ca_attribute_values WHERE element_id = ?", (int) $va_element_info['element_id']);
				$va_vals = $qr_vals->getAllFieldValues('value_id');

				foreach ($va_vals as $vn_value_id) {
					$t_attr_val = new ca_attribute_values($vn_value_id);
					if (!$t_attr_val->getPrimaryKey()) { continue; }

					$t_attr_val->setMode(ACCESS_WRITE);
					$t_attr_val->useBlobAsMediaField(true);

					print CLIProgressBar::next(1, _t("Checking attribute media %1", $vn_value_id));

					$va_media_versions = $t_attr_val->getMediaVersions('value_blob');
					foreach ($va_media_versions as $vs_version) {
						$vs_path = $t_attr_val->getMediaPath('value_blob', $vs_version);
						$vs_database_md5 = $t_attr_val->getMediaInfo('value_blob', $vs_version, 'MD5');
						$vs_file_md5 = md5_file($vs_path);

						if ($vs_database_md5 !== $vs_file_md5) {
							$t_attr = new ca_attributes($vn_attribute_id = $t_attr_val->get('attribute_id'));

							// Fallback label; replaced by the owning record's label (and idno) when that record loads
							$vs_label = "attribute_id={$vn_attribute_id}; value_id={$vn_value_id}";
							if ($t_instance = $o_dm->getInstanceByTableNum($t_attr->get('table_num'), true)) {
								if ($t_instance->load($t_attr->get('row_id'))) {
									$vs_label = $t_instance->get($t_instance->tableName() . '.preferred_labels');
									if ($vs_idno = $t_instance->get($t_instance->getProperty('ID_NUMBERING_ID_FIELD'))) {
										$vs_label .= " ({$vs_idno})";
									}
								}
							}

							$vs_message = _t("[Media attribute][MD5 mismatch] %1; value_id=%2; version %3 [%4]", $vs_label, $vn_value_id, $vs_version, $vs_path);

							switch ($ps_format) {
								case 'text':
								default:
									$vs_report_output .= "{$vs_message}\n";
									break;
								case 'tab':
								case 'csv':
									$va_log = array(_t('Media attribute'), _t("MD5 mismatch"), caEscapeForDelimitedOutput($vs_label), $vn_value_id, $vs_version, $vs_path, $vs_database_md5, $vs_file_md5);
									// One row per line, as for representations
									$vs_report_output .= join($vs_delimiter, $va_log) . "\n";
									break;
							}

							CLIUtils::addError($vs_message);
							$vn_errors++;
						}
					}
				}
			}

			print CLIProgressBar::finish();
			// Report against the number of attribute values counted above, not the representation count
			CLIUtils::addMessage(_t('%1 errors for %2 attributes', $vn_errors, $vn_count));
		}
	}

	// get all File elements
	$va_elements = ca_metadata_elements::getElementsAsList(false, null, null, true, false, true, array(15)); // 15=file
	if (is_array($va_elements) && sizeof($va_elements)) {
		if (is_array($va_element_ids = caExtractValuesFromArrayList($va_elements, 'element_id', array('preserveKeys' => false))) && sizeof($va_element_ids)) {
			$qr_c = $o_db->query("\n\t\t\t\t\t\tSELECT count(*) c\n\t\t\t\t\t\tFROM ca_attribute_values\n\t\t\t\t\t\tWHERE\n\t\t\t\t\t\t\telement_id in (?)\n\t\t\t\t\t", array($va_element_ids));
			if ($qr_c->nextRow()) {
				$vn_count = $qr_c->get('c');
			} else {
				$vn_count = 0;
			}

			print CLIProgressBar::start($vn_count, _t('Checking attribute files'));

			$vn_errors = 0;
			foreach ($va_elements as $vs_element_code => $va_element_info) {
				$qr_vals = $o_db->query("SELECT value_id FROM ca_attribute_values WHERE element_id = ?", (int) $va_element_info['element_id']);
				$va_vals = $qr_vals->getAllFieldValues('value_id');

				foreach ($va_vals as $vn_value_id) {
					$t_attr_val = new ca_attribute_values($vn_value_id);
					if (!$t_attr_val->getPrimaryKey()) { continue; }

					$t_attr_val->setMode(ACCESS_WRITE);
					$t_attr_val->useBlobAsFileField(true);

					print CLIProgressBar::next(1, _t("Checking attribute file %1", $vn_value_id));

					$vs_path = $t_attr_val->getFilePath('value_blob');
					$vs_database_md5 = $t_attr_val->getFileInfo('value_blob', 'MD5');
					$vs_file_md5 = md5_file($vs_path);

					if ($vs_database_md5 !== $vs_file_md5) {
						$t_attr = new ca_attributes($vn_attribute_id = $t_attr_val->get('attribute_id'));

						// Fallback label; replaced by the owning record's label (and idno) when that record loads
						$vs_label = "attribute_id={$vn_attribute_id}; value_id={$vn_value_id}";
						if ($t_instance = $o_dm->getInstanceByTableNum($t_attr->get('table_num'), true)) {
							if ($t_instance->load($t_attr->get('row_id'))) {
								$vs_label = $t_instance->get($t_instance->tableName() . '.preferred_labels');
								if ($vs_idno = $t_instance->get($t_instance->getProperty('ID_NUMBERING_ID_FIELD'))) {
									$vs_label .= " ({$vs_idno})";
								}
							}
						}

						// No version is looked up for files in this pass, so none is reported;
						// the delimited report keeps an empty "Version" column so columns stay aligned
						$vs_message = _t("[File attribute][MD5 mismatch] %1; value_id=%2 [%3]", $vs_label, $vn_value_id, $vs_path);

						switch ($ps_format) {
							case 'text':
							default:
								$vs_report_output .= "{$vs_message}\n";
								break;
							case 'tab':
							case 'csv':
								$va_log = array(_t('File attribute'), _t("MD5 mismatch"), caEscapeForDelimitedOutput($vs_label), $vn_value_id, '', $vs_path, $vs_database_md5, $vs_file_md5);
								$vs_report_output .= join($vs_delimiter, $va_log) . "\n";
								break;
						}

						CLIUtils::addError($vs_message);
						$vn_errors++;
					}
				}
			}

			print CLIProgressBar::finish();
			CLIUtils::addMessage(_t('%1 errors for %2 attributes', $vn_errors, $vn_count));
		}
	}

	if ($ps_file_path) {
		file_put_contents($ps_file_path, $vs_report_output);
	}
	return true;
}
/**
 * Apply new values to the currently loaded attribute.
 *
 * The attribute's locale_id is updated from $pa_values['locale_id'], each existing value is
 * edited in place, and values are then added for elements of the set that have no stored
 * value yet. Values in $pa_values are looked up by element_id first and by element code second.
 *
 * The work runs inside the transaction this instance is already in, if any; otherwise a new
 * Transaction is created here and committed or rolled back before returning.
 *
 * Errors posted: 1971 (attribute update failed), 1973 (editing an existing value failed),
 * 1972 (adding a missing value failed).
 *
 * @param array $pa_values Values keyed by element_id or element code, plus 'locale_id'
 * @param array $pa_options Passed through unchanged to editValue() and addValue()
 * @return bool|null Null when no attribute is loaded, false when errors were posted, true on success
 */
public function editAttribute($pa_values, $pa_options = null) {
	if (!$this->getPrimaryKey()) {
		return null;
	}

	// Reuse the surrounding transaction if there is one; otherwise this call owns a new one
	$vb_owns_transaction = !$this->inTransaction();
	if ($vb_owns_transaction) {
		$o_trans = new Transaction();
		$this->setTransaction($o_trans);
	} else {
		$o_trans = $this->getTransaction();
	}

	// Drop the cached attribute list for the row this attribute belongs to
	$vs_cache_key = $this->get('table_num') . '/' . $this->get('row_id');
	unset(ca_attributes::$s_get_attributes_cache[$vs_cache_key]);

	$this->setMode(ACCESS_WRITE);
	$this->set('locale_id', $pa_values['locale_id']);
	$this->update();

	if ($this->numErrors()) {
		if ($vb_owns_transaction) {
			$o_trans->rollback();
		}
		// Collapse whatever update() reported into a single 1971 error
		$vs_update_errors = join('; ', $this->getErrors());
		$this->clearErrors();
		$this->postError(1971, $vs_update_errors, 'ca_attributes->editAttribute()');
		return false;
	}

	$t_value = new ca_attribute_values();
	$t_value->purify($this->purify());
	$t_value->setTransaction($o_trans);
	$t_value->setMode(ACCESS_WRITE);

	$t_element = ca_attributes::getElementInstance($this->get('element_id'));
	$va_pending_elements = $t_element->getElementsInSet();

	// Pass 1: edit the values that already exist, striking their elements off the pending list
	foreach ($this->getAttributeValues() as $o_existing_value) {
		$vn_element_id = intval($o_existing_value->getElementID());
		if (!$t_value->load($o_existing_value->getValueID())) {
			continue;
		}

		$vm_new_value = isset($pa_values[$vn_element_id])
			? $pa_values[$vn_element_id]
			: $pa_values[$o_existing_value->getElementCode()];

		if ($t_value->editValue($vm_new_value, $pa_options) === false) {
			$this->postError(1973, join('; ', $t_value->getErrors()), 'ca_attributes->editAttribute()');
		}

		foreach ($va_pending_elements as $vn_key => $va_pending) {
			if ($va_pending['element_id'] == $vn_element_id) {
				unset($va_pending_elements[$vn_key]);
			}
		}
	}

	$vn_attribute_id = $this->getPrimaryKey();

	// Pass 2: add values that don't already exist (added after the fact?)
	foreach ($va_pending_elements as $va_pending) {
		if ($va_pending['datatype'] == 0) {
			continue; // skip containers
		}

		$vn_element_id = $va_pending['element_id'];
		$vm_new_value = isset($pa_values[$vn_element_id])
			? $pa_values[$vn_element_id]
			: $pa_values[$va_pending['element_code']];

		if ($t_value->addValue($vm_new_value, $va_pending, $vn_attribute_id, $pa_options) === false) {
			$this->postError(1972, join('; ', $t_value->getErrors()), 'ca_attributes->editAttribute()');
			break;
		}
	}

	$vb_failed = (bool) $this->numErrors();
	if ($vb_owns_transaction) {
		if ($vb_failed) {
			$o_trans->rollback();
		} else {
			$o_trans->commit();
		}
	}
	return !$vb_failed;
}
/**
 * Reprocess media
 *
 * Re-runs media processing for object representations and/or media attribute values by
 * re-setting each record's media field to the path of its own "original" version and
 * calling update().
 *
 * Options read from $po_opts via getOption():
 *   mimetypes  Comma-separated mimetype patterns (representations only). Each pattern is used
 *              as a regular expression anchored at the start of the mimetype; a representation
 *              is processed when it matches at least one pattern. No filtering when empty.
 *   versions   Comma-separated media versions passed to update() as 'updateOnlyMediaVersions'
 *              (representations only). All versions when empty.
 *   kinds      Comma-separated list of 'all', 'ca_object_representations' and/or 'ca_attributes'.
 *              Defaults to 'all' when empty.
 *   start_id   Lowest representation_id to process.
 *   end_id     Highest representation_id to process.
 *   id         Single representation_id; overrides start_id/end_id.
 *   ids        Comma-separated representation_ids; takes precedence over id/start_id/end_id.
 *
 * @param object $po_opts Option container exposing getOption() (presumably Zend_Console_Getopt - verify against caller)
 * @return bool Always true; per-record failures are reported through CLIUtils::addError()
 */
public static function reprocess_media($po_opts = null) {
	require_once __CA_LIB_DIR__ . "/core/Db.php";
	require_once __CA_MODELS_DIR__ . "/ca_object_representations.php";

	$o_db = new Db();
	$t_rep = new ca_object_representations();
	$t_rep->setMode(ACCESS_WRITE);

	$va_mimetypes = ($vs_mimetypes = $po_opts->getOption("mimetypes")) ? explode(",", $vs_mimetypes) : array();
	$va_versions = ($vs_versions = $po_opts->getOption("versions")) ? explode(",", $vs_versions) : array();
	$va_kinds = ($vs_kinds = $po_opts->getOption("kinds")) ? explode(",", $vs_kinds) : array();

	if (!is_array($va_kinds) || !sizeof($va_kinds)) {
		$va_kinds = array('all');
	}
	$va_kinds = array_map('strtolower', $va_kinds);

	if (in_array('all', $va_kinds) || in_array('ca_object_representations', $va_kinds)) {
		if (!($vn_start = (int) $po_opts->getOption('start_id'))) { $vn_start = null; }
		if (!($vn_end = (int) $po_opts->getOption('end_id'))) { $vn_end = null; }

		// A single id is expressed as a one-element range
		if ($vn_id = (int) $po_opts->getOption('id')) {
			$vn_start = $vn_id;
			$vn_end = $vn_id;
		}

		// Explicit id list; non-positive and non-numeric entries are dropped
		$va_ids = array();
		if ($vs_ids = (string) $po_opts->getOption('ids')) {
			if (sizeof($va_tmp = explode(",", $vs_ids))) {
				foreach ($va_tmp as $vn_id) {
					if ((int) $vn_id > 0) {
						$va_ids[] = (int) $vn_id;
					}
				}
			}
		}

		$vs_sql_where = null;
		$va_params = array();

		if (sizeof($va_ids)) {
			$vs_sql_where = "WHERE representation_id IN (?)";
			$va_params[] = $va_ids;
		} else {
			if ($vn_start > 0 && $vn_end > 0 && $vn_start <= $vn_end || $vn_start > 0 && $vn_end == null) {
				$vs_sql_where = "WHERE representation_id >= ?";
				$va_params[] = $vn_start;
				if ($vn_end) {
					$vs_sql_where .= " AND representation_id <= ?";
					$va_params[] = $vn_end;
				}
			}
		}

		$qr_reps = $o_db->query("\n\t\t\t\t\tSELECT * \n\t\t\t\t\tFROM ca_object_representations \n\t\t\t\t\t{$vs_sql_where}\n\t\t\t\t\tORDER BY representation_id\n\t\t\t\t", $va_params);

		print CLIProgressBar::start($qr_reps->numRows(), _t('Re-processing representation media'));
		while ($qr_reps->nextRow()) {
			$va_media_info = $qr_reps->getMediaInfo('media');
			// Media info may be missing; only read the filename when it is an array
			$vs_original_filename = (is_array($va_media_info) && isset($va_media_info['ORIGINAL_FILENAME'])) ? $va_media_info['ORIGINAL_FILENAME'] : null;

			print CLIProgressBar::next(1, _t("Re-processing %1", $vs_original_filename ? $vs_original_filename . " (" . $qr_reps->get('representation_id') . ")" : $qr_reps->get('representation_id')));

			$vs_mimetype = $qr_reps->getMediaInfo('media', 'original', 'MIMETYPE');

			if (sizeof($va_mimetypes)) {
				// Process the representation when ANY pattern matches. Skipping on the first
				// non-matching pattern would require a mimetype to match every pattern at once,
				// so a list of two or more mimetypes would never select anything.
				$vb_mimetype_match = false;
				foreach ($va_mimetypes as $vs_mimetype_pattern) {
					if (preg_match("!^{$vs_mimetype_pattern}!", $vs_mimetype)) {
						$vb_mimetype_match = true;
						break;
					}
				}
				if (!$vb_mimetype_match) { continue; }
			}

			$t_rep->load($qr_reps->get('representation_id'));
			// Feeding the original back in as new media is what triggers re-processing
			$t_rep->set('media', $qr_reps->getMediaPath('media', 'original'), array('original_filename' => $vs_original_filename));

			if (sizeof($va_versions)) {
				$t_rep->update(array('updateOnlyMediaVersions' => $va_versions));
			} else {
				$t_rep->update();
			}

			if ($t_rep->numErrors()) {
				CLIUtils::addError(_t("Error processing representation media: %1", join('; ', $t_rep->getErrors())));
			}
		}
		print CLIProgressBar::finish();
	}

	if (in_array('all', $va_kinds) || in_array('ca_attributes', $va_kinds)) {
		// get all Media elements
		$va_elements = ca_metadata_elements::getElementsAsList(false, null, null, true, false, true, array(16)); // 16=media
		$va_element_ids = is_array($va_elements) ? caExtractValuesFromArrayList($va_elements, 'element_id', array('preserveKeys' => false)) : null;

		// Without media elements there is nothing to count or process, and "IN (?)" would get an empty list
		if (is_array($va_element_ids) && sizeof($va_element_ids)) {
			// The id list is wrapped so the whole list binds to the single "?" placeholder,
			// as check_media_fixity() does for the same query
			$qr_c = $o_db->query("\n\t\t\t\t\tSELECT count(*) c \n\t\t\t\t\tFROM ca_attribute_values\n\t\t\t\t\tWHERE\n\t\t\t\t\t\telement_id in (?)\n\t\t\t\t", array($va_element_ids));
			if ($qr_c->nextRow()) {
				$vn_count = $qr_c->get('c');
			} else {
				$vn_count = 0;
			}

			print CLIProgressBar::start($vn_count, _t('Re-processing attribute media'));
			foreach ($va_elements as $vs_element_code => $va_element_info) {
				$qr_vals = $o_db->query("SELECT value_id FROM ca_attribute_values WHERE element_id = ?", (int) $va_element_info['element_id']);
				$va_vals = $qr_vals->getAllFieldValues('value_id');

				foreach ($va_vals as $vn_value_id) {
					$t_attr_val = new ca_attribute_values($vn_value_id);
					if (!$t_attr_val->getPrimaryKey()) { continue; }

					$t_attr_val->setMode(ACCESS_WRITE);
					$t_attr_val->useBlobAsMediaField(true);

					$va_media_info = $t_attr_val->getMediaInfo('value_blob');
					$vs_original_filename = is_array($va_media_info) ? $va_media_info['ORIGINAL_FILENAME'] : '';

					print CLIProgressBar::next(1, _t("Re-processing %1", $vs_original_filename ? $vs_original_filename . " ({$vn_value_id})" : $vn_value_id));

					$t_attr_val->set('value_blob', $t_attr_val->getMediaPath('value_blob', 'original'), array('original_filename' => $vs_original_filename));
					$t_attr_val->update();

					if ($t_attr_val->numErrors()) {
						CLIUtils::addError(_t("Error processing attribute media: %1", join('; ', $t_attr_val->getErrors())));
					}
				}
			}
			print CLIProgressBar::finish();
		}
	}

	return true;
}