/**
 * get_complete_data1 function.
 *
 * Fetches the documents linked from each entry of $data_org (HTML, PDF, RTF
 * and the meta page) and returns the entries enriched with an 'id_data'
 * sub-array holding the raw downloads and their cleaned text.
 *
 * Entries are skipped when check_ob_dokument_vorhanden() reports them as
 * already present in the database, or when $this->noch_nicht_vorhanden is set.
 * Processing stops once $this->count has reached MAX_DOK.
 *
 * @access private
 * @param array $data_org entries keyed arbitrarily; each entry is read for the keys
 *                        'id', 'id_link_html', 'id_link_pdf', 'id_link_rtf' and 'meta_link'
 * @param mixed $db       database handle, passed through to check_ob_dokument_vorhanden()
 * @return array the processed entries (same keys as $data_org); empty if nothing was processed
 */
private function get_complete_data1($data_org = array(), $db)
{
    $base_url = 'http://www2.bonn.de/bo_ris/';

    // Initialised up front so that an empty input, or a run in which every
    // entry is skipped, returns an empty array instead of an undefined variable.
    $data = array();

    foreach ($data_org as $key => $value) {
        // Limit reached: nothing below can run for the remaining entries.
        if ($this->count >= MAX_DOK) {
            break;
        }

        // Check whether the document is already in the DB: if so, move on;
        // if not, fetch and load it.
        if ($this->check_ob_dokument_vorhanden($value['id'], $db) == true or $this->noch_nicht_vorhanden == true) {
            continue;
        }

        // Once x documents have been reached.
        // NOTE(review): given the guard at the top of the loop this can only
        // trigger if check_ob_dokument_vorhanden() changes $this->count — confirm.
        if ($this->count > MAX_DOK) {
            $this->noch_nicht_vorhanden = true;
        }
        $this->count = $this->count + 1;

        $data[$key] = $value;

        // Start with a clean record for every document; otherwise downloads of
        // a previous entry would leak into entries that lack the same link.
        $row = array();

        // First the document itself, distinguishing by available format.
        if (!empty($value['id_link_html'])) {
            // Case 1: .htm
            $dok = class_methods::get_site($base_url . $value['id_link_html']);
            $row['html'] = $dok;
            $row['html_text'] = class_methods::get_clean_text($dok);
            $row['html_meta'] = $this->get_extra_infos_dokument($dok);
        }

        if (!empty($value['id_link_pdf'])) {
            $dok = class_methods::get_site($base_url . $value['id_link_pdf']);
            $row['pdf'] = $dok;
            // Extract the PDF text where possible (does not work for images etc.).
            $convert = new pdf2txt();
            $result = $convert->convert($dok);
            // The key spelling 'pfd_text' is kept as-is; consumers may rely on it.
            $row['pfd_text'] = class_methods::get_clean_text($result);
        }

        if (!empty($value['id_link_rtf'])) {
            $dok = class_methods::get_site($base_url . $value['id_link_rtf']);
            $row['sonstiges_dokument'] = $dok;
            $row['sonstiges_text'] = class_methods::get_clean_text($dok);
        }

        // Hand the collected data over to the main array.
        $data[$key]['id_data'] = $row;

        // Then the meta information belonging to the document.
        if (!empty($value['meta_link'])) {
            $dok = class_methods::get_site($base_url . 'ris_sql/' . $value['meta_link']);
            $row['meta_data'] = $dok;
            $row['meta_data_text'] = class_methods::get_clean_text($dok);
            $row['meta_data_extra'] = $this->get_extra_infos($dok);
            // Hand the extended data over to the main array.
            $data[$key]['id_data'] = $row;
        }
    }

    return $data;
}
/**
 * Indexes a certain file.
 *
 * Removes any existing index rows (and their cache entries) for the file,
 * extracts its text, stores the keywords and inserts a fresh row into the
 * '587_searchindex' table.
 *
 * Returns A587_FILE_GENERATED on success, or one of
 * A587_FILE_FORBIDDEN_EXTENSION, A587_FILE_NOEXIST, A587_FILE_EMPTY.
 * Note that for PDFs an error reported by pdftotext is replaced by the
 * outcome of the PHP-based pdf2txt fallback.
 *
 * @param string $_filename    file path, resolved relative to $this->documentRoot
 * @param mixed  $_doPlaintext if truthy, text without a plaintext version is run through getPlaintext()
 * @param mixed  $_clang       language id; only used when not false
 * @param mixed  $_fid         file id (int, or array stored JSON-encoded); false lets getMinFID() pick one
 * @param mixed  $_catid       category id; only used when not false
 *
 * @return mixed
 */
function indexFile($_filename, $_doPlaintext = false, $_clang = false, $_fid = false, $_catid = false)
{
    // extract file-extension
    $filenameArray = explode('.', $_filename);
    $fileext = $filenameArray[count($filenameArray) - 1];

    // check file-extension
    if (!in_array($fileext, $this->fileExtensions) and !empty($this->fileExtensions) and !$this->indexUnknownFileExtensions and !$this->indexMissingFileExtensions) {
        return A587_FILE_FORBIDDEN_EXTENSION;
    }

    // build the condition identifying the existing index rows of this file
    $delete = new rex_sql();
    $where = sprintf(" `filename` = '%s' AND `texttype` = 'file'", $delete->escape($_filename));
    if (is_int($_clang)) {
        $where .= sprintf(' AND clang = %d', $_clang);
    }
    if (is_int($_fid)) {
        $where .= sprintf(' AND fid = %d', $_fid);
    } elseif (is_array($_fid)) {
        $where .= sprintf(" AND fid = '%s'", $delete->escape(json_encode($_fid)));
    }
    if (is_int($_catid)) {
        $where .= sprintf(' AND catid = %d', $_catid);
    }

    // delete from cache
    $select = new rex_sql();
    $select->setTable($this->tablePrefix . '587_searchindex');
    $select->setWhere($where);
    $indexIds = array();
    if ($select->select('id')) {
        foreach ($select->getArray() as $result) {
            $indexIds[] = $result['id'];
        }
        $this->deleteCache($indexIds);
    }

    // delete old data
    $delete->setTable($this->tablePrefix . '587_searchindex');
    $delete->setWhere($where);
    $delete->delete();

    // index file
    $text = '';
    $plaintext = '';

    switch ($fileext) {
        // pdf-files
        case 'pdf':
            // try XPDF
            $return = 0;
            $xpdf = false;
            $error = false;

            if (function_exists('exec')) {
                $tempFile = tempnam($this->generatedPath . '/files/', 'rexsearch');

                // Without a temp file pdftotext has no usable destination;
                // skip straight to the PHP fallback below.
                if (false !== $tempFile) {
                    $encoding = 'UTF-8';
                    exec('pdftotext ' . escapeshellarg($this->documentRoot . '/' . $_filename) . ' ' . escapeshellarg($tempFile) . ' -enc ' . $encoding, $dummy, $return);

                    if ($return > 0) {
                        if ($return == 1) {
                            $error = A587_FILE_XPDFERR_OPENSRC;
                        }
                        if ($return == 2) {
                            $error = A587_FILE_XPDFERR_OPENDEST;
                        }
                        if ($return == 3) {
                            $error = A587_FILE_XPDFERR_PERM;
                        }
                        if ($return == 99) {
                            $error = A587_FILE_XPDFERR_OTHER;
                        }
                    } else {
                        if (false === ($text = @file_get_contents($tempFile))) {
                            $error = A587_FILE_NOEXIST;
                        } else {
                            $xpdf = true;
                        }
                    }

                    unlink($tempFile);
                }
            }

            if (!$xpdf) {
                // if xpdf returned an error, try pdf2txt via php
                if (false === ($pdfContent = @file_get_contents($this->documentRoot . '/' . $_filename))) {
                    $error = A587_FILE_NOEXIST;
                } else {
                    require_once 'class.pdf2txt.inc.php';
                    $text = pdf2txt::directConvert($pdfContent);
                    $error = false;
                }
            }

            if ($error !== false) {
                return $error;
            } elseif (trim($text) == '') {
                return A587_FILE_EMPTY;
            }

            $plaintext = $this->getPlaintext($text);
            break;

        // html- or php-file
        case 'htm':
        case 'html':
        case 'php':
            if (false === ($text = @file_get_contents($this->documentRoot . '/' . $_filename))) {
                return A587_FILE_NOEXIST;
            }

            $plaintext = $this->getPlaintext($text);
            // Without this break the default case re-read the same file a second time.
            break;

        // other filetype
        default:
            if (false === ($text = @file_get_contents($this->documentRoot . '/' . $_filename))) {
                return A587_FILE_NOEXIST;
            }
            break;
    }

    $text = @iconv(mb_detect_encoding($text), 'UTF-8', $text);

    // Plaintext
    if (empty($plaintext)) {
        if ($_doPlaintext) {
            $plaintext = $this->getPlaintext($text);
        } else {
            $plaintext = $text;
        }
    }

    // index file-content
    $insert = new rex_sql();
    $fileData = array();
    $fileData['texttype'] = 'file';

    if ($_fid !== false) {
        $fileData['ftable'] = $this->tablePrefix . 'file';
    }

    $fileData['filename'] = $insert->escape($_filename);
    $fileData['fileext'] = $insert->escape($fileext);

    if ($_clang !== false) {
        $fileData['clang'] = intval($_clang);
    }

    if ($_fid === false) {
        $fileData['fid'] = $this->getMinFID();
    } elseif (is_array($_fid)) {
        // Store array ids in the same JSON form the delete condition above
        // matches against; intval() on an array would store 1 and the row
        // could never be found again for deletion.
        $fileData['fid'] = $insert->escape(json_encode($_fid));
    } else {
        $fileData['fid'] = intval($_fid);
    }

    if ($_catid !== false) {
        $fileData['catid'] = intval($_catid);
    }

    $fileData['unchangedtext'] = $insert->escape($text);
    $fileData['plaintext'] = $insert->escape($plaintext);

    // Collect every token that is long enough to count as a keyword.
    $keywords = array();
    foreach (preg_split($this->encodeRegex('~[[:punct:][:space:]]+~ism'), $plaintext) as $keyword) {
        if ($this->significantCharacterCount <= mb_strlen($keyword, 'UTF-8')) {
            $keywords[] = array('search' => $keyword, 'clang' => !isset($fileData['clang']) ? false : $fileData['clang']);
        }
    }
    $this->storeKeywords($keywords, false);

    $fileData['teaser'] = $insert->escape($this->getTeaserText($plaintext));

    $insert->setTable($this->tablePrefix . '587_searchindex');
    $insert->setValues($fileData);
    $insert->insert();

    return A587_FILE_GENERATED;
}
/**
 * Convenience wrapper: converts the given data with a freshly created
 * pdf2txt instance and returns the result of its convert() method.
 *
 * NOTE(review): another caller in this code base invokes this as
 * pdf2txt::directConvert(); if this is a method of that class, consider
 * declaring it static — confirm against the enclosing class.
 *
 * @param mixed $_data data handed unchanged to pdf2txt::convert()
 * @return mixed whatever pdf2txt::convert() returns
 */
function directConvert($_data)
{
    $converter = new pdf2txt();
    $converted = $converter->convert($_data);

    return $converted;
}