示例#1
0
 /**
  * Loads the linked documents for every entry of $data_org that is not yet stored.
  *
  * For each entry that check_ob_dokument_vorhanden() does not report as already
  * present, the HTML, PDF and RTF variants and the meta page are fetched from
  * the RIS server and attached to the entry under the 'id_data' key.
  * Processing stops as soon as $this->count has reached MAX_DOK.
  *
  * @access private
  * @param array $data_org Entries to process; each entry is an array with an 'id' key and
  *                        optional 'id_link_html', 'id_link_pdf', 'id_link_rtf' and 'meta_link' keys.
  * @param mixed $db       Passed through unchanged to check_ob_dokument_vorhanden().
  * @return array The processed entries under their original keys, each extended by
  *               'id_data'; an empty array when no entry was processed.
  */
 private function get_complete_data1($data_org, $db)
 {
     // The former "= array()" default on $data_org was dropped: a default in
     // front of the required $db could never be used and is deprecated in PHP 8.
     $base_url = 'http://www2.bonn.de/bo_ris/';
     // Always return an array, even if no document gets processed.
     $data = array();
     foreach ($data_org as $key => $value) {
         // Limit reached: no later iteration can do any work, so leave the loop.
         if ($this->count >= MAX_DOK) {
             break;
         }
         // Skip documents that are already in the DB (or when the flag is set);
         // only unknown documents are scanned and loaded.
         if ($this->check_ob_dokument_vorhanden($value['id'], $db) == true or $this->noch_nicht_vorhanden == true) {
             continue;
         }
         // NOTE(review): because of the guard at the top of the loop this can only
         // become true if check_ob_dokument_vorhanden() itself changes $this->count.
         // Kept unchanged until the intended behaviour is confirmed.
         if ($this->count > MAX_DOK) {
             $this->noch_nicht_vorhanden = true;
         }
         $this->count = $this->count + 1;

         $data[$key] = $value;
         // Start with a fresh record so that nothing leaks over from the
         // previously processed document.
         $row = array();

         // The document itself, depending on which variants are linked.
         if (!empty($value['id_link_html'])) {
             // Case 1: HTML
             $dok = class_methods::get_site($base_url . $value['id_link_html']);
             $row['html'] = $dok;
             $row['html_text'] = class_methods::get_clean_text($dok);
             $row['html_meta'] = $this->get_extra_infos_dokument($dok);
         }
         if (!empty($value['id_link_pdf'])) {
             $dok = class_methods::get_site($base_url . $value['id_link_pdf']);
             $row['pdf'] = $dok;
             // Extract the PDF text where possible (will not work for scanned images etc.).
             $convert = new pdf2txt();
             $result = $convert->convert($dok);
             // NOTE(review): the key is spelled 'pfd_text' (sic); it is kept
             // because other code may read it under this name.
             $row['pfd_text'] = class_methods::get_clean_text($result);
         }
         if (!empty($value['id_link_rtf'])) {
             $dok = class_methods::get_site($base_url . $value['id_link_rtf']);
             $row['sonstiges_dokument'] = $dok;
             $row['sonstiges_text'] = class_methods::get_clean_text($dok);
         }
         // Then the meta information that belongs to the document.
         if (!empty($value['meta_link'])) {
             $dok = class_methods::get_site($base_url . 'ris_sql/' . $value['meta_link']);
             $row['meta_data'] = $dok;
             $row['meta_data_text'] = class_methods::get_clean_text($dok);
             $row['meta_data_extra'] = $this->get_extra_infos($dok);
         }
         // Hand the collected record over to the result array.
         $data[$key]['id_data'] = $row;
     }
     return $data;
 }
示例#2
0
 /**
  * Indexes a certain file.
  * Returns A587_FILE_GENERATED or an error code.
  *
  * Existing search-index rows for the file (and their cache entries) are
  * removed first. Then the file content is read (PDFs through the pdftotext
  * binary with pdf2txt as fallback), converted to UTF-8 where possible and
  * inserted into the search index together with its plaintext, teaser and
  * keywords.
  *
  * @param string $_filename    File name, resolved relative to $this->documentRoot.
  * @param mixed  $_doPlaintext If truthy, getPlaintext() is applied to file types that
  *                             have no plaintext of their own; otherwise the raw text is used.
  * @param mixed  $_clang       Language id, or false to leave it unset.
  * @param mixed  $_fid         File id (int), an array (stored JSON-encoded), or false
  *                             to use getMinFID().
  * @param mixed  $_catid       Category id, or false to leave it unset.
  *
  * @return mixed A587_FILE_GENERATED on success, otherwise one of the A587_FILE_* error codes.
  */
 function indexFile($_filename, $_doPlaintext = false, $_clang = false, $_fid = false, $_catid = false)
 {
     // extract file-extension (for a name without a dot this is the whole name)
     $filenameArray = explode('.', $_filename);
     $fileext = $filenameArray[count($filenameArray) - 1];
     // check file-extension
     if (!in_array($fileext, $this->fileExtensions) && !empty($this->fileExtensions) && !$this->indexUnknownFileExtensions && !$this->indexMissingFileExtensions) {
         return A587_FILE_FORBIDDEN_EXTENSION;
     }

     $filePath = $this->documentRoot . '/' . $_filename;
     $indexTable = $this->tablePrefix . '587_searchindex';

     // build the condition that identifies the old index rows of this file
     $delete = new rex_sql();
     $where = sprintf(" `filename` = '%s' AND `texttype` = 'file'", $delete->escape($_filename));
     if (is_int($_clang)) {
         $where .= sprintf(' AND clang = %d', $_clang);
     }
     if (is_int($_fid)) {
         $where .= sprintf(' AND fid = %d', $_fid);
     } elseif (is_array($_fid)) {
         $where .= sprintf(" AND fid = '%s'", $delete->escape(json_encode($_fid)));
     }
     if (is_int($_catid)) {
         $where .= sprintf(' AND catid = %d', $_catid);
     }

     // delete from cache
     $select = new rex_sql();
     $select->setTable($indexTable);
     $select->setWhere($where);
     $indexIds = array();
     if ($select->select('id')) {
         foreach ($select->getArray() as $result) {
             $indexIds[] = $result['id'];
         }
         $this->deleteCache($indexIds);
     }

     // delete old data
     $delete->setTable($indexTable);
     $delete->setWhere($where);
     $delete->delete();

     // index file
     $text = '';
     $plaintext = '';
     switch ($fileext) {
         // pdf-files
         case 'pdf':
             $xpdf = false;
             $error = false;
             // try XPDF (pdftotext) first
             if (function_exists('exec')) {
                 $tempFile = tempnam($this->generatedPath . '/files/', 'rexsearch');
                 // without a temp file pdftotext has nowhere to write to; use the fallback
                 if ($tempFile !== false) {
                     $dummy = array();
                     $return = 0;
                     $encoding = 'UTF-8';
                     exec('pdftotext ' . escapeshellarg($filePath) . ' ' . escapeshellarg($tempFile) . ' -enc ' . $encoding, $dummy, $return);
                     if ($return > 0) {
                         // map the exit codes handled here to error codes; any other
                         // non-zero code leaves $error untouched
                         $xpdfErrors = array(
                             1 => A587_FILE_XPDFERR_OPENSRC,
                             2 => A587_FILE_XPDFERR_OPENDEST,
                             3 => A587_FILE_XPDFERR_PERM,
                             99 => A587_FILE_XPDFERR_OTHER,
                         );
                         if (isset($xpdfErrors[$return])) {
                             $error = $xpdfErrors[$return];
                         }
                     } else {
                         if (false === ($text = @file_get_contents($tempFile))) {
                             $error = A587_FILE_NOEXIST;
                         } else {
                             $xpdf = true;
                         }
                     }
                     if (is_file($tempFile)) {
                         unlink($tempFile);
                     }
                 }
             }
             if (!$xpdf) {
                 // if xpdf was not usable or returned an error, try pdf2txt via php
                 if (false === ($pdfContent = @file_get_contents($filePath))) {
                     $error = A587_FILE_NOEXIST;
                 } else {
                     require_once 'class.pdf2txt.inc.php';
                     $text = pdf2txt::directConvert($pdfContent);
                     // the fallback succeeded, so an earlier xpdf error no longer matters
                     $error = false;
                 }
             }
             if ($error !== false) {
                 return $error;
             } elseif (trim($text) == '') {
                 return A587_FILE_EMPTY;
             }
             $plaintext = $this->getPlaintext($text);
             break;
         // html- or php-file
         case 'htm':
         case 'html':
         case 'php':
             if (false === ($text = @file_get_contents($filePath))) {
                 return A587_FILE_NOEXIST;
             }
             $plaintext = $this->getPlaintext($text);
             // stop here: falling through to "default" would only read the same file again
             break;
         // other filetype
         default:
             if (false === ($text = @file_get_contents($filePath))) {
                 return A587_FILE_NOEXIST;
             }
     }

     // convert to UTF-8; keep the text as it is when the encoding cannot be
     // detected or converted instead of replacing it with false
     $detectedEncoding = mb_detect_encoding($text);
     if ($detectedEncoding !== false) {
         $converted = @iconv($detectedEncoding, 'UTF-8', $text);
         if ($converted !== false) {
             $text = $converted;
         }
     }

     // Plaintext
     if (empty($plaintext)) {
         if ($_doPlaintext) {
             $plaintext = $this->getPlaintext($text);
         } else {
             $plaintext = $text;
         }
     }

     // index file-content
     $insert = new rex_sql();
     $fileData = array();
     $fileData['texttype'] = 'file';
     if ($_fid !== false) {
         $fileData['ftable'] = $this->tablePrefix . 'file';
     }
     $fileData['filename'] = $insert->escape($_filename);
     $fileData['fileext'] = $insert->escape($fileext);
     if ($_clang !== false) {
         $fileData['clang'] = intval($_clang);
     }
     if (is_array($_fid)) {
         // store the same JSON form the delete condition above looks for, so that
         // re-indexing finds and removes this row again
         $fileData['fid'] = $insert->escape(json_encode($_fid));
     } elseif ($_fid !== false) {
         $fileData['fid'] = intval($_fid);
     } else {
         $fileData['fid'] = $this->getMinFID();
     }
     if ($_catid !== false) {
         $fileData['catid'] = intval($_catid);
     }
     $fileData['unchangedtext'] = $insert->escape($text);
     $fileData['plaintext'] = $insert->escape($plaintext);

     // collect every word that is long enough as a keyword
     $keywordClang = isset($fileData['clang']) ? $fileData['clang'] : false;
     $words = preg_split($this->encodeRegex('~[[:punct:][:space:]]+~ism'), $plaintext);
     if ($words === false) {
         // the split failed; index the file without keywords
         $words = array();
     }
     $keywords = array();
     foreach ($words as $keyword) {
         if ($this->significantCharacterCount <= mb_strlen($keyword, 'UTF-8')) {
             $keywords[] = array('search' => $keyword, 'clang' => $keywordClang);
         }
     }
     $this->storeKeywords($keywords, false);

     $fileData['teaser'] = $insert->escape($this->getTeaserText($plaintext));
     $insert->setTable($indexTable);
     $insert->setValues($fileData);
     $insert->insert();
     return A587_FILE_GENERATED;
 }
示例#3
0
 /**
  * Converts the given data with a freshly created pdf2txt instance.
  *
  * Declared static so that it can be invoked without an instance, e.g.
  * pdf2txt::directConvert($data). Calling a non-static method that way is
  * an Error as of PHP 8. Calls on an instance keep working.
  *
  * @param mixed $_data Data handed over unchanged to convert(); presumably the
  *                     raw content of a PDF file - confirm against convert().
  * @return mixed Whatever convert() returns for $_data.
  */
 public static function directConvert($_data)
 {
     $converter = new pdf2txt();
     return $converter->convert($_data);
 }