/**
 * Toggle the indexing-queue state of the current document, then redirect
 * back to the document view.
 *
 * Only system administrators trigger the toggle; every caller is redirected
 * to view.php for the document and the script exits afterwards.
 */
function do_main()
 {
     $document = $this->oDocument;
     $documentId = $document->getId();
     if (Permission::userIsSystemAdministrator()) {
         $path = $document->getFullPath();
         $alreadyQueued = Indexer::isDocumentScheduled($documentId);
         if ($alreadyQueued) {
             // Already scheduled: take it off the queue.
             Indexer::unqueueDocument($documentId);
             $template = _kt("Document '%s' has been removed from the indexing queue.");
         } else {
             // Not scheduled: queue it with the 'A' flag.
             Indexer::index($document, 'A');
             $template = _kt("Document '%s' has been added to the indexing queue.");
         }
         $this->addInfoMessage(sprintf($template, $path));
     }
     redirect("view.php?fDocumentId={$documentId}");
     exit;
 }
Esempio n. 2
0
 /**
  * Process a document - extract text and index it.
  * Refactored from indexDocuments()
  *
  * The document is removed from the queue when it is a zero byte file, has
  * no extractor (and no discussion to index), looks like a temporary file,
  * its source file is missing, or indexing succeeded. It stays queued when
  * extraction, filtering or indexing fails so that it can be retried.
  *
  * @param object $document Document being indexed; this method calls getFileSize(),
  *                         getFileName(), getName(), getMajorVersionNumber() and
  *                         getMinorVersionNumber() on it.
  * @param array $docinfo Queue row. Keys read here: 'document_id', 'filetypes',
  *                       'mimetypes', 'extractor' and 'what' ('A' = content and
  *                       discussion, 'C' = content only, 'D' = discussion only).
  * @return void
  */
 public function processDocument($document, $docinfo)
 {
     global $default;
     // Extractor instances are cached across calls, keyed by class name.
     static $extractorCache = array();
     // increment indexed documents count
     Indexer::incrementCount();
     // if document is a zero byte file, let's just unqueue and return
     if ($document->getFileSize() == 0) {
         Indexer::unqueueDocument($docinfo['document_id'], sprintf(_kt("Zero Byte documents do not need to be indexed: %d"), $docinfo['document_id']));
         return;
     }
     $docId = $docinfo['document_id'];
     $extension = $docinfo['filetypes'];
     $mimeType = $docinfo['mimetypes'];
     $extractorClass = $docinfo['extractor'];
     $indexDocument = in_array($docinfo['what'], array('A', 'C'));
     $indexDiscussion = in_array($docinfo['what'], array('A', 'D'));
     $this->indexingHistory = '';
     $tempPath = $this->tempPath;
     $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension, $mimeType, $extractorClass), 'debug');
     if (empty($extractorClass)) {
         /*
         if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.
         */
         if ($indexDiscussion) {
             $indexDocument = false;
             $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
         } else {
             Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"), $docId));
             return;
         }
     } else {
         /*
         If an extractor is available, we must ensure it is enabled.
         */
         if (!$this->isExtractorEnabled($extractorClass)) {
             $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
             return;
         }
     }
     if ($this->debug) {
         $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"), $docId), 'info');
     }
     // When a batch restart was requested, re-queue the document instead of processing it now.
     if ($this->restartCurrentBatch) {
         Indexer::unqueueDocument($docId);
         Indexer::index($docId, 'A');
         return;
     }
     $filename = $document->getFileName();
     if (substr($filename, 0, 1) == '~' || substr($filename, -1) == '~') {
         Indexer::unqueueDocument($docId, sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."), $docId), 'error');
         return;
     }
     $removeFromQueue = true;
     if ($indexDocument) {
         if (array_key_exists($extractorClass, $extractorCache)) {
             $extractor = $extractorCache[$extractorClass];
         } else {
             $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
         }
         if (!$extractor instanceof DocumentExtractor) {
             $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."), $extractorClass), 'error');
             return;
         }
         $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
         $sourceFile = $this->storageManager->temporaryFile($document);
         if (empty($sourceFile) || !is_file($sourceFile)) {
             Indexer::unqueueDocument($docId, sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."), $sourceFile, $docId), 'error');
             // This code used to live inside a loop; outside one, "continue" is a fatal error.
             return;
         }
         if ($extractor->needsIntermediateSourceFile()) {
             //$extension =  pathinfo($document->getFileName(), PATHINFO_EXTENSION);
             $intermediate = $tempPath . '/' . $docId . '.' . $extension;
             $result = @copy($sourceFile, $intermediate);
             if ($result === false) {
                 $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"), $docId), 'error');
                 // problem. lets try again later. probably permission related. log the issue.
                 // The document is deliberately left in the queue.
                 return;
             }
             $sourceFile = $intermediate;
         }
         $extractor->setSourceFile($sourceFile);
         $extractor->setMimeType($mimeType);
         $extractor->setExtension($extension);
         $extractor->setDocument($document);
         $extractor->setIndexingStatus(null);
         $extractor->setExtractionStatus(null);
         // Remember the scratch file created here: the extractor may switch to a
         // different target file, and the original must still be cleaned up.
         $scratchFile = tempnam($tempPath, 'ktindexer');
         $targetFile = $scratchFile;
         $extractor->setTargetFile($targetFile);
         $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"), $docId, $sourceFile, $targetFile), 'debug');
         $this->executeHook($extractor, 'pre_extract');
         $this->executeHook($extractor, 'pre_extract', $mimeType);
         // From here on the document only leaves the queue if indexing succeeds.
         $removeFromQueue = false;
         if ($extractor->extractTextContent()) {
             // the extractor may need to create another target file
             $targetFile = $extractor->getTargetFile();
             $extractor->setExtractionStatus(true);
             $this->executeHook($extractor, 'pre_index');
             $this->executeHook($extractor, 'pre_index', $mimeType);
             $title = $document->getName();
             if ($indexDiscussion) {
                 if (!$this->filterText($targetFile)) {
                     $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"), $docId), 'error');
                 } else {
                     $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
                     $removeFromQueue = $indexStatus;
                     if (!$indexStatus) {
                         $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"), $docId), 'error');
                     }
                     $extractor->setIndexingStatus($indexStatus);
                 }
             } else {
                 if (!$this->filterText($targetFile)) {
                     $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"), $docId), 'error');
                 } else {
                     $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
                     $removeFromQueue = $indexStatus;
                     if (!$indexStatus) {
                         $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"), $docId), 'error');
                         $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
                     }
                     $extractor->setIndexingStatus($indexStatus);
                 }
             }
             $this->executeHook($extractor, 'post_index', $mimeType);
             $this->executeHook($extractor, 'post_index');
         } else {
             $extractor->setExtractionStatus(false);
             $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"), $docId), 'error');
             $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
         }
         $this->executeHook($extractor, 'post_extract', $mimeType);
         $this->executeHook($extractor, 'post_extract');
         if ($extractor->needsIntermediateSourceFile()) {
             @unlink($sourceFile);
         }
         @unlink($targetFile);
         // Also remove the tempnam() file if the extractor wrote to a different target.
         if (is_string($scratchFile) && $scratchFile !== $targetFile) {
             @unlink($scratchFile);
         }
     } else {
         // Discussion-only indexing: no extractor involved.
         $indexStatus = $this->indexDiscussion($docId);
         $removeFromQueue = $indexStatus;
     }
     if ($removeFromQueue) {
         Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"), $docId));
     } else {
         if ($this->debug) {
             $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"), $docId));
         }
     }
 }
 /**
  * Extract the text content of the current document via the parent extractor,
  * handling the Open Office specific failure cases.
  *
  * Documents without an extension and .xlt files are unqueued and skipped.
  * When the parent extraction fails, the extractor output is inspected: a
  * missing Open Office process restarts the batch; a set of known conversion
  * errors writes an empty target file, restarts the batch and unqueues the
  * document. On success, a non-html target is blanked; an html target is
  * filtered and written to a '.txt' target file.
  *
  * @return mixed false on failure, true for the handled cases, otherwise the
  *               result of touch() / file_put_contents() on the target file.
  */
 public function extractTextContent()
 {
     global $default;
     $documentId = $this->document->getId();
     if (empty($this->extension)) {
         $default->log->info("DocumentId: {$documentId} - Document does not have an extension");
         Indexer::unqueueDocument($documentId, sprintf("Removing document from queue: documentId %d", $documentId));
         return false;
     }
     // Open Office does not support the following files
     $unsupportedExtensions = array('xlt');
     if (in_array($this->extension, $unsupportedExtensions)) {
         $default->log->info("DocumentId: {$documentId} - Open Office does not support .xlt.");
         Indexer::unqueueDocument($documentId, sprintf("Removing document from queue - Open Office does not support .xlt: documentId %d", $documentId));
         return false;
     }
     if (parent::extractTextContent() === false) {
         // Open Office is not reachable: restart the batch and report failure.
         if (strpos($this->output, 'OpenOffice process not found or not listening') !== false) {
             Indexer::get()->restartBatch();
             return false;
         }
         // Output fragments that suggest Open Office cannot convert this file at all.
         $conversionErrors = array(
             'Unexpected connection closure',
             '\'NoneType\' object has no attribute \'storeToURL\'',
             'The document could not be opened for conversion. This could indicate an unsupported mimetype.',
             'URL seems to be an unsupported one.',
             '__main__.com.sun.star.task.ErrorCodeIOException',
         );
         $cannotConvert = false;
         foreach ($conversionErrors as $fragment) {
             if (strpos($this->output, $fragment) !== false) {
                 $cannotConvert = true;
                 break;
             }
         }
         if (!$cannotConvert) {
             return false;
         }
         $default->log->info("DocumentId: {$documentId} - Suspect the file cannot be indexed by Open Office.");
         file_put_contents($this->targetfile, '');
         Indexer::get()->restartBatch();
         Indexer::unqueueDocument($documentId, sprintf(_kt("Removing document from queue: documentId %d"), $documentId));
         return true;
     }
     if ($this->targetExtension != 'html') {
         // Non-html targets are replaced with an empty file.
         file_put_contents($this->targetfile, '');
         return true;
     }
     $html = file_get_contents($this->targetfile);
     // Switch the target to a '.txt' sibling before writing the filtered text.
     $this->setTargetFile($this->targetfile . '.txt');
     $text = $this->filter($html);
     return empty($text)
         ? touch($this->targetfile)
         : file_put_contents($this->targetfile, $text);
 }
 /**
  * Process the document queue: index each document (when indexing is enabled)
  * and run every registered processor whose supported mime types match.
  *
  * A lock file in the cache directory guards against overlapping runs; it is
  * created once a non-empty queue is fetched and removed after the batch.
  *
  * @return void
  */
 public function processQueue()
 {
     global $default;
     $default->log->debug('documentProcessor: starting');
     // Check for lock file to ensure processor is not currently running
     $cacheDir = $default->cacheDirectory;
     $lockFile = $cacheDir . DIRECTORY_SEPARATOR . 'document_processor.lock';
     if (file_exists($lockFile)) {
         // lock file exists, exit
         $default->log->debug('documentProcessor: stopping, lock file in place ' . $lockFile);
         return;
     }
     if ($default->enableIndexing) {
         // Setup indexing - load extractors, run diagnostics
         if ($this->indexer->preIndexingSetup() === false) {
             $default->log->debug('documentProcessor: stopping - indexer setup failed.');
             return;
         }
     }
     // Get document queue
     $queue = $this->indexer->getDocumentsQueue($this->limit);
     if (empty($queue)) {
         $default->log->debug('documentProcessor: stopping - no documents in processing queue');
         return;
     }
     // indexing starting - create lock file
     touch($lockFile);
     // Process queue
     foreach ($queue as $item) {
         // Get the document object
         // $docId must be set here: the error branch below uses it both to
         // unqueue the entry and to build the log message.
         $docId = $item['document_id'];
         $document = Document::get($docId);
         if (PEAR::isError($document)) {
             Indexer::unqueueDocument($docId, sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."), $docId, $document->getMessage()), 'error');
             continue;
         }
         // index document
         if ($default->enableIndexing) {
             $this->indexer->processDocument($document, $item);
         }
         // loop through processors
         if ($this->processors !== false) {
             foreach ($this->processors as $processor) {
                 $default->log->debug('documentProcessor: running processor: ' . $processor->getNamespace());
                 // Check document mime type against supported types
                 if (!$this->isSupportedMimeType($item['mimetypes'], $processor->getSupportedMimeTypes())) {
                     $default->log->debug('documentProcessor: not a supported mimetype: ' . $item['mimetypes']);
                     continue;
                 }
                 // Process document
                 $processor->setDocument($document);
                 $processor->processDocument();
             }
         }
     }
     // update the indexer statistics
     $this->indexer->updateIndexStats();
     // Remove lock file to indicate processing has completed
     if (file_exists($lockFile)) {
         @unlink($lockFile);
     }
     $default->log->debug('documentProcessor: stopping');
 }
Esempio n. 5
0
 /**
  * Render the indexing-queue report page, applying an optional bulk action first.
  *
  * Request parameters read:
  *  - itemsPerPage:    page size; values below 1 fall back to the default of 50.
  *  - rescheduleValue: 'reschedule' / 'remove' act on the ids in index_error,
  *                     'rescheduleall' / 'removeall' act on the whole queue.
  *  - index_error:     array keyed by document id, used by reschedule / remove.
  *  - pageValue:       1-based page number, clamped to the available pages.
  *
  * @return mixed the loaded template with its data set.
  */
 function do_main()
 {
     //Number of items on a page
     $defaultItemsPerPage = 50;
     $itemsPerPage = $defaultItemsPerPage;
     if (isset($_REQUEST['itemsPerPage'])) {
         $itemsPerPage = (int) $_REQUEST['itemsPerPage'];
         // A page size below 1 would divide by zero further down.
         if ($itemsPerPage < 1) {
             $itemsPerPage = $defaultItemsPerPage;
         }
     }
     // Defaults so the template gets defined values even when the queue is empty.
     $pageNum = 1;
     $pages = 0;
     $items = 0;
     $aPages = array();
     $aIndexList = array();
     //registerTypes registers the mime types and populates the needed tables.
     $indexer = Indexer::get();
     $indexer->registerTypes();
     $removalReason = 'Document removed from queue via admin interface. Normally this is because an indexer is not able to process the document.';
     $action = isset($_REQUEST['rescheduleValue']) ? $_REQUEST['rescheduleValue'] : '';
     switch ($action) {
         case 'reschedule':
             foreach (KTUtil::arrayGet($_REQUEST, 'index_error', array()) as $sDocId => $v) {
                 Indexer::reindexDocument($sDocId);
             }
             break;
         case 'remove':
             foreach (KTUtil::arrayGet($_REQUEST, 'index_error', array()) as $sDocId => $v) {
                 Indexer::unqueueDocument($sDocId, $removalReason);
             }
             break;
         case 'rescheduleall':
             $aIndexerValues = Indexer::getIndexingQueue();
             foreach ($aIndexerValues as $sDocValues) {
                 Indexer::reindexDocument($sDocValues['document_id']);
             }
             break;
         case 'removeall':
             $aIndexerValues = Indexer::getIndexingQueue();
             foreach ($aIndexerValues as $sDocValues) {
                 Indexer::unqueueDocument($sDocValues['document_id'], $removalReason);
             }
             break;
     }
     $oTemplating =& KTTemplating::getSingleton();
     $oTemplate =& $oTemplating->loadTemplate('ktcore/search2/reporting/indexerrors');
     // Re-read the queue so the report reflects any action applied above.
     $aIndexerValues = Indexer::getIndexingQueue();
     foreach ($aIndexerValues as $key => $doc) {
         // Replace the extractor class name with a display name, or 'n/a' when
         // the extractor cannot be resolved. The row is written back in both cases.
         $extractor = $indexer->getExtractor($doc['extractor']);
         $doc['extractor'] = is_null($extractor) ? 'n/a' : $extractor->getDisplayName();
         $aIndexerValues[$key] = $doc;
     }
     //creating page variables and loading the items for the current page
     if (!empty($aIndexerValues)) {
         $items = count($aIndexerValues);
         $pages = (int) ceil($items / $itemsPerPage);
         $aPages = range(1, $pages);
         if (isset($_REQUEST['pageValue'])) {
             $pageNum = (int) $_REQUEST['pageValue'];
             if ($pageNum > $pages) {
                 $pageNum = $pages;
             }
             if ($pageNum < 1) {
                 $pageNum = 1;
             }
         }
         // Page N covers offsets (N-1)*size .. N*size-1; array_slice stops at the end of the list.
         $aIndexList = array_slice($aIndexerValues, ($pageNum - 1) * $itemsPerPage, $itemsPerPage);
     }
     $config = KTConfig::getSingleton();
     $rootUrl = $config->get('KnowledgeTree/rootUrl');
     $oTemplate->setData(array('context' => $this, 'pageList' => $aPages, 'pageCount' => $pages, 'pageNum' => $pageNum, 'itemCount' => $items, 'itemsPerPage' => $itemsPerPage, 'indexErrors' => $aIndexList, 'root_url' => $rootUrl));
     return $oTemplate;
 }
 /**
  * Fetch the documents in the indexing queue and start the indexer.
  *
  * Does nothing when indexing is disabled. A lock file in the cache directory
  * prevents overlapping runs; a lock older than 24 hours (48 hours when the
  * batch limit exceeds 1000) is treated as stale and removed. Documents that
  * cannot be resolved are unqueued; all others go to the indexer.
  *
  * @return void
  */
 public function processIndexQueue()
 {
     global $default;
     if (!$default->enableIndexing) {
         $default->log->debug('documentProcessor: indexer disabled');
         return;
     }
     $logger = $default->log;
     $logger->debug('documentProcessor: starting indexer');
     // Check for lock file to ensure processor is not currently running
     $lockFile = $default->cacheDirectory . DIRECTORY_SEPARATOR . 'document_processor.lock';
     if (file_exists($lockFile)) {
         // A run that died part way through leaves its lock file behind, which
         // would block every later run. As a workaround, a lock older than
         // 24 hours (48 hours for batches above 1000 documents) is removed.
         $lockInfo = stat($lockFile);
         $lockedSince = $lockInfo['mtime'];
         $maxAgeHours = 24;
         if ($this->limit > 1000) {
             $maxAgeHours = 48;
             $logger->warn('documentProcessor: batch size of documents to index is set to ' . $this->limit . ', this could cause problems.');
         }
         $staleBefore = time() - $maxAgeHours * 60 * 60;
         if ($staleBefore <= $lockedSince) {
             // Lock is still considered live: exit, warning once it is older than half an hour.
             $warnBefore = time() - 30 * 60;
             if ($warnBefore > $lockedSince) {
                 $logger->warn('documentProcessor: stopping, lock file in place since ' . date('Y-m-d H:i:s', $lockedSince) . ' - ' . $lockFile);
             }
             return;
         }
         $logger->error('documentProcessor: lock file is older than ' . $maxAgeHours . ' hours, deleting it to restart indexing - ' . $lockFile);
         @unlink($lockFile);
     }
     // Setup indexing - load extractors, run diagnostics
     if ($this->indexer->preIndexingSetup() === false) {
         $logger->error('documentProcessor: stopping - indexer setup failed.');
         return;
     }
     // Get document queue
     $queue = $this->indexer->getDocumentsQueue($this->limit);
     if (empty($queue)) {
         $logger->debug('documentProcessor: stopping - no documents in indexing queue');
         return;
     }
     // indexing starting - create lock file
     touch($lockFile);
     // Process queue
     foreach ($queue as $entry) {
         $documentId = $entry['document_id'];
         $document = Document::get($documentId);
         if (!PEAR::isError($document)) {
             // index document
             $this->indexer->processDocument($document, $entry);
             continue;
         }
         // The document could not be loaded: drop it from the queue.
         Indexer::unqueueDocument($documentId, sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."), $documentId, $document->getMessage()), 'error');
     }
     // update the indexer statistics
     $this->indexer->updateIndexStats();
     // Remove lock file to indicate processing has completed
     if (file_exists($lockFile)) {
         @unlink($lockFile);
     }
     $logger->debug('documentProcessor: stopping indexer, batch completed');
 }
 /**
  * Process the document queue: index each document (when indexing is enabled)
  * and run every registered processor whose supported mime types match.
  *
  * @return void
  */
 public function processQueue()
 {
     global $default;
     $default->log->debug('documentProcessor: starting');
     if ($default->enableIndexing) {
         // Setup indexing - load extractors, run diagnostics
         if ($this->indexer->preIndexingSetup() === false) {
             $default->log->debug('documentProcessor: stopping - indexer setup failed.');
             return;
         }
     }
     // Get document queue
     $queue = $this->indexer->getDocumentsQueue($this->limit);
     if (empty($queue)) {
         $default->log->debug('documentProcessor: stopping - no documents in processing queue');
         return;
     }
     // Process queue
     foreach ($queue as $item) {
         // Get the document object
         // $docId must be set here: the error branch below uses it both to
         // unqueue the entry and to build the log message.
         $docId = $item['document_id'];
         $document = Document::get($docId);
         if (PEAR::isError($document)) {
             Indexer::unqueueDocument($docId, sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."), $docId, $document->getMessage()), 'error');
             continue;
         }
         // index document
         if ($default->enableIndexing) {
             $this->indexer->processDocument($document, $item);
         }
         // loop through processors
         if ($this->processors !== false) {
             foreach ($this->processors as $processor) {
                 $default->log->debug('documentProcessor: running processor: ' . $processor->getNamespace());
                 // Check document mime type against supported types
                 if (!$this->isSupportedMimeType($item['mimetypes'], $processor->getSupportedMimeTypes())) {
                     $default->log->debug('documentProcessor: not a supported mimetype: ' . $item['mimetypes']);
                     continue;
                 }
                 // Process document
                 $processor->setDocument($document);
                 $processor->processDocument();
             }
         }
     }
     // update the indexer statistics
     $this->indexer->updateIndexStats();
     $default->log->debug('documentProcessor: stopping');
 }