/**
 * Toggle the current document's membership of the indexing queue and
 * redirect back to the document view.
 *
 * The queue is only touched when the current user is a system
 * administrator; every caller is redirected regardless. A document that
 * is already scheduled is removed from the queue, otherwise it is
 * queued with the 'A' flag.
 *
 * This method does not return: it ends the request with exit.
 */
function do_main()
{
    $oDocument = $this->oDocument;
    $iDocumentId = $oDocument->getId();

    if (Permission::userIsSystemAdministrator()) {
        $sFullPath = $oDocument->getFullPath();

        if (Indexer::isDocumentScheduled($iDocumentId)) {
            Indexer::unqueueDocument($iDocumentId);
            $sMessage = _kt("Document '%s' has been removed from the indexing queue.");
        } else {
            Indexer::index($oDocument, 'A');
            $sMessage = _kt("Document '%s' has been added to the indexing queue.");
        }

        $this->addInfoMessage(sprintf($sMessage, $sFullPath));
    }

    redirect("view.php?fDocumentId={$iDocumentId}");
    exit;
}
/**
 * Process a single queued document - extract its text and index it.
 * Refactored from indexDocuments().
 *
 * Depending on $docinfo['what'] the document content ('A' or 'C') and/or
 * its discussion ('A' or 'D') is indexed. The document is removed from the
 * queue when indexing succeeds or when it can never be indexed (zero byte
 * file, no extractor, temporary file, missing source file). It is left in
 * the queue when the problem may be transient (disabled extractor, failed
 * intermediate copy, failed extraction or indexing).
 *
 * @param object $document The document object. getFileSize(), getFileName(),
 *                         getName() and the version number getters are called on it.
 * @param array  $docinfo  The queue row for the document. The keys read are
 *                         'document_id', 'filetypes', 'mimetypes', 'extractor' and 'what'.
 * @return void
 */
public function processDocument($document, $docinfo)
{
    global $default;
    static $extractorCache = array();

    // increment indexed documents count
    Indexer::incrementCount();

    // if document is a zero byte file, let's just unqueue and return
    if ($document->getFileSize() == 0) {
        Indexer::unqueueDocument($docinfo['document_id'], sprintf(_kt("Zero Byte documents do not need to be indexed: %d"), $docinfo['document_id']));
        return;
    }

    $docId = $docinfo['document_id'];
    $extension = $docinfo['filetypes'];
    $mimeType = $docinfo['mimetypes'];
    $extractorClass = $docinfo['extractor'];
    $indexDocument = in_array($docinfo['what'], array('A', 'C'));
    $indexDiscussion = in_array($docinfo['what'], array('A', 'D'));
    $this->indexingHistory = '';
    $tempPath = $this->tempPath;

    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension, $mimeType, $extractorClass), 'debug');

    if (empty($extractorClass)) {
        /*
            if no extractor is found and we don't need to index discussions, then we can
            remove the item from the queue.
        */
        if ($indexDiscussion) {
            $indexDocument = false;
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
        } else {
            Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"), $docId));
            return;
        }
    } else {
        /*
            If an extractor is available, we must ensure it is enabled.
        */
        if (!$this->isExtractorEnabled($extractorClass)) {
            // The document stays in the queue while its extractor is disabled.
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
            return;
        }
    }

    if ($this->debug) {
        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"), $docId), 'info');
    }

    // While a batch restart is pending the document is re-queued instead of processed.
    if ($this->restartCurrentBatch) {
        Indexer::unqueueDocument($docId);
        Indexer::index($docId, 'A');
        return;
    }

    $filename = $document->getFileName();
    if (substr($filename, 0, 1) == '~' || substr($filename, -1) == '~') {
        Indexer::unqueueDocument($docId, sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."), $docId), 'error');
        return;
    }

    $removeFromQueue = true;
    if ($indexDocument) {
        if (array_key_exists($extractorClass, $extractorCache)) {
            $extractor = $extractorCache[$extractorClass];
        } else {
            $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
        }

        if (!$extractor instanceof DocumentExtractor) {
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."), $extractorClass), 'error');
            return;
        }

        $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
        $sourceFile = $this->storageManager->temporaryFile($document);

        if (empty($sourceFile) || !is_file($sourceFile)) {
            Indexer::unqueueDocument($docId, sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."), $sourceFile, $docId), 'error');
            // This used to be a 'continue' from the loop in indexDocuments(); in this
            // method there is no enclosing loop, so we must return instead.
            return;
        }

        if ($extractor->needsIntermediateSourceFile()) {
            //$extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);

            $intermediate = $tempPath . '/' . $docId . '.' . $extension;
            $result = @copy($sourceFile, $intermediate);
            if ($result === false) {
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"), $docId), 'error');
                // problem. lets try again later. probably permission related. log the issue.
                // (was 'continue' when this code lived inside the indexDocuments() loop)
                return;
            }
            $sourceFile = $intermediate;
        }

        $extractor->setSourceFile($sourceFile);
        $extractor->setMimeType($mimeType);
        $extractor->setExtension($extension);
        $extractor->setDocument($document);
        $extractor->setIndexingStatus(null);
        $extractor->setExtractionStatus(null);

        $targetFile = tempnam($tempPath, 'ktindexer');
        // Remember the file created by tempnam() so that it can still be removed if
        // the extractor switches to a different target file.
        $initialTargetFile = $targetFile;
        $extractor->setTargetFile($targetFile);

        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"), $docId, $sourceFile, $targetFile), 'debug');

        $this->executeHook($extractor, 'pre_extract');
        $this->executeHook($extractor, 'pre_extract', $mimeType);
        $removeFromQueue = false;

        if ($extractor->extractTextContent()) {
            // the extractor may need to create another target file
            $targetFile = $extractor->getTargetFile();

            $extractor->setExtractionStatus(true);
            $this->executeHook($extractor, 'pre_index');
            $this->executeHook($extractor, 'pre_index', $mimeType);

            $title = $document->getName();
            if (!$this->filterText($targetFile)) {
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"), $docId), 'error');
            } elseif ($indexDiscussion) {
                $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
                $removeFromQueue = $indexStatus;
                if (!$indexStatus) {
                    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"), $docId), 'error');
                }
                $extractor->setIndexingStatus($indexStatus);
            } else {
                $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
                $removeFromQueue = $indexStatus;
                if (!$indexStatus) {
                    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"), $docId), 'error');
                    $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
                }
                $extractor->setIndexingStatus($indexStatus);
            }

            $this->executeHook($extractor, 'post_index', $mimeType);
            $this->executeHook($extractor, 'post_index');
        } else {
            $extractor->setExtractionStatus(false);
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"), $docId), 'error');
            $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
        }

        $this->executeHook($extractor, 'post_extract', $mimeType);
        $this->executeHook($extractor, 'post_extract');

        if ($extractor->needsIntermediateSourceFile()) {
            @unlink($sourceFile);
        }

        @unlink($targetFile);
        if ($initialTargetFile !== false && $initialTargetFile !== $targetFile) {
            // The extractor moved to another target file; do not leave the
            // file created by tempnam() behind.
            @unlink($initialTargetFile);
        }
    } else {
        $indexStatus = $this->indexDiscussion($docId);
        $removeFromQueue = $indexStatus;
    }

    if ($removeFromQueue) {
        Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"), $docId));
    } else {
        if ($this->debug) {
            $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"), $docId));
        }
    }
}
/**
 * Extract the text content of the current document via the parent
 * extractor and post-process the result.
 *
 * Documents without an extension, or with an extension Open Office does
 * not support, are removed from the indexing queue and false is returned.
 * When the parent extraction fails, the extractor output decides what
 * happens next:
 *  - Open Office is not running: the batch is restarted, false is returned.
 *  - the file looks unconvertible: an empty target file is written, the
 *    batch is restarted, the document is unqueued and true is returned.
 *  - anything else: false is returned.
 * On success a non-html target is replaced by an empty file; an html
 * target is filtered and written to "<target>.txt".
 *
 * @return mixed A truthy value on success, a falsy value on failure (the
 *               final write returns the result of file_put_contents()).
 */
public function extractTextContent()
{
    global $default;

    $documentId = $this->document->getId();

    if (empty($this->extension)) {
        $default->log->info("DocumentId: {$documentId} - Document does not have an extension");
        Indexer::unqueueDocument($documentId, sprintf("Removing document from queue: documentId %d", $documentId));
        return false;
    }

    // Open Office does not support the following files
    $unsupportedExtensions = array('xlt');
    if (in_array($this->extension, $unsupportedExtensions)) {
        $default->log->info("DocumentId: {$documentId} - Open Office does not support .xlt.");
        Indexer::unqueueDocument($documentId, sprintf("Removing document from queue - Open Office does not support .xlt: documentId %d", $documentId));
        return false;
    }

    if (false === parent::extractTextContent()) {
        // Open Office is not available: restart the batch and keep the document queued.
        if (strpos($this->output, 'OpenOffice process not found or not listening') !== false) {
            Indexer::get()->restartBatch();
            return false;
        }

        // Messages that suggest Open Office cannot convert this particular file.
        $conversionFailureMarkers = array(
            'Unexpected connection closure',
            '\'NoneType\' object has no attribute \'storeToURL\'',
            'The document could not be opened for conversion. This could indicate an unsupported mimetype.',
            'URL seems to be an unsupported one.',
            '__main__.com.sun.star.task.ErrorCodeIOException',
        );

        $cannotBeConverted = false;
        foreach ($conversionFailureMarkers as $marker) {
            if (strpos($this->output, $marker) !== false) {
                $cannotBeConverted = true;
                break;
            }
        }

        if (!$cannotBeConverted) {
            return false;
        }

        $default->log->info("DocumentId: {$documentId} - Suspect the file cannot be indexed by Open Office.");
        file_put_contents($this->targetfile, '');
        Indexer::get()->restartBatch();
        Indexer::unqueueDocument($documentId, sprintf(_kt("Removing document from queue: documentId %d"), $documentId));
        return true;
    }

    // Only html output is kept; any other target is replaced by an empty file.
    if ($this->targetExtension != 'html') {
        file_put_contents($this->targetfile, '');
        return true;
    }

    $rawContent = file_get_contents($this->targetfile);

    // The filtered text goes into a sibling file with a .txt suffix.
    $this->setTargetFile($this->targetfile . '.txt');
    $filteredContent = $this->filter($rawContent);

    return empty($filteredContent)
        ? touch($this->targetfile)
        : file_put_contents($this->targetfile, $filteredContent);
}
/**
 * Fetch a batch of documents from the processing queue, index them (when
 * indexing is enabled) and run the registered processors on each one.
 *
 * A lock file in the cache directory guards against concurrent runs: the
 * method returns immediately when the lock file exists, creates it before
 * the batch is processed and removes it afterwards.
 *
 * @return void
 */
public function processQueue()
{
    global $default;

    $default->log->debug('documentProcessor: starting');

    // Check for lock file to ensure processor is not currently running
    $cacheDir = $default->cacheDirectory;
    $lockFile = $cacheDir . DIRECTORY_SEPARATOR . 'document_processor.lock';

    if (file_exists($lockFile)) {
        // lock file exists, exit
        $default->log->debug('documentProcessor: stopping, lock file in place ' . $lockFile);
        return;
    }

    if ($default->enableIndexing) {
        // Setup indexing - load extractors, run diagnostics
        if ($this->indexer->preIndexingSetup() === false) {
            $default->log->debug('documentProcessor: stopping - indexer setup failed.');
            return;
        }
    }

    // Get document queue
    $queue = $this->indexer->getDocumentsQueue($this->limit);

    if (empty($queue)) {
        $default->log->debug('documentProcessor: stopping - no documents in processing queue');
        return;
    }

    // indexing starting - create lock file
    touch($lockFile);

    // Process queue
    foreach ($queue as $item) {
        // Get the document object
        // $docId must be set here: the error branch below needs it both to
        // unqueue the document and to build the log message.
        $docId = $item['document_id'];
        $document = Document::get($docId);

        if (PEAR::isError($document)) {
            Indexer::unqueueDocument($docId, sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."), $docId, $document->getMessage()), 'error');
            continue;
        }

        // index document
        if ($default->enableIndexing) {
            $this->indexer->processDocument($document, $item);
        }

        // loop through processors
        if ($this->processors !== false) {
            foreach ($this->processors as $processor) {
                $default->log->debug('documentProcessor: running processor: ' . $processor->getNamespace());

                // Check document mime type against supported types
                if (!$this->isSupportedMimeType($item['mimetypes'], $processor->getSupportedMimeTypes())) {
                    $default->log->debug('documentProcessor: not a supported mimetype: ' . $item['mimetypes']);
                    continue;
                }

                // Process document
                $processor->setDocument($document);
                $processor->processDocument();
            }
        }
    }

    // update the indexer statistics
    $this->indexer->updateIndexStats();

    // Remove lock file to indicate processing has completed
    if (file_exists($lockFile)) {
        @unlink($lockFile);
    }

    $default->log->debug('documentProcessor: stopping');
}
/**
 * Render the indexing queue report and apply the bulk action, if any,
 * requested through the form.
 *
 * Request parameters read:
 *  - rescheduleValue: 'reschedule' / 'remove' act on the documents whose ids
 *    are the keys of index_error; 'rescheduleall' / 'removeall' act on the
 *    whole indexing queue.
 *  - itemsPerPage: page size; defaults to 50 when absent or not a positive integer.
 *  - pageValue: 1-based page number; clamped to the range of existing pages.
 *
 * @return object The template loaded from 'ktcore/search2/reporting/indexerrors'
 *                with its data set.
 */
function do_main()
{
    // Number of items on a page
    $defaultItemsPerPage = 50;
    $itemsPerPage = $defaultItemsPerPage;
    $pageNum = 1;

    if (isset($_REQUEST['itemsPerPage'])) {
        // The value comes from the request: force an integer and reject
        // anything below 1, which would otherwise cause a division by zero.
        $itemsPerPage = (int) $_REQUEST['itemsPerPage'];
        if ($itemsPerPage < 1) {
            $itemsPerPage = $defaultItemsPerPage;
        }
    }

    // registerTypes registers the mime types and populates the needed tables.
    $indexer = Indexer::get();
    $indexer->registerTypes();

    $removalReason = 'Document removed from queue via admin interface. Normally this is because an indexer is not able to process the document.';

    // No action is requested on a plain page view.
    $action = isset($_REQUEST['rescheduleValue']) ? $_REQUEST['rescheduleValue'] : '';

    switch ($action) {
        case 'reschedule':
            foreach (KTUtil::arrayGet($_REQUEST, 'index_error', array()) as $sDocId => $v) {
                Indexer::reindexDocument($sDocId);
            }
            break;

        case 'remove':
            foreach (KTUtil::arrayGet($_REQUEST, 'index_error', array()) as $sDocId => $v) {
                Indexer::unqueueDocument($sDocId, $removalReason);
            }
            break;

        case 'rescheduleall':
            $aIndexerValues = Indexer::getIndexingQueue();
            foreach ($aIndexerValues as $sDocValues) {
                Indexer::reindexDocument($sDocValues['document_id']);
            }
            break;

        case 'removeall':
            $aIndexerValues = Indexer::getIndexingQueue();
            foreach ($aIndexerValues as $sDocValues) {
                Indexer::unqueueDocument($sDocValues['document_id'], $removalReason);
            }
            break;
    }

    $oTemplating =& KTTemplating::getSingleton();
    $oTemplate =& $oTemplating->loadTemplate('ktcore/search2/reporting/indexerrors');

    // Re-read the queue: the action above may have changed it.
    $aIndexerValues = Indexer::getIndexingQueue();

    // Replace the extractor class name with its display name, or 'n/a' when
    // the extractor cannot be resolved. The value is always written back so
    // that the 'n/a' label is not lost.
    foreach ($aIndexerValues as $key => $doc) {
        $extractor = $indexer->getExtractor($doc['extractor']);
        $aIndexerValues[$key]['extractor'] = is_null($extractor) ? 'n/a' : $extractor->getDisplayName();
    }

    // creating page variables and loading the items for the current page
    $aIndexList = array();
    $aPages = array();
    $pages = 0;
    $items = 0;

    if (!empty($aIndexerValues)) {
        $items = count($aIndexerValues);
        $pages = (int) ceil($items / $itemsPerPage);
        $aPages = range(1, $pages);

        if (isset($_REQUEST['pageValue'])) {
            $pageNum = (int) $_REQUEST['pageValue'];
            if ($pageNum > $pages) {
                $pageNum = $pages;
            }
            if ($pageNum < 1) {
                $pageNum = 1;
            }
        }

        // Page N holds the items at offsets (N - 1) * size to N * size - 1, so
        // consecutive pages neither overlap nor exceed the page size.
        $aIndexList = array_slice($aIndexerValues, ($pageNum - 1) * $itemsPerPage, $itemsPerPage);
    }

    $config = KTConfig::getSingleton();
    $rootUrl = $config->get('KnowledgeTree/rootUrl');

    $oTemplate->setData(array(
        'context' => $this,
        'pageList' => $aPages,
        'pageCount' => $pages,
        'pageNum' => $pageNum,
        'itemCount' => $items,
        'itemsPerPage' => $itemsPerPage,
        'indexErrors' => $aIndexList,
        'root_url' => $rootUrl
    ));

    return $oTemplate;
}
/**
 * Fetch the documents in the indexing queue and run the indexer on them.
 *
 * Nothing happens when indexing is disabled. A lock file in the cache
 * directory prevents overlapping runs; a lock file older than 24 hours
 * (48 hours when the batch limit exceeds 1000) is treated as stale and
 * removed so that indexing can resume.
 *
 * @return void
 */
public function processIndexQueue()
{
    global $default;

    $logger = $default->log;

    if (!$default->enableIndexing) {
        $logger->debug('documentProcessor: indexer disabled');
        return;
    }

    $logger->debug('documentProcessor: starting indexer');

    // Check for lock file to ensure processor is not currently running
    $lockFile = $default->cacheDirectory . DIRECTORY_SEPARATOR . 'document_processor.lock';

    if (file_exists($lockFile)) {
        // If something stops the document processor part way through a batch, the
        // lock file stays behind and blocks every later run. As a workaround the
        // modification time of the lock file is checked and the file is removed
        // once it is older than 24 hours, or 48 hours for batches of more than
        // 1000 documents.
        $lockInfo = stat($lockFile);
        $lockedSince = $lockInfo['mtime'];

        $maxAgeHours = 24;
        if ($this->limit > 1000) {
            $maxAgeHours = 48;
            $logger->warn('documentProcessor: batch size of documents to index is set to ' . $this->limit . ', this could cause problems.');
        }

        $staleBefore = time() - $maxAgeHours * 60 * 60;
        $lockIsStale = $staleBefore > $lockedSince;

        if (!$lockIsStale) {
            // lock file exists, exit
            // log a warning if the lock file is older than half an hour
            $halfHourAgo = time() - 30 * 60;
            if ($halfHourAgo > $lockedSince) {
                $logger->warn('documentProcessor: stopping, lock file in place since ' . date('Y-m-d H:i:s', $lockedSince) . ' - ' . $lockFile);
            }
            return;
        }

        $logger->error('documentProcessor: lock file is older than ' . $maxAgeHours . ' hours, deleting it to restart indexing - ' . $lockFile);
        @unlink($lockFile);
    }

    // Setup indexing - load extractors, run diagnostics
    if ($this->indexer->preIndexingSetup() === false) {
        $logger->error('documentProcessor: stopping - indexer setup failed.');
        return;
    }

    // Get document queue
    $pending = $this->indexer->getDocumentsQueue($this->limit);
    if (empty($pending)) {
        $logger->debug('documentProcessor: stopping - no documents in indexing queue');
        return;
    }

    // indexing starting - create lock file
    touch($lockFile);

    // Process queue
    foreach ($pending as $entry) {
        $documentId = $entry['document_id'];
        $document = Document::get($documentId);

        if (!PEAR::isError($document)) {
            // index document
            $this->indexer->processDocument($document, $entry);
            continue;
        }

        // The document cannot be loaded: drop it from the queue.
        Indexer::unqueueDocument($documentId, sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."), $documentId, $document->getMessage()), 'error');
    }

    // update the indexer statistics
    $this->indexer->updateIndexStats();

    // Remove lock file to indicate processing has completed
    if (file_exists($lockFile)) {
        @unlink($lockFile);
    }

    $logger->debug('documentProcessor: stopping indexer, batch completed');
}
/**
 * Fetch a batch of documents from the processing queue, index them (when
 * indexing is enabled) and run the registered processors on each one.
 *
 * @return void
 */
public function processQueue()
{
    global $default;

    $default->log->debug('documentProcessor: starting');

    if ($default->enableIndexing) {
        // Setup indexing - load extractors, run diagnostics
        if ($this->indexer->preIndexingSetup() === false) {
            $default->log->debug('documentProcessor: stopping - indexer setup failed.');
            return;
        }
    }

    // Get document queue
    $queue = $this->indexer->getDocumentsQueue($this->limit);

    if (empty($queue)) {
        $default->log->debug('documentProcessor: stopping - no documents in processing queue');
        return;
    }

    // Process queue
    foreach ($queue as $item) {
        // Get the document object
        // $docId must be set here: the error branch below needs it both to
        // unqueue the document and to build the log message.
        $docId = $item['document_id'];
        $document = Document::get($docId);

        if (PEAR::isError($document)) {
            Indexer::unqueueDocument($docId, sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."), $docId, $document->getMessage()), 'error');
            continue;
        }

        // index document
        if ($default->enableIndexing) {
            $this->indexer->processDocument($document, $item);
        }

        // loop through processors
        if ($this->processors !== false) {
            foreach ($this->processors as $processor) {
                $default->log->debug('documentProcessor: running processor: ' . $processor->getNamespace());

                // Check document mime type against supported types
                if (!$this->isSupportedMimeType($item['mimetypes'], $processor->getSupportedMimeTypes())) {
                    $default->log->debug('documentProcessor: not a supported mimetype: ' . $item['mimetypes']);
                    continue;
                }

                // Process document
                $processor->setDocument($document);
                $processor->processDocument();
            }
        }
    }

    // update the indexer statistics
    $this->indexer->updateIndexStats();

    $default->log->debug('documentProcessor: stopping');
}