Ejemplo n.º 1
0
 /**
  * Process a single queued document: extract its text and index it (and/or
  * its discussion), then remove it from the indexing queue on success.
  * Refactored from indexDocuments().
  *
  * Keys read from $docinfo:
  *  - 'document_id': id of the document being processed
  *  - 'filetypes':   file extension; used for the intermediate file name and handed to the extractor
  *  - 'mimetypes':   mime type; handed to the extractor and to the mime-specific hooks
  *  - 'extractor':   name of the extractor class; may be empty
  *  - 'what':        'A' indexes content and discussion, 'C' content only, 'D' discussion only
  *
  * The document stays in the queue (no unqueue call) when the extractor is
  * disabled, is not a DocumentExtractor, the intermediate copy fails, or
  * extraction/filtering/indexing reports a failure.
  *
  * @param object $document Document being indexed; must provide getFileSize(), getFileName(),
  *                         getName(), getMajorVersionNumber() and getMinorVersionNumber()
  * @param array  $docinfo  Queue row describing the document (see keys above)
  * @return void
  */
 public function processDocument($document, $docinfo)
 {
     global $default;
     // Extractor instances are cached per class name for the lifetime of the request.
     static $extractorCache = array();

     // increment indexed documents count
     Indexer::incrementCount();

     $docId = $docinfo['document_id'];

     // if document is a zero byte file, let's just unqueue and return
     if ($document->getFileSize() == 0) {
         Indexer::unqueueDocument($docId, sprintf(_kt("Zero Byte documents do not need to be indexed: %d"), $docId));
         return;
     }

     $extension = $docinfo['filetypes'];
     $mimeType = $docinfo['mimetypes'];
     $extractorClass = $docinfo['extractor'];
     $indexDocument = in_array($docinfo['what'], array('A', 'C'));
     $indexDiscussion = in_array($docinfo['what'], array('A', 'D'));
     $this->indexingHistory = '';
     $tempPath = $this->tempPath;

     $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension, $mimeType, $extractorClass), 'debug');

     if (empty($extractorClass)) {
         // Without an extractor the content cannot be indexed. If the discussion
         // still needs indexing we carry on with that alone; otherwise there is
         // nothing left to do and the item is removed from the queue.
         if ($indexDiscussion) {
             $indexDocument = false;
             $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
         } else {
             Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"), $docId));
             return;
         }
     } elseif (!$this->isExtractorEnabled($extractorClass)) {
         // An extractor is available but disabled: leave the document queued.
         $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
         return;
     }

     if ($this->debug) {
         $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"), $docId), 'info');
     }

     if ($this->restartCurrentBatch) {
         // Drop the current queue entry and submit the document again for a full ('A') index.
         Indexer::unqueueDocument($docId);
         Indexer::index($docId, 'A');
         return;
     }

     // Files whose name starts OR ends with '~' are treated as temporary files and skipped.
     $filename = $document->getFileName();
     if (substr($filename, 0, 1) == '~' || substr($filename, -1) == '~') {
         Indexer::unqueueDocument($docId, sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."), $docId), 'error');
         return;
     }

     $removeFromQueue = true;

     if ($indexDocument) {
         if (array_key_exists($extractorClass, $extractorCache)) {
             $extractor = $extractorCache[$extractorClass];
         } else {
             $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
         }

         if (!$extractor instanceof DocumentExtractor) {
             $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."), $extractorClass), 'error');
             return;
         }

         $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
         $sourceFile = $this->storageManager->temporaryFile($document);

         if (empty($sourceFile) || !is_file($sourceFile)) {
             Indexer::unqueueDocument($docId, sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."), $sourceFile, $docId), 'error');
             // This used to be `continue` inside the indexDocuments() loop; in this
             // method there is no loop, so stop processing this document instead.
             return;
         }

         if ($extractor->needsIntermediateSourceFile()) {
             // The extractor works on a copy named "<docId>.<extension>" in the temp path.
             $intermediate = $tempPath . '/' . $docId . '.' . $extension;
             $result = @copy($sourceFile, $intermediate);
             if ($result === false) {
                 $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"), $docId), 'error');
                 // problem. lets try again later. probably permission related. log the issue.
                 // The document is deliberately left in the queue (formerly `continue`).
                 return;
             }
             $sourceFile = $intermediate;
         }

         $extractor->setSourceFile($sourceFile);
         $extractor->setMimeType($mimeType);
         $extractor->setExtension($extension);
         $extractor->setDocument($document);
         // The extractor instance is cached and reused, so reset its per-document status.
         $extractor->setIndexingStatus(null);
         $extractor->setExtractionStatus(null);

         $targetFile = tempnam($tempPath, 'ktindexer');
         $extractor->setTargetFile($targetFile);

         $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"), $docId, $sourceFile, $targetFile), 'debug');

         $this->executeHook($extractor, 'pre_extract');
         $this->executeHook($extractor, 'pre_extract', $mimeType);

         // From here on the document is only unqueued if indexing reports success.
         $removeFromQueue = false;

         if ($extractor->extractTextContent()) {
             // the extractor may need to create another target file
             $targetFile = $extractor->getTargetFile();
             $extractor->setExtractionStatus(true);

             $this->executeHook($extractor, 'pre_index');
             $this->executeHook($extractor, 'pre_index', $mimeType);

             $title = $document->getName();

             if (!$this->filterText($targetFile)) {
                 $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"), $docId), 'error');
             } elseif ($indexDiscussion) {
                 $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
                 $removeFromQueue = $indexStatus;
                 if (!$indexStatus) {
                     $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"), $docId), 'error');
                 }
                 $extractor->setIndexingStatus($indexStatus);
             } else {
                 $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
                 $removeFromQueue = $indexStatus;
                 if (!$indexStatus) {
                     $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"), $docId), 'error');
                     $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
                 }
                 $extractor->setIndexingStatus($indexStatus);
             }

             $this->executeHook($extractor, 'post_index', $mimeType);
             $this->executeHook($extractor, 'post_index');
         } else {
             $extractor->setExtractionStatus(false);
             $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"), $docId), 'error');
             $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
         }

         $this->executeHook($extractor, 'post_extract', $mimeType);
         $this->executeHook($extractor, 'post_extract');

         // Clean up the temporary files created above; failures are ignored on purpose.
         if ($extractor->needsIntermediateSourceFile()) {
             @unlink($sourceFile);
         }
         @unlink($targetFile);
     } else {
         // Content is not being indexed; only the discussion is.
         $indexStatus = $this->indexDiscussion($docId);
         $removeFromQueue = $indexStatus;
     }

     if ($removeFromQueue) {
         Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"), $docId));
     } elseif ($this->debug) {
         $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"), $docId));
     }
 }