/**
 * Process a single queued document: extract its text and index it.
 * Refactored from indexDocuments().
 *
 * The document is removed from the indexing queue when it was indexed
 * successfully or when it can never be indexed (zero byte file, no
 * extractor and no discussion requested, temporary "~" file, missing
 * source file). It is left in the queue when the extractor is disabled,
 * is not a DocumentExtractor, or when extraction/filtering/indexing failed.
 *
 * @param object $document Document being indexed. This method calls
 *                         getFileSize(), getFileName(), getName(),
 *                         getMajorVersionNumber() and getMinorVersionNumber() on it.
 * @param array  $docinfo  Queue information for the document. Keys read here:
 *                         'document_id', 'filetypes', 'mimetypes', 'extractor'
 *                         and 'what' ('A' or 'C' => index the content,
 *                         'A' or 'D' => index the discussion).
 * @return void
 */
public function processDocument($document, $docinfo) {
    global $default;
    // Extractor instances are reused across calls, keyed by class name.
    static $extractorCache = array();

    // increment indexed documents count
    Indexer::incrementCount();

    // if document is a zero byte file, let's just unqueue and return
    if ($document->getFileSize() == 0) {
        Indexer::unqueueDocument($docinfo['document_id'], sprintf(_kt("Zero Byte documents do not need to be indexed: %d"), $docinfo['document_id']));
        return;
    }

    $docId = $docinfo['document_id'];
    $extension = $docinfo['filetypes'];
    $mimeType = $docinfo['mimetypes'];
    $extractorClass = $docinfo['extractor'];
    $indexDocument = in_array($docinfo['what'], array('A', 'C'));
    $indexDiscussion = in_array($docinfo['what'], array('A', 'D'));
    $this->indexingHistory = '';
    $tempPath = $this->tempPath;

    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension, $mimeType, $extractorClass), 'debug');

    if (empty($extractorClass)) {
        /*
            if no extractor is found and we don't need to index discussions,
            then we can remove the item from the queue.
        */
        if ($indexDiscussion) {
            $indexDocument = false;
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
        } else {
            Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"), $docId));
            return;
        }
    } else {
        /*
            If an extractor is available, we must ensure it is enabled.
        */
        if (!$this->isExtractorEnabled($extractorClass)) {
            // The document stays queued: nothing is unqueued on this path.
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
            return;
        }
    }

    if ($this->debug) {
        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"), $docId), 'info');
    }

    if ($this->restartCurrentBatch) {
        // Re-queue the document for a full ('A') index instead of processing it now.
        Indexer::unqueueDocument($docId);
        Indexer::index($docId, 'A');
        return;
    }

    // Filenames that start or end with '~' are treated as temporary files and skipped.
    $filename = $document->getFileName();
    if (substr($filename, 0, 1) == '~' || substr($filename, -1) == '~') {
        Indexer::unqueueDocument($docId, sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."), $docId), 'error');
        return;
    }

    $removeFromQueue = true;
    if ($indexDocument) {
        if (array_key_exists($extractorClass, $extractorCache)) {
            $extractor = $extractorCache[$extractorClass];
        } else {
            $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
        }

        if (!$extractor instanceof DocumentExtractor) {
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."), $extractorClass), 'error');
            return;
        }

        $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
        $sourceFile = $this->storageManager->temporaryFile($document);

        if (empty($sourceFile) || !is_file($sourceFile)) {
            Indexer::unqueueDocument($docId, sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."), $sourceFile, $docId), 'error');
            // This was a `continue`, presumably from when the code ran inside the
            // indexDocuments() loop. There is no enclosing loop here, so return.
            return;
        }

        if ($extractor->needsIntermediateSourceFile()) {
            $intermediate = $tempPath . '/' . $docId . '.' . $extension;
            $result = @copy($sourceFile, $intermediate);
            if ($result === false) {
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"), $docId), 'error');
                // problem. lets try again later. probably permission related. log the issue.
                // The document is intentionally left in the queue. This was a `continue`
                // (invalid outside a loop); return instead.
                return;
            }
            $sourceFile = $intermediate;
        }

        $extractor->setSourceFile($sourceFile);
        $extractor->setMimeType($mimeType);
        $extractor->setExtension($extension);
        $extractor->setDocument($document);
        $extractor->setIndexingStatus(null);
        $extractor->setExtractionStatus(null);

        $targetFile = tempnam($tempPath, 'ktindexer');
        $extractor->setTargetFile($targetFile);

        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"), $docId, $sourceFile, $targetFile), 'debug');

        $this->executeHook($extractor, 'pre_extract');
        $this->executeHook($extractor, 'pre_extract', $mimeType);

        // From here on the document only leaves the queue if indexing succeeds.
        $removeFromQueue = false;

        if ($extractor->extractTextContent()) {
            // the extractor may need to create another target file
            $targetFile = $extractor->getTargetFile();

            $extractor->setExtractionStatus(true);
            $this->executeHook($extractor, 'pre_index');
            $this->executeHook($extractor, 'pre_index', $mimeType);

            $title = $document->getName();

            if (!$this->filterText($targetFile)) {
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"), $docId), 'error');
            } else {
                if ($indexDiscussion) {
                    $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
                    $removeFromQueue = $indexStatus;
                    if (!$indexStatus) {
                        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"), $docId), 'error');
                    }
                } else {
                    $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
                    $removeFromQueue = $indexStatus;
                    if (!$indexStatus) {
                        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"), $docId), 'error');
                        $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
                    }
                }
                $extractor->setIndexingStatus($indexStatus);
            }

            $this->executeHook($extractor, 'post_index', $mimeType);
            $this->executeHook($extractor, 'post_index');
        } else {
            $extractor->setExtractionStatus(false);
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"), $docId), 'error');
            $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
        }

        $this->executeHook($extractor, 'post_extract', $mimeType);
        $this->executeHook($extractor, 'post_extract');

        // Clean up the intermediate copy (only when one was made) and the target file.
        if ($extractor->needsIntermediateSourceFile()) {
            @unlink($sourceFile);
        }

        @unlink($targetFile);
    } else {
        // No content indexing requested/possible: index the discussion only.
        $indexStatus = $this->indexDiscussion($docId);
        $removeFromQueue = $indexStatus;
    }

    if ($removeFromQueue) {
        Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"), $docId));
    } else {
        if ($this->debug) {
            $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"), $docId));
        }
    }
}