/**
 * Extracts the text content of the current document via the parent extractor
 * and stores the filtered result in the target file.
 *
 * Flow:
 *  - Documents without an extension, and .xlt files, are removed from the
 *    indexing queue and the method returns false.
 *  - If the parent extraction fails, $this->output is inspected:
 *      - "process not found or not listening": the batch is restarted and
 *        false is returned;
 *      - one of the known "cannot be converted" messages: an empty target
 *        file is written, the batch is restarted, the document is removed
 *        from the queue and true is returned;
 *      - anything else: false is returned.
 *  - If the target extension is not 'html', the target file is emptied and
 *    true is returned.
 *  - Otherwise the target file is read, the target is switched to
 *    "<targetfile>.txt", and the filtered content is written there.
 *
 * @return bool true when a target file (possibly empty) was produced or the
 *              document was deliberately skipped as unindexable; false on
 *              failure.
 */
public function extractTextContent()
{
    global $default;

    $docId = $this->document->getId();

    if (empty($this->extension)) {
        $default->log->info("DocumentId: {$docId} - Document does not have an extension");
        Indexer::unqueueDocument($docId, sprintf("Removing document from queue: documentId %d", $docId));
        return false;
    }

    // Open Office does not support the following files
    if (in_array($this->extension, array('xlt'), true)) {
        $default->log->info("DocumentId: {$docId} - Open Office does not support .xlt.");
        Indexer::unqueueDocument($docId, sprintf("Removing document from queue - Open Office does not support .xlt: documentId %d", $docId));
        return false;
    }

    if (false === parent::extractTextContent()) {
        // The conversion service itself is unavailable: restart the batch and
        // report failure without removing the document from the queue.
        if (strpos($this->output, 'OpenOffice process not found or not listening') !== false) {
            $indexer = Indexer::get();
            $indexer->restartBatch();
            return false;
        }

        // Output fragments that indicate this particular file cannot be converted.
        $unindexableMarkers = array(
            'Unexpected connection closure',
            '\'NoneType\' object has no attribute \'storeToURL\'',
            'The document could not be opened for conversion. This could indicate an unsupported mimetype.',
            'URL seems to be an unsupported one.',
            '__main__.com.sun.star.task.ErrorCodeIOException',
        );

        $unindexable = false;
        foreach ($unindexableMarkers as $marker) {
            if (strpos($this->output, $marker) !== false) {
                $unindexable = true;
                break;
            }
        }

        if ($unindexable) {
            $default->log->info("DocumentId: {$docId} - Suspect the file cannot be indexed by Open Office.");
            // Leave an empty target file so the document counts as processed.
            file_put_contents($this->targetfile, '');
            $indexer = Indexer::get();
            $indexer->restartBatch();
            Indexer::unqueueDocument($docId, sprintf(_kt("Removing document from queue: documentId %d"), $docId));
            return true;
        }

        return false;
    }

    // Only html output is post-processed; any other target is emptied.
    if ($this->targetExtension != 'html') {
        file_put_contents($this->targetfile, '');
        return true;
    }

    $content = file_get_contents($this->targetfile);
    if ($content === false) {
        // A failed read must not be mistaken for an empty document.
        $default->log->info("DocumentId: {$docId} - Could not read the converted file {$this->targetfile}.");
        return false;
    }

    $this->setTargetFile($this->targetfile . '.txt');
    $content = $this->filter($content);

    // Explicit checks instead of empty(): the literal text "0" is real content.
    if ($content === null || $content === false || $content === '') {
        return touch($this->targetfile);
    }

    // file_put_contents() returns a byte count or false; normalise to bool so
    // every path of this method returns the same type.
    return file_put_contents($this->targetfile, $content) !== false;
}
/**
 * Extracts the document text with the configured external command, if any.
 *
 * When $this->cmd is set (e.g. catppt or a similar tool), the parent
 * extraction is attempted and any non-false result is reported as success.
 * If there is no command, or the parent extraction returned false, an empty
 * target file is written, the situation is logged, and true is returned.
 *
 * Note: an OpenOffice fallback that delegated to $this->oo when
 * $this->useOO was set used to sit between these two steps; it is disabled.
 *
 * @return bool always true
 */
public function extractTextContent()
{
    global $default;

    // Try the external command first; a non-false result means we are done.
    if ($this->cmd !== false && parent::extractTextContent() !== false) {
        return true;
    }

    // No command available, or it failed: record an empty extraction.
    $docId = $this->document->getId();
    $cmd = $this->cmd;

    $default->log->info("The document {$docId} cannot be indexed as {$cmd} is not available and OpenOffice is not in use.");
    file_put_contents($this->targetfile, '');

    return true;
}