/**
 * Reads the content of an external file being indexed.
 *
 * Depending on the extension the file is either converted by an external command line tool
 * (pdfinfo/pdftotext, catdoc, ppthtml, xlhtml, unzip, unrtf) or read directly by PHP. The
 * extracted text is then handed to the parent object's split*Content() methods.
 *
 * @param string $ext File extension, eg. "pdf", "doc" etc.
 * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $cPKey Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
 * @return array|false|null Standard content array (title, description, keywords, body keys); FALSE if the extension is not supported; NULL if the required external tool is not configured or no content could be extracted
 */
public function readFileContent($ext, $absFile, $cPKey)
{
    $contentArr = null;
    // Return immediately if initialization didn't set support up:
    if (empty($this->supportedExtensions[$ext])) {
        return false;
    }
    // Reads the first text value of a child tag from an xml2tree() branch.
    // Returns an empty string if the tag is absent, so that optional meta tags do not raise notices.
    $metaValue = function ($nodes, $tagName) {
        return isset($nodes[$tagName][0]['values'][0]) ? $nodes[$tagName][0]['values'][0] : '';
    };
    // Switch by file extension
    switch ($ext) {
        case 'pdf':
            if (!empty($this->app['pdfinfo'])) {
                $this->setLocaleForServerFileSystem();
                // Getting pdf-info:
                $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $pdfInfo = $this->splitPdfInfo($res);
                unset($res);
                if (isset($pdfInfo['pages']) && (int)$pdfInfo['pages']) {
                    // $cPKey is expected to look like "<first>-<last>". Both bounds are cast to
                    // integers so nothing but digits can reach the command line. A missing bound
                    // becomes 0.
                    // NOTE(review): pdftotext presumably treats 0 as "use the default bound" - confirm.
                    $pageRange = array_pad(explode('-', (string)$cPKey, 2), 2, '');
                    $low = (int)$pageRange[0];
                    $high = (int)$pageRange[1];
                    // Get pdf content:
                    // Create temporary name
                    $tempFileName = GeneralUtility::tempnam('Typo3_indexer');
                    // Delete if exists, just to be safe.
                    @unlink($tempFileName);
                    $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q '
                        . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
                    CommandUtility::exec($cmd);
                    if (@is_file($tempFileName)) {
                        $content = GeneralUtility::getUrl($tempFileName);
                        unlink($tempFileName);
                    } else {
                        $content = '';
                        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:pdfToolsFailed'), $absFile), 2);
                    }
                    if ((string)$content !== '') {
                        $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                    }
                }
                if (!empty($pdfInfo['title'])) {
                    $contentArr['title'] = $pdfInfo['title'];
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'doc':
            if (!empty($this->app['catdoc'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'pps':
        case 'ppt':
            if (!empty($this->app['ppthtml'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                $contentArr['title'] = basename($absFile);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'xls':
            if (!empty($this->app['xlhtml'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                $contentArr['title'] = basename($absFile);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'docx':
        case 'dotx':
        case 'pptx':
        case 'ppsx':
        case 'potx':
        case 'xlsx':
        case 'xltx':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                switch ($ext) {
                    case 'docx':
                    case 'dotx':
                        // Read document.xml:
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' word/document.xml';
                        break;
                    case 'ppsx':
                    case 'pptx':
                    case 'potx':
                        // Read slide1.xml:
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ppt/slides/slide1.xml';
                        break;
                    case 'xlsx':
                    case 'xltx':
                        // Read sheet1.xml:
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' xl/worksheets/sheet1.xml';
                        break;
                }
                CommandUtility::exec($cmd, $res);
                $content_xml = implode(LF, $res);
                unset($res);
                // A space is inserted before every tag so that words of adjacent elements
                // do not get glued together once the tags are stripped.
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                // Meta information
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' docProps/core.xml';
                CommandUtility::exec($cmd, $res);
                $meta_xml = implode(LF, $res);
                unset($res);
                $metaContent = GeneralUtility::xml2tree($meta_xml);
                if (is_array($metaContent)) {
                    $coreProperties = isset($metaContent['cp:coreProperties'][0]['ch'])
                        ? $metaContent['cp:coreProperties'][0]['ch']
                        : [];
                    $contentArr['title'] .= ' ' . $metaValue($coreProperties, 'dc:title');
                    $contentArr['description'] = $metaValue($coreProperties, 'dc:subject');
                    $contentArr['description'] .= ' ' . $metaValue($coreProperties, 'dc:description');
                    $contentArr['keywords'] = $metaValue($coreProperties, 'cp:keywords');
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'sxi':
        case 'sxc':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                // Read content.xml:
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml';
                CommandUtility::exec($cmd, $res);
                $content_xml = implode(LF, $res);
                unset($res);
                // Read meta.xml:
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml';
                CommandUtility::exec($cmd, $res);
                $meta_xml = implode(LF, $res);
                unset($res);
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                // Meta information.
                // The parse result is only descended into once the whole path is known to exist,
                // because xml2tree() may hand back a non-array value.
                $metaTree = GeneralUtility::xml2tree($meta_xml);
                $metaContent = null;
                if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
                    $metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
                }
                if (is_array($metaContent)) {
                    $metaTitle = $metaValue($metaContent, 'dc:title');
                    if ($metaTitle) {
                        $contentArr['title'] = $metaTitle;
                    }
                    $contentArr['description'] = $metaValue($metaContent, 'dc:subject') . ' ' . $metaValue($metaContent, 'dc:description');
                    // Keywords collected:
                    if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword'])
                        && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])
                    ) {
                        if (!isset($contentArr['keywords'])) {
                            $contentArr['keywords'] = '';
                        }
                        foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
                            $contentArr['keywords'] .= (isset($kwDat['values'][0]) ? $kwDat['values'][0] : '') . ' ';
                        }
                    }
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'rtf':
            if (!empty($this->app['unrtf'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $fileContent = implode(LF, $res);
                unset($res);
                $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                $contentArr = $this->pObj->splitHTMLContent($fileContent);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'txt':
        case 'csv':
            $this->setLocaleForServerFileSystem();
            // Raw text
            $content = GeneralUtility::getUrl($absFile);
            // @todo Implement auto detection of charset (currently assuming utf-8)
            $contentCharset = 'utf-8';
            $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
            $contentArr = $this->pObj->splitRegularContent($content);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        case 'html':
        case 'htm':
            $fileContent = GeneralUtility::getUrl($absFile);
            $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
            $contentArr = $this->pObj->splitHTMLContent($fileContent);
            break;
        case 'xml':
            $this->setLocaleForServerFileSystem();
            // PHP strip-tags()
            $fileContent = GeneralUtility::getUrl($absFile);
            // Finding charset (only the first 200 bytes are inspected for the XML prolog):
            $reg = [];
            preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
            $charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
            // Converting content:
            $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
            $contentArr = $this->pObj->splitRegularContent($fileContent);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        case 'jpg':
        case 'jpeg':
        case 'tif':
            $this->setLocaleForServerFileSystem();
            // PHP EXIF
            $exif = function_exists('exif_read_data') ? @exif_read_data($absFile, 'IFD0') : false;
            $comment = '';
            if ($exif) {
                // Both fields are optional in the EXIF data, so they are read defensively.
                $exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
                $exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
                $comment = trim($exifComment . ' ' . $exifDescription);
            }
            $contentArr = $this->pObj->splitRegularContent($comment);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        default:
            return false;
    }
    // If no title (and why should there be...) then the file-name is set as title.
    // This will raise the hits considerably if the search matches the document name.
    if (is_array($contentArr) && empty($contentArr['title'])) {
        // Substituting "_" for " " because many filenames may have this instead of a space char.
        $contentArr['title'] = str_replace('_', ' ', basename($absFile));
    }
    return $contentArr;
}