PHP TYPO3\CMS\IndexedSearch Indexer::convertHTMLToUtf8 예제들

프로그래밍 언어: PHP
네임스페이스/패키지 이름: TYPO3\CMS\IndexedSearch
클래스/타입: Indexer
메소드/함수: convertHTMLToUtf8
hotexamples.com에서의 예제들: 1
PHP TYPO3\CMS\IndexedSearch Indexer::convertHTMLToUtf8 - 1개의 예제가 발견되었습니다. 다음은 오픈소스 프로젝트에서 추출한, PHP TYPO3\CMS\IndexedSearch\Indexer::convertHTMLToUtf8의 실제 사용 예제 중 가장 높은 평가를 받은 것입니다. 예제를 평가해 주시면 예제의 품질 향상에 도움이 됩니다.
자주 사용되는 메소드들
보기 숨기기
metaphone(4)
extractHyperLinks(3)
convertHTMLToUtf8(1)
extractBaseHref(1)
isTableUsed(1)
log_setTSlogMessage(1)
md5inthash(1)
splitHTMLContent(1)
splitRegularContent(1)
typoSearchTags(1)
예제 #1
파일 보기
파일: FileContentParser.php 프로젝트: rickymathew/TYPO3.CMS
 /**
  * Reads the content of an external file being indexed.
  *
  * Dispatches on the file extension: office/PDF/RTF formats are converted by the
  * external tools configured in $this->app, text-like formats are read directly,
  * and images contribute their EXIF comment/description. The extracted text is
  * handed to the parent indexer ($this->pObj) to be split into the content array.
  *
  * @param string $ext File extension, eg. "pdf", "doc" etc.
  * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
  * @param string $cPKey Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
  * @return array|false|null Standard content array (title, description, keywords, body keys). FALSE if the extension is not supported. NULL if the branch for the extension produced no content (e.g. the required external tool is not configured).
  */
 public function readFileContent($ext, $absFile, $cPKey)
 {
     $contentArr = null;
     // Return immediately if initialization didn't set support up.
     // empty() also covers extensions that are not present in the list at all.
     if (empty($this->supportedExtensions[$ext])) {
         return false;
     }
     // Reads the first value of a node from one level of an xml2tree() structure
     // ($level[$key][0]['values'][0]); yields NULL when the node is absent.
     $metaValue = function (array $level, $key) {
         return isset($level[$key][0]['values'][0]) ? $level[$key][0]['values'][0] : null;
     };
     // Switch by file extension
     switch ($ext) {
         case 'pdf':
             if (!empty($this->app['pdfinfo'])) {
                 $this->setLocaleForServerFileSystem();
                 // Getting pdf-info:
                 $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
                 CommandUtility::exec($cmd, $res);
                 $pdfInfo = $this->splitPdfInfo($res);
                 unset($res);
                 if (isset($pdfInfo['pages']) && (int) $pdfInfo['pages']) {
                     // $cPKey carries the page range as "<first>-<last>". Both parts are cast to
                     // integers so that nothing but digits can reach the shell command below,
                     // and a missing second part no longer yields an undefined variable.
                     $pageRange = explode('-', (string) $cPKey);
                     $low = (int) $pageRange[0];
                     $high = isset($pageRange[1]) ? (int) $pageRange[1] : 0;
                     // Get pdf content:
                     // Create temporary name
                     $tempFileName = GeneralUtility::tempnam('Typo3_indexer');
                     // Delete if exists, just to be safe.
                     @unlink($tempFileName);
                     // The output path is escaped like the input path; it may contain characters
                     // that are special to the shell.
                     $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
                     CommandUtility::exec($cmd);
                     if (@is_file($tempFileName)) {
                         $content = GeneralUtility::getUrl($tempFileName);
                         unlink($tempFileName);
                     } else {
                         // No output file was produced: log the failure and continue without body text.
                         $content = '';
                         $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:pdfToolsFailed'), $absFile), 2);
                     }
                     if ((string) $content !== '') {
                         $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                     }
                 }
                 // A title reported by pdfinfo takes precedence over whatever was extracted.
                 if (!empty($pdfInfo['title'])) {
                     $contentArr['title'] = $pdfInfo['title'];
                 }
                 $this->setLocaleForServerFileSystem(true);
             }
             break;
         case 'doc':
             if (!empty($this->app['catdoc'])) {
                 $this->setLocaleForServerFileSystem();
                 $cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
                 CommandUtility::exec($cmd, $res);
                 $content = implode(LF, $res);
                 unset($res);
                 $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                 $this->setLocaleForServerFileSystem(true);
             }
             break;
         case 'pps':
         case 'ppt':
             if (!empty($this->app['ppthtml'])) {
                 $this->setLocaleForServerFileSystem();
                 $cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
                 CommandUtility::exec($cmd, $res);
                 $content = implode(LF, $res);
                 unset($res);
                 // The tool emits HTML, so it goes through the HTML conversion/splitting path.
                 $content = $this->pObj->convertHTMLToUtf8($content);
                 $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                 // Make sure the title doesn't expose the absolute path!
                 $contentArr['title'] = basename($absFile);
                 $this->setLocaleForServerFileSystem(true);
             }
             break;
         case 'xls':
             if (!empty($this->app['xlhtml'])) {
                 $this->setLocaleForServerFileSystem();
                 $cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
                 CommandUtility::exec($cmd, $res);
                 $content = implode(LF, $res);
                 unset($res);
                 $content = $this->pObj->convertHTMLToUtf8($content);
                 $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                 // Make sure the title doesn't expose the absolute path!
                 $contentArr['title'] = basename($absFile);
                 $this->setLocaleForServerFileSystem(true);
             }
             break;
         case 'docx':
         case 'dotx':
         case 'pptx':
         case 'ppsx':
         case 'potx':
         case 'xlsx':
         case 'xltx':
             if (!empty($this->app['unzip'])) {
                 $this->setLocaleForServerFileSystem();
                 // Pick the archive member that holds the main content for this format.
                 switch ($ext) {
                     case 'docx':
                     case 'dotx':
                         // Read document.xml:
                         $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' word/document.xml';
                         break;
                     case 'ppsx':
                     case 'pptx':
                     case 'potx':
                         // Read slide1.xml:
                         $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ppt/slides/slide1.xml';
                         break;
                     case 'xlsx':
                     case 'xltx':
                         // Read sheet1.xml:
                         $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' xl/worksheets/sheet1.xml';
                         break;
                 }
                 CommandUtility::exec($cmd, $res);
                 $content_xml = implode(LF, $res);
                 unset($res);
                 // Put a space before every tag so that text of adjacent elements is not glued
                 // together once the tags are stripped.
                 $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                 $contentArr = $this->pObj->splitRegularContent($utf8_content);
                 // Make sure the title doesn't expose the absolute path!
                 $contentArr['title'] = basename($absFile);
                 // Meta information
                 $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' docProps/core.xml';
                 CommandUtility::exec($cmd, $res);
                 $meta_xml = implode(LF, $res);
                 unset($res);
                 $metaContent = GeneralUtility::xml2tree($meta_xml);
                 if (is_array($metaContent)) {
                     // Individual properties are optional; absent ones contribute nothing.
                     $coreProps = isset($metaContent['cp:coreProperties'][0]['ch']) && is_array($metaContent['cp:coreProperties'][0]['ch'])
                         ? $metaContent['cp:coreProperties'][0]['ch']
                         : array();
                     $contentArr['title'] .= ' ' . $metaValue($coreProps, 'dc:title');
                     $contentArr['description'] = $metaValue($coreProps, 'dc:subject');
                     $contentArr['description'] .= ' ' . $metaValue($coreProps, 'dc:description');
                     $contentArr['keywords'] = $metaValue($coreProps, 'cp:keywords');
                 }
                 $this->setLocaleForServerFileSystem(true);
             }
             break;
         case 'sxi':
         case 'sxc':
         case 'sxw':
         case 'ods':
         case 'odp':
         case 'odt':
             if (!empty($this->app['unzip'])) {
                 $this->setLocaleForServerFileSystem();
                 // Read content.xml:
                 $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml';
                 CommandUtility::exec($cmd, $res);
                 $content_xml = implode(LF, $res);
                 unset($res);
                 // Read meta.xml:
                 $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml';
                 CommandUtility::exec($cmd, $res);
                 $meta_xml = implode(LF, $res);
                 unset($res);
                 $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                 $contentArr = $this->pObj->splitRegularContent($utf8_content);
                 // Make sure the title doesn't expose the absolute path!
                 $contentArr['title'] = basename($absFile);
                 // Meta information
                 $metaContent = GeneralUtility::xml2tree($meta_xml);
                 // Only descend into the tree when parsing actually produced an array and the
                 // expected path exists; indexing a non-array result with string keys is an error.
                 if (is_array($metaContent) && isset($metaContent['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
                     $metaContent = $metaContent['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
                 } else {
                     $metaContent = null;
                 }
                 if (is_array($metaContent)) {
                     // Keep the file name as title unless the document declares a non-empty one.
                     $metaTitle = $metaValue($metaContent, 'dc:title');
                     $contentArr['title'] = $metaTitle ? $metaTitle : $contentArr['title'];
                     $contentArr['description'] = $metaValue($metaContent, 'dc:subject') . ' ' . $metaValue($metaContent, 'dc:description');
                     // Keywords collected:
                     if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
                         // Ensure there is a string to append to.
                         if (!isset($contentArr['keywords'])) {
                             $contentArr['keywords'] = '';
                         }
                         foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
                             $contentArr['keywords'] .= (isset($kwDat['values'][0]) ? $kwDat['values'][0] : '') . ' ';
                         }
                     }
                 }
                 $this->setLocaleForServerFileSystem(true);
             }
             break;
         case 'rtf':
             if (!empty($this->app['unrtf'])) {
                 $this->setLocaleForServerFileSystem();
                 $cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
                 CommandUtility::exec($cmd, $res);
                 $fileContent = implode(LF, $res);
                 unset($res);
                 $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                 $contentArr = $this->pObj->splitHTMLContent($fileContent);
                 $this->setLocaleForServerFileSystem(true);
             }
             break;
         case 'txt':
         case 'csv':
             $this->setLocaleForServerFileSystem();
             // Raw text
             $content = GeneralUtility::getUrl($absFile);
             // @todo Implement auto detection of charset (currently assuming utf-8)
             $contentCharset = 'utf-8';
             $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
             $contentArr = $this->pObj->splitRegularContent($content);
             // Make sure the title doesn't expose the absolute path!
             $contentArr['title'] = basename($absFile);
             $this->setLocaleForServerFileSystem(true);
             break;
         case 'html':
         case 'htm':
             $fileContent = GeneralUtility::getUrl($absFile);
             $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
             $contentArr = $this->pObj->splitHTMLContent($fileContent);
             break;
         case 'xml':
             $this->setLocaleForServerFileSystem();
             // PHP strip-tags()
             $fileContent = GeneralUtility::getUrl($absFile);
             // Finding charset: look for an encoding="..." attribute of the XML declaration
             // within the first 200 bytes.
             $reg = array();
             preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
             // No declaration (or no match) means there is no capture group: fall back to utf-8.
             $charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
             // Converting content:
             $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
             $contentArr = $this->pObj->splitRegularContent($fileContent);
             // Make sure the title doesn't expose the absolute path!
             $contentArr['title'] = basename($absFile);
             $this->setLocaleForServerFileSystem(true);
             break;
         case 'jpg':
         case 'jpeg':
         case 'tif':
             $this->setLocaleForServerFileSystem();
             // PHP EXIF
             if (function_exists('exif_read_data')) {
                 $exif = @exif_read_data($absFile, 'IFD0');
             } else {
                 $exif = false;
             }
             if ($exif) {
                 // Both fields are optional in the EXIF data; treat a missing one as empty.
                 $exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
                 $exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
                 $comment = trim($exifComment . ' ' . $exifDescription);
             } else {
                 $comment = '';
             }
             $contentArr = $this->pObj->splitRegularContent($comment);
             // Make sure the title doesn't expose the absolute path!
             $contentArr['title'] = basename($absFile);
             $this->setLocaleForServerFileSystem(true);
             break;
         default:
             return false;
     }
     // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
     if (is_array($contentArr) && empty($contentArr['title'])) {
         // Substituting "_" for " " because many filenames may have this instead of a space char.
         $contentArr['title'] = str_replace('_', ' ', basename($absFile));
     }
     return $contentArr;
 }