/**
  * Extract the text of a stored file and make sure it is UTF-8 encoded.
  *
  * Files whose name ends in .pdf, .rtf, .odt, .doc or docx are copied to a
  * temporary file under $this->tempdir, run through the matching converter,
  * and the result is passed through $this->wordwrap() at 80 columns.
  * Any other file is returned unchanged when it is already valid UTF-8,
  * converted when it validates against the 'localewincharset' language
  * string, and skipped otherwise.
  *
  * @param object $file moodle storedfile
  * @return string|false the text content, or false when the file is in an
  *                      unknown charset (possibly a binary file)
  */
 protected function get_clear_utf8_content($file)
 {
     $filen = $file->get_filename();
     // Last four characters, lower-cased: '.pdf', '.doc', 'docx', ...
     $file_type = strtolower(substr($filen, -4));

     // Strict in_array() rather than array_search(): array_search() returns the
     // matching index, which is 0 (falsy) for '.pdf', so PDFs were never converted.
     if (in_array($file_type, array('.pdf', '.rtf', '.odt', '.doc', 'docx'), true)) {
         $temp_file = $this->tempdir . "/{$filen}.tmp";
         $file->copy_content_to($temp_file);
         switch ($file_type) {
             case '.pdf':
                 $content = pdf2text($temp_file);
                 break;
             case '.rtf':
                 $content = textlib_get_instance()->entities_to_utf8(rtf2text($temp_file));
                 break;
             case '.odt':
                 $content = getTextFromZippedXML($temp_file, 'content.xml');
                 break;
             case '.doc':
                 // Read the first two bytes. The offset must be 0: since PHP 7.1 a
                 // negative offset counts from the END of the file, so -1 would
                 // yield at most the last byte and never match 'PK'.
                 $magic = file_get_contents($temp_file, false, null, 0, 2);
                 if ($magic === 'PK') {
                     // Zip signature: it is really a docx saved with a .doc name
                     $content = getTextFromZippedXML($temp_file, 'word/document.xml');
                     break;
                 }
                 $antiwordpath = $this->get_config('antiwordpath');
                 $content = null;
                 if (!empty($antiwordpath) && is_executable($antiwordpath)) {
                     $content = shell_exec($antiwordpath . ' -f -w 0 ' . escapeshellarg($temp_file));
                 }
                 if (empty($content)) {
                     // antiword is not configured, or it could not recognize this file
                     $content = textlib_get_instance()->entities_to_utf8(doc2text($temp_file));
                 }
                 break;
             case 'docx':
                 $content = getTextFromZippedXML($temp_file, 'word/document.xml');
                 break;
         }
         unlink($temp_file);
         return $this->wordwrap($content, 80);
     }

     // Files that need no format conversion go here
     $content = $file->get_content();
     if (mb_check_encoding($content, 'UTF-8')) {
         return $content;
     }

     // Only look the fallback charset up when it is actually needed
     $localewincharset = get_string('localewincharset', 'langconfig');
     if (mb_check_encoding($content, $localewincharset)) {
         // Convert content charset to UTF-8
         return textlib_get_instance()->convert($content, $localewincharset);
     }

     // Unknown charset, possibly a binary file. Skip it
     mtrace("\tSkip unknown charset/binary file " . $file->get_filepath() . $file->get_filename());
     return false;
 }
/**
 * Extract the text of the file at $path, choosing the converter by $extension.
 *
 * @param string $path      path of the file to read
 * @param string $extension file extension without the leading dot; matched
 *                          case-insensitively
 * @return string|false|null the converter output or, for txt/cpp/java, the
 *                           result of file_get_contents(); the literal string
 *                           "unknown file type" for any other extension;
 *                           null when $path is not readable
 */
function tokenizer($path, $extension)
{
    if (!is_readable($path)) {
        // Spelled out: this is the value the function has always produced here.
        return null;
    }

    switch (strtolower($extension)) {
        case "pdf":
            return pdf2text($path);
        case "doc":
            // ENT_NOQUOTES is the integer 0 that the previously passed null was
            // coerced to, so decoding is unchanged while avoiding the PHP 8.1+
            // deprecation for null given to a non-nullable int parameter.
            return html_entity_decode(doc2text($path), ENT_NOQUOTES, 'UTF-8');
        case "docx":
            return getTextFromZippedXML($path, "word/document.xml");
        case "odt":
            return getTextFromZippedXML($path, "content.xml");
        case "rtf":
            return rtf2text($path);
        case "txt":
        case "cpp":
        case "java":
            // Plain-text formats are returned as they are stored
            return file_get_contents($path);
        default:
            return "unknown file type";
    }
}
// Example #3
// 0
/**
 * Build a Lucene document describing the RTF file at $path.
 *
 * The document carries a Text field 'filename' holding basename($path) and an
 * UnStored field 'contents' holding whatever rtf2text($path) returns.
 *
 * @param $path path to the RTF file
 * @return Zend_Search_Lucene_Document
 */
function createRTFDocument($path)
{
    $document = new Zend_Search_Lucene_Document();

    $nameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $document->addField($nameField);

    $bodyField = Zend_Search_Lucene_Field::UnStored('contents', rtf2text($path));
    $document->addField($bodyField);

    return $document;
}