예제 #1
1
 /**
  * Return the text of a stored file, converting known binary formats first.
  *
  * Files whose name ends in .pdf, .rtf, .odt, .doc or docx are copied to a
  * temporary file under $this->tempdir, handed to the matching extractor,
  * and the extracted text is passed through $this->wordwrap() with 80.
  * Any other file is returned unchanged when its content validates as UTF-8,
  * converted from the 'localewincharset' language-config charset when it
  * validates against that charset, and skipped (reported through mtrace())
  * otherwise.
  *
  * @param object $file moodle storedfile
  * @return mixed the result of $this->wordwrap() for converted files, the
  *               (possibly charset-converted) content for other files, or
  *               false when the charset cannot be determined
  */
 protected function get_clear_utf8_content($file)
 {
     $localewincharset = get_string('localewincharset', 'langconfig');
     $filen = $file->get_filename();
     // Last four bytes of the name: '.pdf' keeps its dot, 'docx' has none.
     $file_type = strtolower(substr($filen, -4));
     // in_array() rather than array_search(): the latter returns index 0 for
     // '.pdf', which is falsy and made every PDF skip the conversion branch.
     if (in_array($file_type, array('.pdf', '.rtf', '.odt', '.doc', 'docx'), true)) {
         $temp_file = $this->tempdir . "/{$filen}.tmp";
         $file->copy_content_to($temp_file);
         switch ($file_type) {
             case '.pdf':
                 $content = pdf2text($temp_file);
                 break;
             case '.rtf':
                 $content = textlib_get_instance()->entities_to_utf8(rtf2text($temp_file));
                 break;
             case '.odt':
                 $content = getTextFromZippedXML($temp_file, 'content.xml');
                 break;
             case '.doc':
                 $antiwordpath = $this->get_config('antiwordpath');
                 // Read the first two bytes. Offset must be 0: a negative
                 // offset counts from the end of the file since PHP 7.1.
                 $magic = file_get_contents($temp_file, false, null, 0, 2);
                 if ($magic === 'PK') {
                     // Zip signature: it is really a docx
                     $content = getTextFromZippedXML($temp_file, 'word/document.xml');
                 } else {
                     if (empty($antiwordpath) || !is_executable($antiwordpath)) {
                         $content = textlib_get_instance()->entities_to_utf8(doc2text($temp_file));
                     } else {
                         // is_executable() has confirmed the setting is a plain
                         // file path, so it can be escaped as a single shell word.
                         $content = shell_exec(escapeshellarg($antiwordpath) . ' -f -w 0 ' . escapeshellarg($temp_file));
                         if (empty($content)) {
                             // antiword can not recognize this file
                             $content = textlib_get_instance()->entities_to_utf8(doc2text($temp_file));
                         }
                     }
                 }
                 break;
             case 'docx':
                 $content = getTextFromZippedXML($temp_file, 'word/document.xml');
                 break;
         }
         unlink($temp_file);
         return $this->wordwrap($content, 80);
     }
     // Files that need no format conversion go here
     $content = $file->get_content();
     if (!mb_check_encoding($content, 'UTF-8')) {
         if (mb_check_encoding($content, $localewincharset)) {
             // Convert content charset to UTF-8
             $content = textlib_get_instance()->convert($content, $localewincharset);
         } else {
             // Unknown charset, possible binary file. Skip it
             mtrace("\tSkip unknown charset/binary file " . $file->get_filepath() . $file->get_filename());
             return false;
         }
     }
     return $content;
 }
예제 #2
0
/**
 * Download a remote resource and return its text.
 *
 * The handler is chosen from the extension of the URL path:
 *  - doc / docx are downloaded to a temporary file under $CFG->dataroot/temp
 *    and passed to doc2text() / getTextFromZippedXML();
 *  - txt, java, cpp and c are returned as downloaded;
 *  - pdf and ppt are passed by URL to pdf2text() / ppt2text();
 *  - anything else is treated as HTML: <br> variants become newlines, tags
 *    are removed with strip_html_tags() and the text is converted to UTF-8
 *    with iconv(), using the charset from the Content-Type meta tag or, if
 *    there is none, the result of mb_detect_encoding().
 *
 * Side effects: sets the 'user_agent' ini option and echoes progress
 * messages.
 *
 * @param string $url address of the remote resource
 * @return mixed the extracted text or a short explanatory message; false can
 *               be returned when a plain download or the final iconv() fails
 */
function getremotecontent($url)
{
    global $CFG;
    $plagiarismsettings = (array) get_config('plagiarism');
    // A missing setting behaves like a limit of 0 (every .doc is "too large"),
    // as before, but no longer raises an undefined-index notice.
    $file_size = isset($plagiarismsettings['crot_max_file_size']) ? $plagiarismsettings['crot_max_file_size'] : 0;
    // analyze the extension (type) of the resource
    // TODO it would be better to define type by the content marker in the stream
    // parse_url() gives null for a URL without a path and false for a
    // malformed URL; both are treated as "no extension".
    $path = parse_url($url, PHP_URL_PATH);
    $extension = is_string($path) ? strtolower(pathinfo($path, PATHINFO_EXTENSION)) : '';
    $tmpfilename = $CFG->dataroot . '/temp' . "/remove.me";
    // set user agent to trick some web sites
    ini_set('user_agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.5.2');
    switch ($extension) {
        case "doc":
            $infile = @file_get_contents($url);
            if ($infile === false || $infile === '') {
                return "can't read TEXT from the remote MS-Word file located at " . $url;
            }
            // Compare the downloaded size against the limit before touching
            // the disk, so an oversized file is never written.
            //TODO adjust max size in settings
            if (strlen($infile) < $file_size) {
                file_put_contents($tmpfilename, $infile);
                // ENT_NOQUOTES is the value (0) the former null flag coerced to.
                $result = html_entity_decode(doc2text($tmpfilename), ENT_NOQUOTES, 'UTF-8');
                unlink($tmpfilename);
            } else {
                echo "\nFile {$url} was not dowloaded because of its large size\n";
                $result = "the file is  too large";
            }
            return $result;
        case "docx":
            $infile = @file_get_contents($url);
            if ($infile === false || $infile === '') {
                // Same message as the "doc" branch instead of unzipping an empty file.
                return "can't read TEXT from the remote MS-Word file located at " . $url;
            }
            file_put_contents($tmpfilename, $infile);
            $result = getTextFromZippedXML($tmpfilename, "word/document.xml");
            unlink($tmpfilename);
            return $result;
        case "txt":
        case "java":
        case "cpp":
        case "c":
            // plain-text sources are returned exactly as downloaded
            return file_get_contents($url);
        case "pdf":
            return pdf2text($url);
        case "ppt":
            return ppt2text($url);
        default:
            // assuming it is html file
            $idt = 0;
            $text2 = file_get_contents($url);
            // retry an empty download up to three more times
            while (empty($text2) && $idt < 3) {
                $idt++;
                echo "\nTrying to download {$url}. Attempt {$idt}\n";
                $text2 = file_get_contents($url);
            }
            // group 3 captures the charset declared in the Content-Type meta tag
            preg_match('@<meta\\s+http-equiv="Content-Type"\\s+content="([\\w/]+)(;\\s+charset=([^\\s"]+))?@i', $text2, $matches);
            if (isset($matches[3])) {
                $charset = $matches[3];
            } else {
                $charset = mb_detect_encoding($text2);
                $text2 = "Unknown Encoding! You might need to check the direct link" . $text2;
            }
            $text2 = str_replace("<br>", "\n", $text2);
            $text2 = str_replace("<br >", "\n", $text2);
            $text2 = str_replace("<br/>", "\n", $text2);
            $text2 = strip_html_tags($text2);
            $text2 = @iconv($charset, "utf-8", $text2);
            return $text2;
    }
}
예제 #3
0
/**
 * Thin wrapper around getTextFromZippedXML() for .docx files.
 *
 * Passes the given file name together with the archive entry
 * "word/document.xml" and returns the helper's result unchanged.
 *
 * @param string $filename file name handed to getTextFromZippedXML()
 * @return mixed whatever getTextFromZippedXML() returns
 */
function docx2text($filename)
{
    $documententry = "word/document.xml";
    $extracted = getTextFromZippedXML($filename, $documententry);

    return $extracted;
}