Ejemplo n.º 1
1
 /**
  * Return the textual content of a stored file as UTF-8.
  *
  * Files ending in .pdf, .rtf, .odt, .doc or docx are copied to a temporary
  * file under $this->tempdir, converted to text by the matching helper and
  * passed through $this->wordwrap() at 80 columns. Every other file is
  * returned as-is when it is already valid UTF-8, converted from the locale's
  * Windows charset when it validates as that charset, and skipped otherwise.
  *
  * @param object $file moodle storedfile
  * @return string|false the extracted text, or false when the charset is
  *                      unknown (possibly a binary file)
  */
 protected function get_clear_utf8_content($file)
 {
     $filen = $file->get_filename();
     // Four trailing characters cover the dotted three-letter extensions as well as "docx".
     $file_type = strtolower(substr($filen, -4));
     // in_array() rather than array_search(): the latter returns index 0 for
     // '.pdf', which is falsy and made PDF files skip the conversion branch.
     if (in_array($file_type, array('.pdf', '.rtf', '.odt', '.doc', 'docx'), true)) {
         $temp_file = $this->tempdir . "/{$filen}.tmp";
         $file->copy_content_to($temp_file);
         switch ($file_type) {
             case '.pdf':
                 $content = pdf2text($temp_file);
                 break;
             case '.rtf':
                 $content = textlib_get_instance()->entities_to_utf8(rtf2text($temp_file));
                 break;
             case '.odt':
                 $content = getTextFromZippedXML($temp_file, 'content.xml');
                 break;
             case '.doc':
                 // Read the first two bytes. Offset 0 is required: since PHP 7.1 a
                 // negative offset counts from the end of the file, so the former
                 // -1 would read the last byte instead of the signature.
                 $magic = file_get_contents($temp_file, false, null, 0, 2);
                 if ($magic === 'PK') {
                     // It is really a docx
                     $content = getTextFromZippedXML($temp_file, 'word/document.xml');
                     break;
                 }
                 $antiwordpath = $this->get_config('antiwordpath');
                 $content = '';
                 if (!empty($antiwordpath) && is_executable($antiwordpath)) {
                     $content = shell_exec($antiwordpath . ' -f -w 0 ' . escapeshellarg($temp_file));
                 }
                 if (empty($content)) {
                     // antiword is not configured or can not recognize this file
                     $content = textlib_get_instance()->entities_to_utf8(doc2text($temp_file));
                 }
                 break;
             case 'docx':
                 $content = getTextFromZippedXML($temp_file, 'word/document.xml');
                 break;
         }
         unlink($temp_file);
         return $this->wordwrap($content, 80);
     }

     // Files that need no format conversion go here
     $content = $file->get_content();
     if (mb_check_encoding($content, 'UTF-8')) {
         return $content;
     }

     $localewincharset = get_string('localewincharset', 'langconfig');
     if (mb_check_encoding($content, $localewincharset)) {
         // Convert content charset to UTF-8
         return textlib_get_instance()->convert($content, $localewincharset);
     }

     // Unknown charset, possible binary file. Skip it
     mtrace("\tSkip unknown charset/binary file " . $file->get_filepath() . $file->get_filename());
     return false;
 }
Ejemplo n.º 2
0
</form>
<?php 
/*
 * PHP: read the text out of an MS Word .doc file
 * Code By Jenen, http://blog.yuing.cn/
 *
 * Handles the "docfile" upload: rejects anything whose name does not end in
 * .doc, then prints the text extracted by doc2text().
 */
if (isset($_FILES['docfile'])) {
    $docfile = $_FILES['docfile']['tmp_name'];
    $docname = $_FILES['docfile']['name'];
    // pathinfo() instead of end(explode()): end() takes its argument by
    // reference, so feeding it a function result raises a notice.
    if (strtolower(pathinfo($docname, PATHINFO_EXTENSION)) != 'doc') {
        exit('请上传.doc格式的文件!');
    }
    if (file_exists($docfile)) {
        // The file name is supplied by the client; escape it before it reaches the page.
        $safename = htmlspecialchars($docname, ENT_QUOTES, 'UTF-8');
        echo "<h4>{$safename} 提取结果:</h4>";
        echo "<p>";
        // The document text is untrusted as well. double_encode is off so any
        // entities already present in the extracted text are not encoded twice.
        // NOTE(review): assumes doc2text() returns UTF-8 — confirm.
        echo nl2br(htmlspecialchars(doc2text($docfile), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false));
        echo "</p>";
    }
}
function doc2text($doc_file, $data_file = 'data.json')
{
    if (!file_exists($data_file)) {
        die('function.doc2text: data file not found!');
    }
    $common_chars = json_decode(file_get_contents($data_file), true);
    $fp = fopen($doc_file, 'r');
    $raw_sectors = array();
    $text = '';
    $en_text = '';
    $en_sectors = 0;
    $utf8_sectors = 0;
Ejemplo n.º 3
0
/**
 * Fetch a remote resource and return its text.
 *
 * The handler is chosen from the file extension in the URL path:
 *  - doc:  downloaded to a temporary file and run through doc2text(),
 *          unless it reaches the configured crot_max_file_size;
 *  - docx: downloaded to a temporary file and read with getTextFromZippedXML();
 *  - txt, java, cpp, c: returned as downloaded;
 *  - pdf, ppt: the URL is handed to pdf2text() / ppt2text();
 *  - anything else is treated as HTML: tags are stripped and the text is
 *    converted to UTF-8 from the charset declared in its Content-Type meta tag
 *    (or the one guessed by mb_detect_encoding()).
 *
 * Side effects: sets the "user_agent" ini value and echoes progress messages.
 *
 * @param string $url address of the remote resource
 * @return string|false the extracted text or a human-readable failure message;
 *                      false when a download or the charset conversion fails
 */
function getremotecontent($url)
{
    global $CFG;
    $plagiarismsettings = (array) get_config('plagiarism');
    $file_size = $plagiarismsettings['crot_max_file_size'];

    // analyze the extension (type) of the resource
    // TODO it would be better to define type by the content marker in the stream
    $splittedurl = parse_url($url);
    // parse_url() returns false for malformed URLs and omits "path" for bare hosts.
    $path = (is_array($splittedurl) && isset($splittedurl['path'])) ? $splittedurl['path'] : '';
    $path_parts = pathinfo($path);
    $extension = isset($path_parts['extension']) ? strtolower($path_parts['extension']) : '';

    $tmpdir = $CFG->dataroot . '/temp';

    // set user agent to trick some web sites
    ini_set('user_agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.5.2');

    switch ($extension) {
        case 'doc':
        case 'docx':
            $infile = @file_get_contents($url);
            if ($infile === false || $infile === '') {
                // docx previously went on to parse an empty file here; it now
                // reports the failure the same way the doc branch does.
                return "can't read TEXT from the remote MS-Word file located at " . $url;
            }
            // Only .doc files are subject to the size limit. The check is done on
            // the downloaded bytes before they are written to disk.
            //TODO adjust max size in settings
            if ($extension === 'doc' && strlen($infile) >= $file_size) {
                echo "\nFile {$url} was not dowloaded because of its large size\n";
                return "the file is  too large";
            }
            // A unique name keeps concurrent runs from overwriting each other's
            // download; fall back to the legacy fixed name if none can be created.
            $tmpfilename = tempnam($tmpdir, 'crot');
            if ($tmpfilename === false) {
                $tmpfilename = $tmpdir . '/remove.me';
            }
            if (file_put_contents($tmpfilename, $infile) === false) {
                if (file_exists($tmpfilename)) {
                    unlink($tmpfilename);
                }
                return "can't read TEXT from the remote MS-Word file located at " . $url;
            }
            if ($extension === 'doc') {
                // ENT_NOQUOTES is what the former null flag evaluated to (0);
                // spelling it out avoids the PHP 8.1 null-argument deprecation.
                $result = html_entity_decode(doc2text($tmpfilename), ENT_NOQUOTES, 'UTF-8');
            } else {
                $result = getTextFromZippedXML($tmpfilename, "word/document.xml");
            }
            unlink($tmpfilename);
            return $result;
        case 'txt':
        case 'java':
        case 'cpp':
        case 'c':
            // plain-text formats are returned untouched
            return file_get_contents($url);
        case 'pdf':
            return pdf2text($url);
        case 'ppt':
            return ppt2text($url);
        default:
            // assuming it is html file
            $text2 = file_get_contents($url);
            // up to three more attempts while the download comes back empty
            for ($attempt = 1; empty($text2) && $attempt <= 3; $attempt++) {
                echo "\nTrying to download {$url}. Attempt {$attempt}\n";
                $text2 = file_get_contents($url);
            }
            preg_match('@<meta\\s+http-equiv="Content-Type"\\s+content="([\\w/]+)(;\\s+charset=([^\\s"]+))?@i', $text2, $matches);
            if (isset($matches[3])) {
                $charset = $matches[3];
            } else {
                // No declared charset: guess one and flag the result for the reader.
                $charset = mb_detect_encoding($text2);
                $text2 = "Unknown Encoding! You might need to check the direct link" . $text2;
            }
            $text2 = str_replace(array("<br>", "<br >", "<br/>"), "\n", $text2);
            $text2 = strip_html_tags($text2);
            return @iconv($charset, "utf-8", $text2);
    }
}
Ejemplo n.º 4
0
<script src="../js/jquery-1.8.0.min.js" language="javascript"></script>
<script src="../js/public.js" language="javascript"></script>
<script src="../js/layer/layer.min.js" language="javascript"></script>
<script src="../template/personalityblue/js/resume.js" language="javascript"></script>
<script type="text/javascript" src="../toptophr.com/js/jscolor/jscolor.js"></script>
</head>


<body>
<?php 
/*
 * PHP: read the text out of an MS Word .doc file
 * Code By Jenen, http://blog.yuing.cn/
 */
// Make sure $doccontent exists even when there is no readable file, because
// the template echoes it further down.
if (!isset($doccontent)) {
    $doccontent = '';
}
if (isset($doc['picurl']) && file_exists($doc['picurl'])) {
    // Extracted text with line breaks turned into <br> tags.
    $doccontent = nl2br(doc2text($doc['picurl']));
}
?>

<?php 
if ($_GET['ac'] == 'fx') {
    ?>


<form action="savedoc2text.php" method="post" >
<input name="id" value="<?php 
    echo $_GET['expect_id'];
    ?>
" type="hidden" >
<textarea name="body" style="display:none"><?php 
    echo $doccontent;