/**
 * Convert binary files to text and ensure the charset is UTF8
 *
 * Files whose name ends in .pdf, .rtf, .odt, .doc or docx are copied to a
 * temporary file under $this->tempdir, converted to text with the matching
 * converter, and the result is passed through $this->wordwrap() with a
 * width of 80. Every other file is returned as-is when it is valid UTF-8,
 * converted from the locale's Windows charset when it validates against
 * that charset, and skipped otherwise.
 *
 * @param object $file moodle storedfile
 * @return string|false the text content, or false when the file is in an
 *                      unknown charset (possibly a binary file)
 */
protected function get_clear_utf8_content($file) {
    $filen = $file->get_filename();
    // Last four characters of the name: '.pdf', '.doc', 'docx', ...
    $file_type = strtolower(substr($filen, -4));

    // in_array() rather than array_search(): array_search() returns the
    // matching index, which is 0 (falsy) for '.pdf', so PDF files were
    // never sent to the converter.
    if (in_array($file_type, array('.pdf', '.rtf', '.odt', '.doc', 'docx'), true)) {
        $temp_file = $this->tempdir . "/{$filen}.tmp";
        $file->copy_content_to($temp_file);

        switch ($file_type) {
            case '.pdf':
                $content = pdf2text($temp_file);
                break;

            case '.rtf':
                $content = textlib_get_instance()->entities_to_utf8(rtf2text($temp_file));
                break;

            case '.odt':
                $content = getTextFromZippedXML($temp_file, 'content.xml');
                break;

            case '.doc':
                // Read the first two bytes. The offset must be 0: since
                // PHP 7.1 a negative offset seeks from the end of the file,
                // so the former -1 returned the last byte and 'PK' was
                // never detected.
                $magic = file_get_contents($temp_file, false, null, 0, 2);
                if ($magic === 'PK') {
                    // It is really a docx
                    $content = getTextFromZippedXML($temp_file, 'word/document.xml');
                    break;
                }

                $content = '';
                $antiwordpath = $this->get_config('antiwordpath');
                if (!empty($antiwordpath) && is_executable($antiwordpath)) {
                    // NOTE(review): $antiwordpath comes from the plugin
                    // configuration and is used unescaped as the command.
                    $content = shell_exec($antiwordpath . ' -f -w 0 ' . escapeshellarg($temp_file));
                }
                if (empty($content)) {
                    // antiword is not configured, or can not recognize this file
                    $content = textlib_get_instance()->entities_to_utf8(doc2text($temp_file));
                }
                break;

            case 'docx':
                $content = getTextFromZippedXML($temp_file, 'word/document.xml');
                break;
        }

        unlink($temp_file);
        return $this->wordwrap($content, 80);
    }

    // Files no need to convert format go here
    $content = $file->get_content();
    if (mb_check_encoding($content, 'UTF-8')) {
        return $content;
    }

    $localewincharset = get_string('localewincharset', 'langconfig');
    if (mb_check_encoding($content, $localewincharset)) {
        // Convert content charset to UTF-8
        return textlib_get_instance()->convert($content, $localewincharset);
    }

    // Unknown charset, possible binary file. Skip it
    mtrace("\tSkip unknown charset/binary file " . $file->get_filepath() . $file->get_filename());
    return false;
}
</form>
<?php
/*
 * PHP: read the text contained in an MS Word .doc file
 * Code By Jenen, http://blog.yuing.cn/
 */
if (isset($_FILES['docfile'])) {
    $docfile = $_FILES['docfile']['tmp_name'];
    $docname = $_FILES['docfile']['name'];
    // pathinfo() instead of end(explode()): end() takes its argument by
    // reference, so passing explode()'s result directly raises "Only
    // variables should be passed by reference". A name without any dot
    // now yields an empty extension and is rejected.
    if (strtolower(pathinfo($docname, PATHINFO_EXTENSION)) != 'doc') {
        exit('请上传.doc格式的文件!');
    }
    if (file_exists($docfile)) {
        // The file name is supplied by the client; escape it before
        // writing it into the page.
        $safe_docname = htmlspecialchars($docname, ENT_QUOTES, 'UTF-8');
        echo "<h4>{$safe_docname} 提取结果:</h4>";
        echo "<p>";
        echo nl2br(doc2text($docfile));
        echo "</p>";
    }
}
function doc2text($doc_file, $data_file = 'data.json') {
    if (!file_exists($data_file)) {
        die('function.doc2text: data file not found!');
    }
    $common_chars = json_decode(file_get_contents($data_file), true);
    $fp = fopen($doc_file, 'r');
    $raw_sectors = array();
    $text = '';
    $en_text = '';
    $en_sectors = 0;
    $utf8_sectors = 0;
/**
 * Download a remote resource and return its text content.
 *
 * The converter is selected from the file extension found in the path of
 * the URL: doc and docx are downloaded to a temporary file and converted,
 * txt/java/cpp/c are returned as downloaded, pdf and ppt are handed to
 * their converters, and anything else is treated as HTML (tags stripped,
 * charset converted to UTF-8).
 *
 * @param string $url address of the remote resource
 * @return string|false the extracted text or a short error message;
 *                      whatever the underlying download or conversion
 *                      call returns is passed through, which may be false
 */
function getremotecontent($url) {
    global $CFG;

    $plagiarismsettings = (array) get_config('plagiarism');
    $file_size = $plagiarismsettings['crot_max_file_size'];

    // analyze the extension (type) of the resource
    // TODO it would be better to define type by the content marker in the stream
    $splittedurl = parse_url($url);
    // parse_url() returns false for a malformed URL and omits 'path' for
    // URLs such as "http://example.com"; both fall through to the HTML case.
    $path = (is_array($splittedurl) && isset($splittedurl['path'])) ? $splittedurl['path'] : '';
    $path_parts = pathinfo($path);
    $extension = isset($path_parts['extension']) ? strtolower($path_parts['extension']) : '';

    $tmpdir = $CFG->dataroot . '/temp';
    $tmpfilename = $tmpdir . "/remove.me";

    // set user agent to trick some web sites
    ini_set('user_agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.5.2');

    switch ($extension) {
        case "doc":
            // download and save
            $infile = @file_get_contents($url);
            if ($infile === false || strlen($infile) == 0) {
                return "can't read TEXT from the remote MS-Word file located at " . $url;
            }
            // check the size before writing anything to disk
            // TODO adjust max size in settings
            if (strlen($infile) >= $file_size) {
                echo "\nFile {$url} was not dowloaded because of its large size\n";
                return "the file is too large";
            }
            file_put_contents($tmpfilename, $infile);
            // ENT_NOQUOTES is what the former null flag evaluated to (0);
            // passing null to an int parameter is deprecated since PHP 8.1.
            $result = html_entity_decode(doc2text($tmpfilename), ENT_NOQUOTES, 'UTF-8');
            unlink($tmpfilename);
            return $result;

        case "docx":
            // download and save
            $infile = @file_get_contents($url);
            if ($infile === false || strlen($infile) == 0) {
                // Do not hand an empty file to the unzip step.
                return "can't read TEXT from the remote MS-Word file located at " . $url;
            }
            file_put_contents($tmpfilename, $infile);
            $result = getTextFromZippedXML($tmpfilename, "word/document.xml");
            unlink($tmpfilename);
            return $result;

        case "txt":
        case "java":
        case "cpp":
        case "c":
            return file_get_contents($url);

        case "pdf":
            return pdf2text($url);

        case "ppt":
            return ppt2text($url);

        default:
            // assuming it is html file
            $idt = 0;
            $text2 = file_get_contents($url);
            while (empty($text2) && $idt < 3) {
                $idt++;
                echo "\nTrying to download {$url}. \nAttempt {$idt}\n";
                $text2 = file_get_contents($url);
            }
            if ($text2 === false) {
                // every attempt failed; continue with an empty document
                $text2 = '';
            }

            preg_match('@<meta\\s+http-equiv="Content-Type"\\s+content="([\\w/]+)(;\\s+charset=([^\\s"]+))?@i', $text2, $matches);
            if (isset($matches[3])) {
                $charset = $matches[3];
            } else {
                $charset = mb_detect_encoding($text2);
                if ($charset === false) {
                    // detection failed; do not pass false on to iconv()
                    $charset = 'UTF-8';
                }
                $text2 = "Unknown Encoding! You might need to check the direct link" . $text2;
            }

            $text2 = str_replace(array("<br>", "<br >", "<br/>"), "\n", $text2);
            $text2 = strip_html_tags($text2);
            // iconv() returns false when the conversion fails; the warning
            // is suppressed and that false is returned to the caller.
            return @iconv($charset, "utf-8", $text2);
    }
}
<script src="../js/jquery-1.8.0.min.js" language="javascript"></script>
<script src="../js/public.js" language="javascript"></script>
<script src="../js/layer/layer.min.js" language="javascript"></script>
<script src="../template/personalityblue/js/resume.js" language="javascript"></script>
<script type="text/javascript" src="../toptophr.com/js/jscolor/jscolor.js"></script>
</head>
<body>
<?php
/*
 * PHP: read the text contained in an MS Word .doc file
 * Code By Jenen, http://blog.yuing.cn/
 */
if (file_exists($doc['picurl'])) {
    $doccontent = nl2br(doc2text($doc['picurl']));
} elseif (!isset($doccontent)) {
    // Make sure the variable echoed below is always defined.
    $doccontent = '';
}
?>
<?php if (isset($_GET['ac']) && $_GET['ac'] == 'fx') { ?>
<form action="savedoc2text.php" method="post" >
<?php /* expect_id comes from the query string: escape it for the attribute */ ?>
<input name="id" value="<?php echo htmlspecialchars(isset($_GET['expect_id']) ? $_GET['expect_id'] : '', ENT_QUOTES, 'UTF-8'); ?> " type="hidden" >
<textarea name="body" style="display:none"><?php echo $doccontent;