/**
 * Convert binary files to text and ensure the charset is UTF8
 *
 * Files whose name ends in .pdf, .rtf, .odt, .doc or docx are copied to a
 * temporary file under $this->tempdir, run through the matching text
 * extractor, and the result is passed through $this->wordwrap($content, 80).
 * Every other file is returned unchanged when it is valid UTF-8, converted
 * from the 'localewincharset' language-config charset when it validates
 * against that charset, and skipped otherwise.
 *
 * @param object $file moodle storedfile
 * @return string|false the text content, or false when the file was skipped
 *                      because its charset is unknown (possibly a binary file)
 */
protected function get_clear_utf8_content($file) {
    $localewincharset = get_string('localewincharset', 'langconfig');
    $filen = $file->get_filename();

    // Last four characters of the name, lower-cased ('.pdf', '.doc', 'docx', ...).
    $file_type = strtolower(substr($filen, -4));
    $convertible = array('.pdf', '.rtf', '.odt', '.doc', 'docx');

    // in_array() rather than array_search(): array_search() returns the index,
    // and index 0 ('.pdf') is falsy, which silently excluded every PDF file.
    if (in_array($file_type, $convertible, true)) {
        $temp_file = $this->tempdir . "/{$filen}.tmp";
        $file->copy_content_to($temp_file);

        switch ($file_type) {
            case '.pdf':
                $content = pdf2text($temp_file);
                break;

            case '.rtf':
                $content = textlib_get_instance()->entities_to_utf8(rtf2text($temp_file));
                break;

            case '.odt':
                $content = getTextFromZippedXML($temp_file, 'content.xml');
                break;

            case '.doc':
                $antiwordpath = $this->get_config('antiwordpath');
                // Read the first two bytes of the file. The offset must be 0:
                // since PHP 7.1 a negative offset counts from the end of the
                // file, so -1 would read the last byte instead of the header.
                $magic = file_get_contents($temp_file, false, null, 0, 2);
                if ($magic === 'PK') {
                    // It is really a docx
                    $content = getTextFromZippedXML($temp_file, 'word/document.xml');
                } else if (empty($antiwordpath) || !is_executable($antiwordpath)) {
                    // No usable antiword binary configured: use the PHP extractor.
                    $content = textlib_get_instance()->entities_to_utf8(doc2text($temp_file));
                } else {
                    $content = shell_exec($antiwordpath . ' -f -w 0 ' . escapeshellarg($temp_file));
                    if (empty($content)) {
                        // antiword can not recognize this file
                        $content = textlib_get_instance()->entities_to_utf8(doc2text($temp_file));
                    }
                }
                break;

            case 'docx':
                $content = getTextFromZippedXML($temp_file, 'word/document.xml');
                break;
        }

        unlink($temp_file);
        return $this->wordwrap($content, 80);
    }

    // Files that need no format conversion go here
    $content = $file->get_content();
    if (mb_check_encoding($content, 'UTF-8')) {
        return $content;
    }

    if (mb_check_encoding($content, $localewincharset)) {
        // Convert content charset to UTF-8
        return textlib_get_instance()->convert($content, $localewincharset);
    }

    // Unknown charset, possible binary file. Skip it
    mtrace("\tSkip unknown charset/binary file " . $file->get_filepath() . $file->get_filename());
    return false;
}
/**
 * Extract the text of a file, choosing the extractor from the given extension.
 *
 * @param string $path      path of the file to read
 * @param string $extension file extension without a leading dot; compared
 *                          case-insensitively
 * @return string|false|null the extractor's result for pdf/doc/docx/odt/rtf,
 *                           the file_get_contents() result for txt/cpp/java,
 *                           the literal "unknown file type" for any other
 *                           extension, or null when $path is not readable
 */
function tokenizer($path, $extension) {
    if (!is_readable($path)) {
        // The original fell off the end of the function in this case, which
        // yields null; the guard clause just makes that explicit.
        return null;
    }

    switch (strtolower($extension)) {
        case "pdf":
            return pdf2text($path);

        case "doc":
            // ENT_NOQUOTES (0) is the value the former null flags argument
            // coerced to; naming it keeps the output identical while avoiding
            // the PHP 8.1 "passing null to parameter" deprecation.
            return html_entity_decode(doc2text($path), ENT_NOQUOTES, 'UTF-8');

        case "docx":
            return getTextFromZippedXML($path, "word/document.xml");

        case "odt":
            return getTextFromZippedXML($path, "content.xml");

        case "rtf":
            return rtf2text($path);

        // Plain-text formats are returned verbatim.
        case "txt":
        case "cpp":
        case "java":
            return file_get_contents($path);

        default:
            return "unknown file type";
    }
}
/**
 * Build a Lucene document describing the RTF file at $path.
 *
 * The returned document has two fields: a Text field named 'filename' holding
 * the base name of $path, and an UnStored field named 'contents' holding the
 * text returned by rtf2text() for that path.
 *
 * @param string $path path to the RTF file
 * @return Zend_Search_Lucene_Document
 */
function createRTFDocument($path) {
    $document = new Zend_Search_Lucene_Document();

    $nameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $document->addField($nameField);

    $bodyField = Zend_Search_Lucene_Field::UnStored('contents', rtf2text($path));
    $document->addField($bodyField);

    return $document;
}