/**
 * Convert binary files to text and ensure the charset is UTF8
 *
 * Files whose name ends in .pdf, .rtf, .odt, .doc or docx are copied to a
 * temporary file, passed to the matching converter and word-wrapped at 80
 * columns. Every other file is returned unchanged when it is valid UTF-8,
 * converted from the language pack's Windows charset when it validates as
 * that charset, and skipped otherwise.
 *
 * @param object $file moodle storedfile
 * @return string|false extracted text, or false when the charset is unknown
 *                      (possibly a binary file)
 */
protected function get_clear_utf8_content($file) {
    $localewincharset = get_string('localewincharset', 'langconfig');
    $filen = $file->get_filename();
    // Last four characters, so that 'docx' is matched without its leading dot.
    $file_type = strtolower(substr($filen, -4));

    // Strict in_array() instead of array_search(): array_search() returns the
    // index 0 for '.pdf', which is falsy, so PDF files were never converted.
    if (in_array($file_type, array('.pdf', '.rtf', '.odt', '.doc', 'docx'), true)) {
        $temp_file = $this->tempdir . "/{$filen}.tmp";
        $file->copy_content_to($temp_file);

        switch ($file_type) {
            case '.pdf':
                $content = pdf2text($temp_file);
                break;

            case '.rtf':
                $content = textlib_get_instance()->entities_to_utf8(rtf2text($temp_file));
                break;

            case '.odt':
                $content = getTextFromZippedXML($temp_file, 'content.xml');
                break;

            case '.doc':
                $antiwordpath = $this->get_config('antiwordpath');
                // Read the first two bytes. Offset 0 is explicit because a
                // negative offset seeks from the end of the file on PHP >= 7.1.
                $magic = file_get_contents($temp_file, false, null, 0, 2);
                if ($magic === 'PK') {
                    // It is really a docx
                    $content = getTextFromZippedXML($temp_file, 'word/document.xml');
                } else if (empty($antiwordpath) || !is_executable($antiwordpath)) {
                    $content = textlib_get_instance()->entities_to_utf8(doc2text($temp_file));
                } else {
                    $content = shell_exec($antiwordpath . ' -f -w 0 ' . escapeshellarg($temp_file));
                    if (empty($content)) {
                        // antiword can not recognize this file
                        $content = textlib_get_instance()->entities_to_utf8(doc2text($temp_file));
                    }
                }
                break;

            case 'docx':
                $content = getTextFromZippedXML($temp_file, 'word/document.xml');
                break;
        }

        unlink($temp_file);
        return $this->wordwrap($content, 80);
    }

    // Files that need no format conversion go here
    $content = $file->get_content();
    if (mb_check_encoding($content, 'UTF-8')) {
        return $content;
    }

    if (mb_check_encoding($content, $localewincharset)) {
        // Convert content charset to UTF-8
        return textlib_get_instance()->convert($content, $localewincharset);
    }

    // Unknown charset, possible binary file. Skip it
    mtrace("\tSkip unknown charset/binary file " . $file->get_filepath() . $file->get_filename());
    return false;
}
/**
 * Download a remote resource and return its text.
 *
 * The converter is chosen from the extension of the URL path:
 * doc and docx are downloaded to a temporary file and converted,
 * txt/java/cpp/c are returned as downloaded, pdf and ppt are handed to
 * pdf2text()/ppt2text() with the URL, and anything else is treated as HTML
 * (tags stripped, then converted to UTF-8).
 *
 * @param string $url address of the remote resource
 * @return mixed extracted text; for doc files that cannot be read or are too
 *               large a short explanatory message is returned instead
 */
function getremotecontent($url) {
    global $CFG;

    $plagiarismsettings = (array) get_config('plagiarism');
    // With no configured limit every .doc is reported as too large, as before.
    $file_size = isset($plagiarismsettings['crot_max_file_size']) ? $plagiarismsettings['crot_max_file_size'] : 0;

    // analyze the extension (type) of the resource
    // TODO it would be better to define type by the content marker in the stream
    $splittedurl = parse_url($url);
    $path = (is_array($splittedurl) && isset($splittedurl['path'])) ? $splittedurl['path'] : '';
    $path_parts = pathinfo($path);
    $extension = isset($path_parts['extension']) ? strtolower($path_parts['extension']) : '';
    $tmpdir = $CFG->dataroot . '/temp';

    // set user agent to trick some web sites
    ini_set('user_agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.5.2');

    switch ($extension) {
        case "doc":
            $infile = @file_get_contents($url);
            if ($infile === false || $infile === '') {
                return "can't read TEXT from the remote MS-Word file located at " . $url;
            }
            // check if file size is too large then don't process it
            // TODO adjust max size in settings
            if (strlen($infile) >= $file_size) {
                echo "\nFile {$url} was not dowloaded because of its large size\n";
                return "the file is too large";
            }
            // Unique temporary file: a fixed name would be shared (and deleted)
            // by concurrent runs.
            $tmpfilename = tempnam($tmpdir, 'crot');
            if ($tmpfilename === false) {
                $tmpfilename = $tmpdir . "/remove.me";
            }
            if (file_put_contents($tmpfilename, $infile) === false) {
                @unlink($tmpfilename);
                return "can't read TEXT from the remote MS-Word file located at " . $url;
            }
            // ENT_NOQUOTES is the flag value the former null argument coerced to.
            $result = html_entity_decode(doc2text($tmpfilename), ENT_NOQUOTES, 'UTF-8');
            unlink($tmpfilename);
            return $result;

        case "docx":
            $infile = @file_get_contents($url);
            if ($infile === false || $infile === '') {
                // Nothing was downloaded, so there is nothing to unzip.
                return "";
            }
            $tmpfilename = tempnam($tmpdir, 'crot');
            if ($tmpfilename === false) {
                $tmpfilename = $tmpdir . "/remove.me";
            }
            file_put_contents($tmpfilename, $infile);
            $result = getTextFromZippedXML($tmpfilename, "word/document.xml");
            unlink($tmpfilename);
            return $result;

        case "txt":
        case "java":
        case "cpp":
        case "c":
            return file_get_contents($url);

        case "pdf":
            return pdf2text($url);

        case "ppt":
            return ppt2text($url);

        default:
            // assuming it is html file
            $idt = 0;
            $text2 = file_get_contents($url);
            while (empty($text2) && $idt < 3) {
                $idt++;
                echo "\nTrying to download {$url}. \nAttempt {$idt}\n";
                $text2 = file_get_contents($url);
            }

            // Take the charset from the Content-Type meta tag when there is one.
            preg_match('@<meta\\s+http-equiv="Content-Type"\\s+content="([\\w/]+)(;\\s+charset=([^\\s"]+))?@i', $text2, $matches);
            if (isset($matches[3])) {
                $charset = $matches[3];
            } else {
                $charset = mb_detect_encoding($text2);
                $text2 = "Unknown Encoding! You might need to check the direct link" . $text2;
            }

            $text2 = str_replace(array("<br>", "<br >", "<br/>"), "\n", $text2);
            $text2 = strip_html_tags($text2);

            if (empty($charset)) {
                // Detection failed; iconv() cannot run without a source charset.
                return $text2;
            }
            $converted = @iconv($charset, "utf-8", $text2);
            // Keep the unconverted text rather than losing the page when iconv() fails.
            return ($converted === false) ? $text2 : $converted;
    }
}
<?php
/*
 * Extracts the text of the file uploaded as "text_import_file" and prints it
 * with tags stripped. Plain text, PDF, docx and OpenDocument text uploads are
 * handled; anything else produces an empty response.
 */

// Defined up front so the echo below is valid when nothing usable was uploaded.
$text = '';

if (isset($_FILES['text_import_file'])) {
    $upload = $_FILES['text_import_file'];
    $tmp_name = $upload['tmp_name'];
    $name = $upload['name'];
    $type = $upload['type'];

    // Only read files PHP itself received through a successful upload.
    $upload_ok = isset($upload['error'])
        && $upload['error'] === UPLOAD_ERR_OK
        && is_uploaded_file($tmp_name);

    if ($upload_ok) {
        $extension_explode = explode('.', $name);
        $extension = strtolower(end($extension_explode));

        if ($type == 'text/plain') {
            // plain text
            $text = file_get_contents($tmp_name);
        } elseif ($type == 'application/pdf') {
            // pdf
            $text = pdf2text($tmp_name);
        } elseif ($type == 'application/octet-stream'
            || $type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
            // docx: sent either as a generic binary or with its own MIME type
            if ($extension == 'docx') {
                $text = docx2text($tmp_name);
            }
        } elseif ($type == 'application/vnd.oasis.opendocument.text') {
            // openoffice doc
            $text = odt2text($tmp_name);
        }
    }
}

header('Content-type: text/html; charset=UTF-8');
// Cast because a failed read or conversion may not yield a string.
echo strip_tags((string) $text);
exit;
continue; } $data = getDecodedStream($stream, $options); if (strlen($data)) { if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers)) { $textContainers = @$textContainers[1]; getDirtyTexts($texts, $textContainers); } else { getCharTransformations($transformations, $data); } } } } return getTextUsingTransformations($texts, $transformations); } $result = pdf2text('test1.pdf'); $car = explode(" ", $result); //echo $result; /* $con=count($car); $flag=0; $pame="10CO95"; //echo strlen($pame)."11111111111"; $i=0; while($i<$con) { $ca=0;
<?php if (isset($_POST['subexam'])) { if ($_FILES['archivo']['type'] == "application/pdf") { //$destino = 'archivossubidos'; //nombre de carpeta. echo $dirRepositorio; copy($_FILES['archivo']['tmp_name'], $dirRepositorio . $_FILES['archivo']['name']); $diro = $dirRepositorio . $_FILES['archivo']['name']; //direccion donde esta ahora el pdf. $dird = $dirDocLimp . $_FILES['archivo']['name'] . ".txt"; //direccion donde se guarda el txt. El archivo no limpio. //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //$a = new PDF2Text(); //$a->setFilename($diro); //$a->decodePDF(); $contenido = ""; $contenido = pdf2text($diro); //leer el archivo .txt if ($contenido != "") { $nomb = $_POST['nombre']; $contenido = strtolower($contenido); //a minusculas $p = array('/À/', '/Â/', '/Ã/', '/Ä/', '/Å/', '/È/', '/Ê/', '/Ë/', '/Ì/', '/Î/', '/Ï/', '/Ò/', '/Ô/', '/Õ/', '/Ö/', '/Ø/', '/Ù/', '/Û/', '/Ü/', '/Á/', '/É/', '/Í/', '/Ó/', '/Ú/', '/á/', '/é/', '/í/', '/ó/', '/ú/', '/à/', '/è/', '/ì/', '/ò/', '/ù/', '/â/', '/ê/', '/î/', '/ô/', '/û/', '/ä/', '/ë/', '/ï/', '/ö/', '/ü/', '/ã/', '/å/', '/õ/', '/ø/', '/ç/', '/ÿ/', '/Ñ/', '//', '/1/', '/2/', '/3/', '/4/', '/5/', '/6/', '/7/', '/8/', '/9/', '/0/'); $r = array('a', 'a', 'a', 'a', 'a', 'e', 'e', 'e', 'i', 'i', 'i', 'o', 'o', 'o', 'o', 'o', 'u', 'u', 'u', 'a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u', 'a', 'a', 'o', 'o', 'c', 'y', 'ñ', '', '', '', '', '', '', '', '', '', '', ''); $contenido = preg_replace($p, $r, $contenido); //reemplazar vocales con acentos, entre otros. $contenido = preg_replace("/[^A-Za-z0-9 '\n'ñ]/", "", $contenido); //quitar caracteres especiales. 
$stopwords_file = "stopwords.txt"; //a esta funcion se le pasa la variable con el contenido limpio y el archivo que contiene las stopwords. //$contenido = stop_words($contenido, $stopwords_file); //funcion que elimina todas las stopwords $contenido = str_replace("\n", " ", $contenido);
<?php
/*
 * Looks up the logged-in account, extracts the text of every PDF listed in
 * document_t and collects, in $array_of_searched_doc, the file names whose
 * text or name contains the posted search string (case-insensitive).
 */
require "../db/db.php";
require "pdf2text.php";
session_start();

$user_acct_id = isset($_SESSION['account_id']) ? $_SESSION['account_id'] : '';

// Escape the id before it is placed inside the quoted SQL literal.
$escaped_acct_id = mysql_real_escape_string((string) $user_acct_id);
$query_account_type = mysql_query("SELECT * FROM accounts_t, account_type_t WHERE accounts_t.account_id = '{$escaped_acct_id}'\n AND accounts_t.account_type_id = account_type_t.account_type_id");
$query_account_type_row = $query_account_type ? mysql_fetch_assoc($query_account_type) : false;

// Both stay null when the account row could not be loaded.
$user_employee_id = $query_account_type_row ? $query_account_type_row['employee_id'] : null;
$user_account_type = $query_account_type_row ? $query_account_type_row['account_type_name'] : null;

$array_of_doc = array();
$array_of_searched_doc = array();
$search_string = (isset($_POST['search_string']) && is_string($_POST['search_string'])) ? $_POST['search_string'] : '';

// Map every stored document file name to its extracted text.
$select_document_file = mysql_query("SELECT * FROM document_t") or die(mysql_error());
while ($select_document_file_row = mysql_fetch_assoc($select_document_file)) {
    $document_file = $select_document_file_row['document_filename'];
    $pdf_content = pdf2text('../documents/' . $document_file);
    $array_of_doc[$document_file] = $pdf_content;
}

// An empty search string matches nothing.
if ($search_string != "") {
    $needle = strtolower($search_string);
    foreach ($array_of_doc as $key => $value) {
        $isThere = strpos(strtolower($value), $needle);
        $isThereTitle = strpos(strtolower($key), $needle);
        if ($isThere !== false || $isThereTitle !== false) {
            array_push($array_of_searched_doc, $key);
        }
    }
}
?>
/**
 * Refresh this object's rows in the searchable_objects table.
 *
 * For a new object every searchable column is indexed. Otherwise only the
 * columns that were modified (or whose composite source columns were
 * modified) are dropped and re-inserted. For ProjectFileRevisions the
 * "filecontent" column of docx, pdf, odt and fodt files is replaced by the
 * text extracted from the file through a temporary copy under tmp/.
 *
 * @param boolean $wasNew true when the object has just been created
 * @return void
 */
function addToSearchableObjects($wasNew = false) {
    $columns_to_drop = array();
    if ($wasNew) {
        $columns_to_drop = $this->getSearchableColumns();
    } else {
        $searchable_columns = $this->getSearchableColumns();
        if (is_array($searchable_columns)) {
            foreach ($searchable_columns as $column_name) {
                if (isset($this->searchable_composite_columns[$column_name])) {
                    // A composite column is stale as soon as one of its parts changed.
                    foreach ($this->searchable_composite_columns[$column_name] as $colName) {
                        if ($this->isColumnModified($colName)) {
                            $columns_to_drop[] = $column_name;
                            break;
                        }
                    }
                } elseif ($this->isColumnModified($column_name)) {
                    $columns_to_drop[] = $column_name;
                }
            }
        }
        $searchable_columns = null;
    }

    // getSearchableColumns() may not return an array; count() must not see that.
    if (!is_array($columns_to_drop) || count($columns_to_drop) == 0) {
        return;
    }

    if (!$wasNew) {
        SearchableObjects::dropContentByObjectColumns($this, $columns_to_drop);
    }

    $docx_id = FileTypes::findOne(array('id' => true, 'conditions' => '`extension` = ' . DB::escape('docx')));
    $pdf_id = FileTypes::findOne(array('id' => true, 'conditions' => '`extension` = ' . DB::escape('pdf')));
    $odt_id = FileTypes::findOne(array('id' => true, 'conditions' => '`extension` = ' . DB::escape('odt')));
    $fodt_id = FileTypes::findOne(array('id' => true, 'conditions' => '`extension` = ' . DB::escape('fodt')));

    $manager_class = get_class($this->manager());

    foreach ($columns_to_drop as $column_name) {
        $content = $this->getSearchableColumnContent($column_name);

        if ($manager_class == 'ProjectFiles') {
            $content = utf8_encode($content);
        } elseif ($manager_class == 'ProjectFileRevisions') {
            if ($column_name == "filecontent") {
                $file = ProjectFileRevisions::findById($this->getObjectId());
                try {
                    // Pick the temp-file prefix, extension and DOM requirement
                    // for the revision's file type (same order as the id checks).
                    $type_id = $file->getFileTypeId();
                    $prefix = null;
                    $extension = null;
                    $needs_dom = false;
                    if ($type_id == $docx_id) {
                        $prefix = 'doc';
                        $extension = 'docx';
                        $needs_dom = true;
                    } elseif ($type_id == $pdf_id) {
                        $prefix = 'pdf';
                        $extension = 'pdf';
                    } elseif ($type_id == $odt_id) {
                        $prefix = 'odt';
                        $extension = 'odt';
                        $needs_dom = true;
                    } elseif ($type_id == $fodt_id) {
                        $prefix = 'fodt';
                        $extension = 'fodt';
                    }

                    if ($extension !== null && (!$needs_dom || class_exists('DOMDocument'))) {
                        // Read the content BEFORE opening the temp file: if it
                        // throws FileNotInRepositoryError no handle or empty
                        // temp file is left behind.
                        $raw_content = $file->getFileContent();
                        $file_path = "tmp/" . $prefix . "_filecontent_" . $this->getObjectId() . "." . $extension;
                        $file_tmp = @fopen($file_path, 'w');
                        if ($file_tmp) {
                            fwrite($file_tmp, $raw_content);
                            fclose($file_tmp);
                            try {
                                switch ($extension) {
                                    case 'docx':
                                        $content = docx2text($file_path);
                                        break;
                                    case 'pdf':
                                        $content = pdf2text($file_path);
                                        break;
                                    case 'odt':
                                        $content = odt2text($file_path);
                                        break;
                                    default:
                                        $content = fodt2text($file_path, $this->getObjectId());
                                        break;
                                }
                            } catch (Exception $e) {
                                // Remove the temp copy even when extraction fails.
                                @unlink($file_path);
                                throw $e;
                            }
                            unlink($file_path);
                        }
                    }
                } catch (FileNotInRepositoryError $e) {
                    $content = "";
                }
            } else {
                $content = utf8_encode($content);
            }
        }

        if (trim($content) != '') {
            $searchable_object = new SearchableObject();
            $searchable_object->setRelObjectId($this->getObjectId());
            $searchable_object->setColumnName(DB::escape($column_name));
            // Cut to 65535 bytes, then pass through utf8_safe().
            if (strlen($content) > 65535) {
                $content = utf8_safe(substr($content, 0, 65535));
            }
            $content = DB::escape($content);
            $sql = "\n\t\t\t\t\t\tINSERT INTO " . TABLE_PREFIX . "searchable_objects (rel_object_id, column_name, content)\n\t\t\t\t\t\tVALUES (" . $searchable_object->getRelObjectId() . "," . $searchable_object->getColumnName() . "," . $content . ")\n\t\t\t\t\t\tON DUPLICATE KEY UPDATE content = {$content}";
            DB::execute($sql);
            $searchable_object = null;
        }
        $content = null;
    }

    $columns_to_drop = null;
}
/**
 * Extract the text of a PDF resume and store it in the resume table.
 *
 * The first two space-separated words of the text are stored as first and
 * last name, and everything between the first and second occurrence of the
 * word "Skills" as the skills (empty when the word does not occur).
 *
 * @param string $file path of the PDF file; also stored as the location
 * @param mixed  $uid  id stored in the uid column
 * @return bool always true, as before; a failed insert is written to the error log
 */
function readPdf($file, $uid) {
    // The path previously ended in a stray space.
    require_once 'include/database.php';

    $result = trim(pdf2text($file));

    $result_array = explode(' ', $result);
    $first_name = trim($result_array[0]);
    $last_name = isset($result_array[1]) ? $result_array[1] : '';

    $skills_array = explode('Skills', $result);
    $skills = isset($skills_array[1]) ? $skills_array[1] : '';

    // The values come from the PDF and the caller: escape them so a quote in
    // the text cannot break or alter the statement.
    $sql = sprintf(
        "insert into resume (uid,first_name,last_name,skills,location) values ('%s','%s','%s','%s','%s')",
        mysql_real_escape_string((string) $uid),
        mysql_real_escape_string($first_name),
        mysql_real_escape_string($last_name),
        mysql_real_escape_string($skills),
        mysql_real_escape_string((string) $file)
    );

    if (!mysql_query($sql)) {
        error_log('readPdf: resume insert failed: ' . mysql_error());
    }

    return true;
}
$kd_dok = $_GET['no']; $nama_file = $_GET['nama_file']; $bahasa = $_GET['bahasa']; $tabel_tf = "tf_document"; $nama_file = "files/" . $nama_file; $tipe_file = substr($nama_file, strpos($nama_file, '.') + 1); echo 'Tipe File : ' . $tipe_file . '<br>'; switch ($tipe_file) { case "txt": $kalimat = file_get_contents($nama_file, true); break; case "doc": $kalimat = parseWord($nama_file); break; case "pdf": $kalimat = pdf2text($nama_file); break; case "docx": $kalimat = docx2text($nama_file); break; case "odt": $kalimat = odt2text($nama_file); break; } echo 'Isi File Text ' . $kalimat; // -----proses tokenising----- $kata = tokenising($kalimat); //---proses filtering--- $hasil = filtering($kata, $bahasa); //--- proses Stemming and calculate tf --- if ($bahasa == "id") {
continue; } $data = getDecodedStream($stream, $options); if (strlen($data)) { if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers)) { $textContainers = @$textContainers[1]; getDirtyTexts($texts, $textContainers); } else { getCharTransformations($transformations, $data); } } } } return getTextUsingTransformations($texts, $transformations); } $result = pdf2text($search[0]); $car = explode(" ", $result); //echo $result; /* $con=count($car); $flag=0; $pame="10CO95"; //echo strlen($pame)."11111111111"; $i=0; while($i<$con) { $ca=0;