コード例 #1
1
 /**
  * Return the textual content of a stored file, converting known binary
  * document formats to text and other files to UTF-8.
  *
  * Files whose name ends in .pdf, .rtf, .odt, .doc or docx are copied to a
  * temporary file, run through the matching converter and word-wrapped at
  * 80 columns. Any other file is returned as-is when it is valid UTF-8, or
  * converted from the locale's Windows charset when it validates against it.
  *
  * @param object $file moodle storedfile
  * @return string|false the text content, or false when the charset is unknown
  *                      (the file is then assumed to be binary and skipped)
  */
 protected function get_clear_utf8_content($file)
 {
     $localewincharset = get_string('localewincharset', 'langconfig');
     $filen = $file->get_filename();
     // Last four characters: a dot plus three letters, or "docx" without the dot.
     $file_type = strtolower(substr($filen, strlen($filen) - 4, 4));
     // in_array() instead of array_search(): array_search() returns the index 0
     // for '.pdf', which is falsy and made PDF files skip the conversion branch.
     if (in_array($file_type, array('.pdf', '.rtf', '.odt', '.doc', 'docx'), true)) {
         $temp_file = $this->tempdir . "/{$filen}.tmp";
         $file->copy_content_to($temp_file);
         switch ($file_type) {
             case '.pdf':
                 $content = pdf2text($temp_file);
                 break;
             case '.rtf':
                 $content = textlib_get_instance()->entities_to_utf8(rtf2text($temp_file));
                 break;
             case '.odt':
                 $content = getTextFromZippedXML($temp_file, 'content.xml');
                 break;
             case '.doc':
                 $antiwordpath = $this->get_config('antiwordpath');
                 // Read the first two bytes. Offset 0 is explicit because a negative
                 // offset counts from the end of the file on PHP >= 7.1.
                 $magic = file_get_contents($temp_file, false, null, 0, 2);
                 if ($magic === 'PK') {
                     // Zip signature: it is really a docx with a .doc extension
                     $content = getTextFromZippedXML($temp_file, 'word/document.xml');
                 } else {
                     if (empty($antiwordpath) || !is_executable($antiwordpath)) {
                         $content = textlib_get_instance()->entities_to_utf8(doc2text($temp_file));
                     } else {
                         $content = shell_exec($antiwordpath . ' -f -w 0 ' . escapeshellarg($temp_file));
                         if (empty($content)) {
                             // antiword can not recognize this file, fall back to doc2text
                             $content = textlib_get_instance()->entities_to_utf8(doc2text($temp_file));
                         }
                     }
                 }
                 break;
             case 'docx':
                 $content = getTextFromZippedXML($temp_file, 'word/document.xml');
                 break;
         }
         unlink($temp_file);
         return $this->wordwrap($content, 80);
     }
     // Files that need no format conversion go here
     $content = $file->get_content();
     if (!mb_check_encoding($content, 'UTF-8')) {
         if (mb_check_encoding($content, $localewincharset)) {
             // Convert content charset to UTF-8
             $content = textlib_get_instance()->convert($content, $localewincharset);
         } else {
             // Unknown charset, possible binary file. Skip it
             mtrace("\tSkip unknown charset/binary file " . $file->get_filepath() . $file->get_filename());
             return false;
         }
     }
     return $content;
 }
コード例 #2
0
/**
 * Download a remote resource and return its content as text.
 *
 * The converter is chosen from the file extension found in the URL path:
 * doc and docx are downloaded to a temporary file and converted, plain
 * source/text files are returned unchanged, pdf and ppt URLs are handed to
 * their converters, and anything else is treated as HTML, stripped of tags
 * and converted to UTF-8.
 *
 * @param string $url address of the remote resource
 * @return mixed the extracted text, or a short explanatory message when a
 *               Word document could not be read or is too large
 */
function getremotecontent($url)
{
    global $CFG;
    $plagiarismsettings = (array) get_config('plagiarism');
    $file_size = $plagiarismsettings['crot_max_file_size'];
    // analyze the extension (type) of the resource
    // TODO it would be better to define type by the content marker in the stream
    $splittedurl = parse_url($url);
    // parse_url() returns false for malformed URLs and omits "path" for host-only URLs
    $path = (is_array($splittedurl) && isset($splittedurl['path'])) ? $splittedurl['path'] : '';
    $path_parts = pathinfo($path);
    $extension = isset($path_parts['extension']) ? strtolower($path_parts['extension']) : '';
    $tmpfilename = $CFG->dataroot . '/temp' . "/remove.me";
    // set user agent to trick some web sites
    ini_set('user_agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.5.2');
    switch ($extension) {
        case "doc":
            // download and save;
            $infile = @file_get_contents($url, FILE_BINARY);
            if ($infile === false || strlen($infile) == 0) {
                return "can't read TEXT from the remote MS-Word file located at " . $url;
            }
            // Check the size before writing so oversized downloads never reach the disk
            //TODO adjust max size in settings
            if (strlen($infile) >= $file_size) {
                echo "\nFile {$url} was not dowloaded because of its large size\n";
                return "the file is  too large";
            }
            file_put_contents($tmpfilename, $infile, FILE_BINARY);
            // ENT_NOQUOTES is the value the former null flag was coerced to
            $result = html_entity_decode(doc2text($tmpfilename), ENT_NOQUOTES, 'UTF-8');
            unlink($tmpfilename);
            return $result;
        case "docx":
            // download and save;
            $infile = @file_get_contents($url, FILE_BINARY);
            if ($infile === false || strlen($infile) == 0) {
                // Nothing was downloaded, so there is no archive to unpack
                return "can't read TEXT from the remote MS-Word file located at " . $url;
            }
            file_put_contents($tmpfilename, $infile, FILE_BINARY);
            $result = getTextFromZippedXML($tmpfilename, "word/document.xml");
            unlink($tmpfilename);
            return $result;
        case "txt":
        case "java":
        case "cpp":
        case "c":
            // plain text formats are returned unchanged
            return file_get_contents($url);
        case "pdf":
            return pdf2text($url);
        case "ppt":
            return ppt2text($url);
        default:
            // assuming it is html file; retry up to three more times on an empty answer
            $idt = 0;
            $text2 = file_get_contents($url);
            while (empty($text2) && $idt < 3) {
                $idt++;
                echo "\nTrying to download {$url}. Attempt {$idt}\n";
                $text2 = file_get_contents($url);
            }
            // Look for the charset declared in the Content-Type meta tag
            preg_match('@<meta\\s+http-equiv="Content-Type"\\s+content="([\\w/]+)(;\\s+charset=([^\\s"]+))?@i', $text2, $matches);
            if (isset($matches[3])) {
                $charset = $matches[3];
            } else {
                $charset = mb_detect_encoding($text2);
                $text2 = "Unknown Encoding! You might need to check the direct link" . $text2;
            }
            $text2 = str_replace("<br>", "\n", $text2);
            $text2 = str_replace("<br >", "\n", $text2);
            $text2 = str_replace("<br/>", "\n", $text2);
            $text2 = strip_html_tags($text2);
            if (empty($charset)) {
                // mb_detect_encoding() found nothing; return the text without conversion
                return $text2;
            }
            return @iconv($charset, "utf-8", $text2);
    }
}
コード例 #3
0
ファイル: text.php プロジェクト: rickterheide/text_import
<?php

// Extract the text of an uploaded file (field "text_import_file") and print it
// with all tags stripped. The converter is selected from the MIME type the
// browser reported; unsupported types produce an empty response.

// Initialised up front so the final echo is defined even when nothing was uploaded
$text = '';
if (isset($_FILES['text_import_file']) && is_uploaded_file($_FILES['text_import_file']['tmp_name'])) {
    $tmp_name = $_FILES['text_import_file']['tmp_name'];
    $name = $_FILES['text_import_file']['name'];
    $type = $_FILES['text_import_file']['type'];
    switch ($type) {
        case 'text/plain':
            // plain text
            $text = file_get_contents($tmp_name);
            break;
        case 'application/pdf':
            // pdf
            $text = pdf2text($tmp_name);
            break;
        case 'application/octet-stream':
            // docx is reported as a generic binary stream, so fall back to the extension
            $extension = strtolower(pathinfo($name, PATHINFO_EXTENSION));
            if ($extension == 'docx') {
                $text = docx2text($tmp_name);
            }
            break;
        case 'application/vnd.oasis.opendocument.text':
            // openoffice doc
            $text = odt2text($tmp_name);
            break;
    }
}
header('Content-type: text/html; charset=UTF-8');
echo strip_tags($text);
exit;
コード例 #4
0
ファイル: test.php プロジェクト: NaveenKS/Text-App
                continue;
            }
            $data = getDecodedStream($stream, $options);
            if (strlen($data)) {
                if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers)) {
                    $textContainers = @$textContainers[1];
                    getDirtyTexts($texts, $textContainers);
                } else {
                    getCharTransformations($transformations, $data);
                }
            }
        }
    }
    return getTextUsingTransformations($texts, $transformations);
}
// Extract the text of test1.pdf and split it on single spaces into $car.
$result = pdf2text('test1.pdf');
$car = explode(" ", $result);
//echo $result;
/*

$con=count($car);

$flag=0;
$pame="10CO95";

//echo strlen($pame)."11111111111";
$i=0;

while($i<$con)
{
$ca=0;
コード例 #5
0
<?php 
if (isset($_POST['subexam'])) {
    if ($_FILES['archivo']['type'] == "application/pdf") {
        //$destino = 'archivossubidos';   //nombre de carpeta.
        echo $dirRepositorio;
        copy($_FILES['archivo']['tmp_name'], $dirRepositorio . $_FILES['archivo']['name']);
        $diro = $dirRepositorio . $_FILES['archivo']['name'];
        //direccion donde esta ahora el pdf.
        $dird = $dirDocLimp . $_FILES['archivo']['name'] . ".txt";
        //direccion donde se guarda el txt. El archivo no limpio.
        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        //$a = new PDF2Text();
        //$a->setFilename($diro);
        //$a->decodePDF();
        $contenido = "";
        $contenido = pdf2text($diro);
        //leer el archivo .txt
        if ($contenido != "") {
            $nomb = $_POST['nombre'];
            $contenido = strtolower($contenido);
            //a minusculas
            $p = array('/À/', '/Â/', '/Ã/', '/Ä/', '/Å/', '/È/', '/Ê/', '/Ë/', '/Ì/', '/Î/', '/Ï/', '/Ò/', '/Ô/', '/Õ/', '/Ö/', '/Ø/', '/Ù/', '/Û/', '/Ü/', '/Á/', '/É/', '/Í/', '/Ó/', '/Ú/', '/á/', '/é/', '/í/', '/ó/', '/ú/', '/à/', '/è/', '/ì/', '/ò/', '/ù/', '/â/', '/ê/', '/î/', '/ô/', '/û/', '/ä/', '/ë/', '/ï/', '/ö/', '/ü/', '/ã/', '/å/', '/õ/', '/ø/', '/ç/', '/ÿ/', '/Ñ/', '//', '/1/', '/2/', '/3/', '/4/', '/5/', '/6/', '/7/', '/8/', '/9/', '/0/');
            $r = array('a', 'a', 'a', 'a', 'a', 'e', 'e', 'e', 'i', 'i', 'i', 'o', 'o', 'o', 'o', 'o', 'u', 'u', 'u', 'a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u', 'a', 'a', 'o', 'o', 'c', 'y', 'ñ', '', '', '', '', '', '', '', '', '', '', '');
            $contenido = preg_replace($p, $r, $contenido);
            //reemplazar vocales con acentos, entre otros.
            $contenido = preg_replace("/[^A-Za-z0-9 '\n'ñ]/", "", $contenido);
            //quitar caracteres especiales.
            $stopwords_file = "stopwords.txt";
            //a esta funcion se le pasa la variable con el contenido limpio y el archivo que contiene las stopwords.
            //$contenido = stop_words($contenido, $stopwords_file);				//funcion que elimina todas las stopwords
            $contenido = str_replace("\n", " ", $contenido);
コード例 #6
0
 <?php 
require "../db/db.php";
require "pdf2text.php";
session_start();
// Look up the employee id and account type of the logged-in account.
$user_acct_id = isset($_SESSION['account_id']) ? $_SESSION['account_id'] : '';
// The id is escaped before being placed into the SQL string
$query_account_type = mysql_query("SELECT * FROM accounts_t, account_type_t WHERE accounts_t.account_id = '" . mysql_real_escape_string($user_acct_id) . "'\n                  AND accounts_t.account_type_id = account_type_t.account_type_id");
$query_account_type_row = $query_account_type ? mysql_fetch_assoc($query_account_type) : false;
$user_employee_id = is_array($query_account_type_row) ? $query_account_type_row['employee_id'] : null;
$user_account_type = is_array($query_account_type_row) ? $query_account_type_row['account_type_name'] : null;
$array_of_doc = array();
$array_of_searched_doc = array();
// A missing or non-string search field is treated as an empty search
$search_string = (isset($_POST['search_string']) && is_string($_POST['search_string'])) ? $_POST['search_string'] : '';
// Extract the text of every stored PDF, keyed by its file name.
$select_document_file = mysql_query("SELECT * FROM document_t") or die(mysql_error());
while ($select_document_file_row = mysql_fetch_assoc($select_document_file)) {
    $document_file = $select_document_file_row['document_filename'];
    $pdf_content = pdf2text('../documents/' . $document_file . '');
    $array_of_doc[$document_file] = $pdf_content;
}
// Collect the documents whose text or file name contains the search string
// (case-insensitive). An empty search string matches nothing.
if ($search_string != "") {
    foreach ($array_of_doc as $key => $value) {
        $isThere = stripos((string) $value, $search_string);
        $isThereTitle = stripos((string) $key, $search_string);
        if ($isThere !== false || $isThereTitle !== false) {
            array_push($array_of_searched_doc, $key);
        }
    }
}
?>
コード例 #7
0
 /**
  * Rebuild this object's rows in the searchable_objects table.
  *
  * For a new object every searchable column is indexed. For an existing
  * object only the searchable columns that were modified (or whose composite
  * source columns were modified) are dropped and re-inserted. For the
  * "filecontent" column of a file revision the text is extracted from
  * docx, pdf, odt and fodt files through a temporary file.
  *
  * @param boolean $wasNew true when the object has just been created
  * @return void
  */
 function addToSearchableObjects($wasNew = false)
 {
     $columns_to_drop = array();
     if ($wasNew) {
         $columns_to_drop = $this->getSearchableColumns();
         // Guard the count() below against a non-array answer
         if (!is_array($columns_to_drop)) {
             $columns_to_drop = array();
         }
     } else {
         $searchable_columns = $this->getSearchableColumns();
         if (is_array($searchable_columns)) {
             foreach ($searchable_columns as $column_name) {
                 if (isset($this->searchable_composite_columns[$column_name])) {
                     // A composite column is stale as soon as one of its sources changed
                     foreach ($this->searchable_composite_columns[$column_name] as $colName) {
                         if ($this->isColumnModified($colName)) {
                             $columns_to_drop[] = $column_name;
                             break;
                         }
                     }
                 } else {
                     if ($this->isColumnModified($column_name)) {
                         $columns_to_drop[] = $column_name;
                     }
                 }
             }
         }
         $searchable_columns = null;
     }
     if (count($columns_to_drop) > 0) {
         if (!$wasNew) {
             SearchableObjects::dropContentByObjectColumns($this, $columns_to_drop);
         }
         $docx_id = FileTypes::findOne(array('id' => true, 'conditions' => '`extension` = ' . DB::escape('docx')));
         $pdf_id = FileTypes::findOne(array('id' => true, 'conditions' => '`extension` = ' . DB::escape('pdf')));
         $odt_id = FileTypes::findOne(array('id' => true, 'conditions' => '`extension` = ' . DB::escape('odt')));
         $fodt_id = FileTypes::findOne(array('id' => true, 'conditions' => '`extension` = ' . DB::escape('fodt')));
         foreach ($columns_to_drop as $column_name) {
             $content = $this->getSearchableColumnContent($column_name);
             $manager_class = get_class($this->manager());
             if ($manager_class == 'ProjectFiles') {
                 $content = utf8_encode($content);
             } elseif ($manager_class == 'ProjectFileRevisions') {
                 if ($column_name == "filecontent") {
                     $file = ProjectFileRevisions::findById($this->getObjectId());
                     // Skip extraction when the revision cannot be loaded
                     if (is_object($file)) {
                         try {
                             $type_id = $file->getFileTypeId();
                             // docx and odt extraction is only attempted when DOMDocument is available
                             $has_dom = class_exists('DOMDocument');
                             if ($type_id == $docx_id) {
                                 if ($has_dom) {
                                     $content = $this->revisionFileToText($file, 'doc', 'docx', $content);
                                 }
                             } elseif ($type_id == $pdf_id) {
                                 $content = $this->revisionFileToText($file, 'pdf', 'pdf', $content);
                             } elseif ($type_id == $odt_id) {
                                 if ($has_dom) {
                                     $content = $this->revisionFileToText($file, 'odt', 'odt', $content);
                                 }
                             } elseif ($type_id == $fodt_id) {
                                 $content = $this->revisionFileToText($file, 'fodt', 'fodt', $content);
                             }
                         } catch (FileNotInRepositoryError $e) {
                             $content = "";
                         }
                     }
                 } else {
                     $content = utf8_encode($content);
                 }
             }
             if (trim($content) != '') {
                 $searchable_object = new SearchableObject();
                 $searchable_object->setRelObjectId($this->getObjectId());
                 $searchable_object->setColumnName(DB::escape($column_name));
                 // Truncate to 65535 bytes before storing
                 if (strlen($content) > 65535) {
                     $content = utf8_safe(substr($content, 0, 65535));
                 }
                 $content = DB::escape($content);
                 $sql = "\n\t\t\t\t\t\tINSERT INTO " . TABLE_PREFIX . "searchable_objects (rel_object_id, column_name, content)\n\t\t\t\t\t\tVALUES (" . $searchable_object->getRelObjectId() . "," . $searchable_object->getColumnName() . "," . $content . ")\n\t\t\t\t\t\tON DUPLICATE KEY UPDATE content = {$content}";
                 DB::execute($sql);
                 $searchable_object = null;
             }
             $content = null;
         }
     }
     $columns_to_drop = null;
 }

 /**
  * Write a revision's content to a temporary file under tmp/, convert it to
  * text and delete the temporary file, also when the conversion throws.
  *
  * @param object $file      revision object providing getFileContent()
  * @param string $prefix    leading part of the temporary file name
  * @param string $extension one of docx, pdf, odt, fodt; selects the converter
  * @param mixed  $fallback  value returned when the temporary file cannot be
  *                          opened or the extension is not supported
  * @return mixed the converter's result, or $fallback
  */
 private function revisionFileToText($file, $prefix, $extension, $fallback)
 {
     $file_path = "tmp/" . $prefix . "_filecontent_" . $this->getObjectId() . "." . $extension;
     $file_tmp = @fopen($file_path, 'w');
     if (!$file_tmp) {
         return $fallback;
     }
     $text = $fallback;
     try {
         fwrite($file_tmp, $file->getFileContent());
         fclose($file_tmp);
         $file_tmp = null;
         switch ($extension) {
             case 'docx':
                 $text = docx2text($file_path);
                 break;
             case 'pdf':
                 $text = pdf2text($file_path);
                 break;
             case 'odt':
                 $text = odt2text($file_path);
                 break;
             case 'fodt':
                 $text = fodt2text($file_path, $this->getObjectId());
                 break;
         }
     } catch (Exception $e) {
         // Release the handle and remove the temporary file before propagating
         if (is_resource($file_tmp)) {
             fclose($file_tmp);
         }
         @unlink($file_path);
         throw $e;
     }
     unlink($file_path);
     return $text;
 }
コード例 #8
0
/**
 * Extract a name and a skills section from a PDF resume and store them in
 * the resume table.
 *
 * The first two space-separated words of the extracted text are used as
 * first and last name; the text following the first occurrence of "Skills"
 * (up to the next occurrence, if any) is stored as the skills.
 *
 * @param string $file path of the PDF file; also stored as the location
 * @param mixed  $uid  id of the user owning the resume
 * @return boolean always true, whether or not the insert succeeded
 */
function readPdf($file, $uid)
{
    // The path had a trailing space, which does not name the intended file
    require_once 'include/database.php';
    $result = trim(pdf2text($file));
    $result_array = explode(' ', $result);
    $first_name = trim($result_array[0]);
    // Text without a second word or a "Skills" marker yields empty fields
    $last_name = isset($result_array[1]) ? $result_array[1] : '';
    $skills_array = explode('Skills', $result);
    $skills = isset($skills_array[1]) ? $skills_array[1] : '';
    // Every value is escaped: the PDF text is untrusted and may contain quotes
    $sql = sprintf(
        "insert into resume (uid,first_name,last_name,skills,location) values ('%s','%s','%s','%s','%s')",
        mysql_real_escape_string($uid),
        mysql_real_escape_string($first_name),
        mysql_real_escape_string($last_name),
        mysql_real_escape_string($skills),
        mysql_real_escape_string($file)
    );
    mysql_query($sql);
    return true;
}
コード例 #9
0
ファイル: tf_dok.php プロジェクト: abbeet/server39
// Read the request parameters; missing ones default to an empty string.
$kd_dok = isset($_GET['no']) ? $_GET['no'] : '';
$nama_file = isset($_GET['nama_file']) ? (string) $_GET['nama_file'] : '';
$bahasa = isset($_GET['bahasa']) ? $_GET['bahasa'] : '';
$tabel_tf = "tf_document";
// Refuse ".." path segments so the request cannot leave the files/ directory
if (preg_match('#(^|[\\\\/])\.\.([\\\\/]|$)#', $nama_file)) {
    $nama_file = basename($nama_file);
}
$nama_file = "files/" . $nama_file;
// The extension is what follows the last dot, compared case-insensitively
$tipe_file = strtolower(pathinfo($nama_file, PATHINFO_EXTENSION));
echo 'Tipe File : ' . htmlspecialchars($tipe_file, ENT_QUOTES, 'UTF-8') . '<br>';
// Stays empty for unsupported file types
$kalimat = '';
switch ($tipe_file) {
    case "txt":
        $kalimat = file_get_contents($nama_file, true);
        if ($kalimat === false) {
            $kalimat = '';
        }
        break;
    case "doc":
        $kalimat = parseWord($nama_file);
        break;
    case "pdf":
        $kalimat = pdf2text($nama_file);
        break;
    case "docx":
        $kalimat = docx2text($nama_file);
        break;
    case "odt":
        $kalimat = odt2text($nama_file);
        break;
}
echo 'Isi File Text ' . $kalimat;
// ----- tokenising -----
$kata = tokenising($kalimat);
// ----- filtering -----
$hasil = filtering($kata, $bahasa);
//--- proses Stemming and calculate tf ---
if ($bahasa == "id") {
コード例 #10
0
ファイル: reg.php プロジェクト: NaveenKS/Text-App
                continue;
            }
            $data = getDecodedStream($stream, $options);
            if (strlen($data)) {
                if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers)) {
                    $textContainers = @$textContainers[1];
                    getDirtyTexts($texts, $textContainers);
                } else {
                    getCharTransformations($transformations, $data);
                }
            }
        }
    }
    return getTextUsingTransformations($texts, $transformations);
}
// Extract the text of the file named by $search[0] and split it on single spaces into $car.
$result = pdf2text($search[0]);
$car = explode(" ", $result);
//echo $result;
/*

$con=count($car);

$flag=0;
$pame="10CO95";

//echo strlen($pame)."11111111111";
$i=0;

while($i<$con)
{
$ca=0;