Example #1
0
<?php

// Collect information about the file to be indexed from the query string.
// Every value here is untrusted request input, so missing or non-string
// parameters fall back to an empty string instead of raising notices.
$kd_dok = (isset($_GET['no']) && is_string($_GET['no'])) ? $_GET['no'] : '';
$nama_file = (isset($_GET['nama_file']) && is_string($_GET['nama_file'])) ? $_GET['nama_file'] : '';
$bahasa = (isset($_GET['bahasa']) && is_string($_GET['bahasa'])) ? $_GET['bahasa'] : '';
$tabel_tf = "tf_document";
// basename() drops any directory components (e.g. "../../etc/passwd") so the
// request can only ever address a file directly inside files/.
$nama_file = "files/" . basename($nama_file);
// Use the text after the LAST dot as the type, so "my.report.pdf" is "pdf"
// rather than "report.pdf"; lower-cased so "PDF" and "pdf" are treated alike.
$tipe_file = strtolower(pathinfo($nama_file, PATHINFO_EXTENSION));
echo 'Tipe File : ' . htmlspecialchars($tipe_file, ENT_QUOTES, 'UTF-8') . '<br>';
// Default to empty text so unsupported types do not leave $kalimat undefined.
$kalimat = '';
switch ($tipe_file) {
    case "txt":
        // Second argument keeps the original include_path lookup behaviour.
        $kalimat = file_get_contents($nama_file, true);
        if ($kalimat === false) {
            // Unreadable or missing file: index nothing rather than `false`.
            $kalimat = '';
        }
        break;
    case "doc":
        $kalimat = parseWord($nama_file);
        break;
    case "pdf":
        $kalimat = pdf2text($nama_file);
        break;
    case "docx":
        $kalimat = docx2text($nama_file);
        break;
    case "odt":
        $kalimat = odt2text($nama_file);
        break;
}
// The extracted text comes from an uploaded document, so escape it before
// writing it into the HTML response.
echo 'Isi File Text ' . htmlspecialchars((string) $kalimat, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
// ----- tokenising step -----
$kata = tokenising($kalimat);
//--- filtering step ---
<?php

/**
 * Extract human-readable text from a Word document using a byte-level heuristic.
 *
 * The file contents are split on carriage returns (chr(13)); any slice that is
 * empty or contains a NUL byte (chr(0)) is discarded; the remaining slices are
 * joined with a trailing space after each one; finally every character outside
 * a small whitelist (letters, digits, whitespace and , . - @ / _ ( ) ) is removed.
 *
 * @param string $userDoc Path to the local document file to read.
 * @return string The extracted text, or an empty string when the file is
 *                missing, unreadable or empty.
 */
function parseWord($userDoc)
{
    // Guard up front instead of silencing warnings with "@": a missing or
    // unreadable file simply yields no text.
    if (!is_file($userDoc) || !is_readable($userDoc)) {
        return "";
    }

    // file_get_contents() opens, reads and closes the file in one call, so no
    // handle is leaked and a zero-byte file needs no special length handling.
    $contents = file_get_contents($userDoc);
    if ($contents === false || $contents === "") {
        return "";
    }

    $outtext = "";
    foreach (explode(chr(0x0d), $contents) as $thisline) {
        // Skip empty slices and slices containing NUL bytes.
        if ($thisline === "" || strpos($thisline, chr(0x00)) !== false) {
            continue;
        }
        $outtext .= $thisline . " ";
    }

    // Strip every character that is not in the whitelist.
    $outtext = preg_replace("/[^a-zA-Z0-9\\s\\,\\.\\-\n\r\t@\\/\\_\\(\\)]/", "", $outtext);
    return $outtext;
}
// Demo run: extract the text of a sample document and write it to the output.
$userDoc = 'verdic.docx';
$text = parseWord($userDoc);
print $text;
Example #3
0
<?php

include_once $_SERVER['DOCUMENT_ROOT'] . '/inc/current_pg_function.php';
include_once $_SERVER['DOCUMENT_ROOT'] . '/inc/global-settings.php';
include_once $_SERVER['DOCUMENT_ROOT'] . '/inc/pre-function.php';
include_once $_SERVER['DOCUMENT_ROOT'] . '/inc/doc_reader.php';
// Load the stored record from data.txt (relative path) and decode it.
// NOTE(review): the result of file_get_contents() is not checked; if the read
// fails, unserialize() receives false and every $data[...] lookup below fails.
// NOTE(review): unserialize() can instantiate objects from its input — confirm
// data.txt can never hold attacker-controlled bytes, or move to JSON.
$raw_data = file_get_contents('data.txt');
$data = unserialize($raw_data);
// The letter document's file name is built from the record's 'ref' value.
$file = $data['ref'] . '-letter.doc';
// Extract the letter text; parseWord() is presumably provided by
// inc/doc_reader.php — TODO confirm.
$doc = parseWord($file);
// Pull the display fields out of the record.
$firstname = $data['firstname'];
$lastname = $data['lastname'];
$fullname = $firstname . " " . $lastname;
$doc_title = $data['doc_title'];
// Validation messages for the change-name form, keyed by field ('fname', 'lname').
$changeNameErrors = array();
// Initial state for the request handling that follows; how $updates, $changed
// and $total_pgs are consumed is not visible in this part of the file.
$updates = false;
$changed = false;
$total_pgs = 0;
//pre($_POST);
if (isset($_POST['changeNameDate'])) {
    if (trim($_POST['firstname']) == "") {
        $changeNameErrors['fname'] = "Please enter your first name.";
    }
    if (trim($_POST['lastname']) == "") {
        $changeNameErrors['lname'] = "Please enter your last name.";
    }
    if (empty($changeNameErrors)) {
        $fname = trim($_POST['firstname']);
        $lname = trim($_POST['lastname']);
        if ($fname !== $data['firstname']) {
            $data['firstname'] = $fname;