<?php
// Load the DOCX converter class; the path is relative, exactly as before.
require_once '../../../classes/CreateDocx.inc';

// Converter switches: every text-bearing element is enabled, 'chart' stays 0.
// The meaning of each key is defined by CreateDocx, not by this file.
$options = array_fill_keys(
    array('paragraph', 'list', 'table', 'footnote', 'endnote'),
    true
) + array('chart' => 0);

// Run the conversion of the sample document; presumably the plain text of
// Text.docx ends up in document_1.txt (TODO confirm against CreateDocx).
CreateDocx::DOCX2TXT('../../files/Text.docx', 'document_1.txt', $options);
/**
 * Converts the raw content of a document into plain text.
 *
 * The content is written to a scratch file below $tmp_dir and then handed to
 * the converter selected by $source_type (pdf, doc/rtf, ppt, xls, ods, odt,
 * docx, xlsx or js). The scratch file is removed again before returning.
 * The script is terminated via die() when the scratch file cannot be created
 * or written.
 *
 * @param string $file        Raw content of the document.
 * @param mixed  $file0       Not used by this function; kept for caller compatibility.
 * @param string $source_type Type tag selecting the converter (compared against lower-case literals).
 * @param mixed  $url         Not used by this function; kept for caller compatibility.
 * @param mixed  $chrSet      Not used by this function; kept for caller compatibility.
 *
 * @return mixed The extracted text (for 'js' whatever extract_js() delivers), or the
 *               literal string 'ERROR' when the conversion failed or yielded no text.
 */
function extract_text($file, $file0, $source_type, $url, $chrSet)
{
    global $tmp_dir, $pdftotext_path, $catdoc_path, $op_system, $mb, $debug;
    global $home_charset, $command_line, $no_log, $clear, $converter_dir, $cl, $index_xmeta;

    $result = array();

    // strip the "ISO-" / "ISO" prefix; the remainder is passed to catdoc as source charset (-s)
    $home_charset1 = str_ireplace('iso-', '', $home_charset);
    $charset_int = str_ireplace('iso', '', $home_charset1);

    // Reads the document properties of a docx/xlsx file. Every non-empty field
    // is prefixed with a blank so that neighbouring words are not glued together;
    // an empty string is returned when no field carries text.
    $read_xmeta = function ($path) use ($converter_dir) {
        require_once "" . $converter_dir . "/xmeta_converter.php";
        $meta = new x_metadata();
        $meta->setDocument($path);
        $fields = array(
            $meta->getTitle(),
            $meta->getSubject(),
            $meta->getCreator(),
            $meta->getKeywords(),
            $meta->getDescription(),
            $meta->getLastModifiedBy(),
            $meta->getRevision(),
            $meta->getDateCreated(),
            $meta->getDateModified(),
        );
        $text = '';
        foreach ($fields as $field) {
            if ((string) $field !== '') {
                $text .= ' ' . $field;
            }
        }
        return $text;
    };

    // some converters receive the scratch file with its type as file suffix
    $filename = $tmp_dir . "/" . "tmp_file";
    if (in_array($source_type, array('ods', 'doc', 'docx', 'xlsx'), true)) {
        $filename .= "." . $source_type;
    }

    if (!($handle = fopen($filename, 'w'))) {
        die("Cannot open file {$filename} in temp folder");
    }
    mysqltest();
    if (fwrite($handle, $file) === false) {
        die("Cannot write to file {$filename} in temp folder");
    }
    fclose($handle);
    mysqltest();

    switch ($source_type) {
        // PDF documents
        case 'pdf':
            // probe whether the converter can be opened, and release the handle again
            $probe = fopen($pdftotext_path, 'rb');
            if (!$probe) {
                printStandardReport('errorNoPDFConv', $command_line);
                $result = 'ERROR';
                break;
            }
            fclose($probe);

            // The trailing "-" makes pdftotext write to stdout, which is the only
            // output exec() captures. The file name is quoted for the shell.
            $target = escapeshellarg($filename);
            if ($op_system != 'win') {
                $command = $pdftotext_path . " -enc UTF-8 " . $target . " -";
            } else {
                $command = $pdftotext_path . " -cfg xpdfrc " . $target . " -";
            }
            exec($command, $result, $retval);

            if ($retval != 0) {
                // map the exit code of the converter to a report
                switch ((int) $retval) {
                    case 1:
                        printStandardReport('errorOpenPDF', $command_line);
                        break;
                    case 3:
                        printStandardReport('permissionError', $command_line);
                        break;
                    case 127:
                        printStandardReport('noConverter', $command_line);
                        break;
                    default:
                        printStandardReport('ufoError', $command_line);
                        break;
                }
                // discard partial output, so that the caller reliably sees the error marker
                $result = 'ERROR';
            } else {
                $result = implode(' ', $result);
            }
            break;

        // DOC and RTF files
        case 'doc':
        case 'rtf':
            if ($op_system == 'win') {
                $command = $catdoc_path . " -s " . $charset_int . " -d utf-8 -x " . escapeshellarg($filename);
                exec($command, $result, $retval);
                // a first output line naming catdoc.exe is forwarded as report
                if (isset($result[0]) && stristr($result[0], "catdoc.exe")) {
                    printDocReport($result[0], $cl);
                }
            } else {
                $message = " Indexing of .doc and .rtf documents is currently not supported on LINUX OS.";
                printDocReport($message, $cl);
            }
            break;

        // PPT files
        case 'ppt':
            // currently unsupported, as a failure was encountered for large PowerPoint
            // presentations; $result stays empty and is reported as 'nothingFound' below
            break;

        // XLS spreadsheets
        case 'xls':
            require_once "" . $converter_dir . "/xls_reader.php";
            $data = new Spreadsheet_Excel_Reader();
            if ($mb == '1') {
                // if the extension exists, change 'iconv' to mb_convert_encoding
                $data->setUTFEncoder('mb');
            }
            $data->setOutputEncoding('UTF-8');
            $data->read($filename);

            if ($data->_ole->error == '1') {
                printStandardReport('xlsError', $command_line, $no_log);
                $result = 'ERROR';
                break;
            }

            $result = '';
            $boundsheets = $data->boundsheets;  // all tables in this file
            $sheets = $data->sheets;            // content of all sheets in all tables
            if ($boundsheets) {
                foreach ($boundsheets as $bs) {
                    $result .= $bs['name'] . ", ";  // collect all table names
                }
                if ($sheets) {
                    foreach ($sheets as $sheet) {
                        if (empty($sheet['cells'])) {
                            continue;               // ignore sheets without cells
                        }
                        foreach ($sheet['cells'] as $row) {
                            foreach ($row as $content) {
                                $result .= $content . ", ";
                            }
                        }
                    }
                }
                if (strtoupper($home_charset) == 'ISO-8859-1') {
                    $result = utf8_encode($result);
                }
            }
            break;

        // ODS spreadsheets
        case 'ods':
            require_once "" . $converter_dir . "/ods_reader.php";
            $reader = ods_reader::reader($filename);
            $sheets = $reader->read($filename);
            if (!$sheets) {
                $result = 'ERROR';
                break;
            }
            $result = '';
            foreach ($sheets as $sheet) {
                if (!$sheet) {
                    continue;
                }
                foreach ($sheet as $cell) {
                    if (!$cell) {
                        continue;                   // ignore all empty cells
                    }
                    foreach ($cell as $content) {
                        $result .= $content . " ";
                    }
                }
            }
            break;

        // ODT documents
        case 'odt':
            require_once "" . $converter_dir . "/odt_reader.php";
            $x = new odt_reader();
            $u = $x->odt_unzip($filename, false);   // unzip the document
            $result = $x->odt_read($u[0], 2);       // read the document
            // create some blanks around the tags
            $result = str_replace("<", " <", $result);
            $result = str_replace(">", "> ", $result);
            break;

        // DOCX files
        case 'docx':
            // converter class supplied by http://www.phpdocx.com
            $options = array(
                'paragraph' => false,
                'list' => false,
                'table' => false,
                'footnote' => false,
                'endnote' => false,
                'chart' => 0,
            );
            $docx_path = $tmp_dir . "/" . "docx.txt";
            $result = '';
            require_once "" . $converter_dir . "/docx/CreateDocx.inc";
            CreateDocx::DOCX2TXT($filename, $docx_path, $options);

            // the converter output is optional; a missing file just leaves $result empty
            $docx_text = is_readable($docx_path) ? file_get_contents($docx_path) : false;
            if ($docx_text) {
                $result = "{$docx_text} ";
            }
            if ($index_xmeta) {
                $result .= $read_xmeta($filename);
            }
            if (is_file($docx_path)) {
                unlink($docx_path);
            }
            break;

        // XLSX spreadsheets
        case 'xlsx':
            $result = '';
            require_once "" . $converter_dir . "/xlsx_reader.php";
            $xlsx = new SimpleXLSX($filename);
            $names = $xlsx->sheetNames();
            if ($debug == 2 && $names) {
                printXLSXreport(count($names), $cl);
            }
            foreach ($names as $my_name) {
                $result .= $my_name . " ";
                if ($debug == 2) {
                    printActKeyword($my_name);
                }
            }
            // walk the sheets starting at index 1 and stop at the first one delivering no rows
            for ($i = 1; $rows = $xlsx->rows($i); $i++) {
                foreach ($rows as $row) {
                    foreach ($row as $val) {
                        if ($val) {
                            $result .= " " . $val;  // add value of each non-empty cell
                        }
                    }
                }
            }
            if ($index_xmeta) {
                $result .= $read_xmeta($filename);
            }
            break;

        // JavaScript
        case 'js':
            $result = extract_js($file);
            break;
    }

    if ($result != 'ERROR') {
        if (is_array($result)) {
            $result = implode(" ", $result);
        }
        $count = strlen($result);
        if ($count == 0) {
            // if there was not one word found, print warning message
            if ($source_type == 'js') {
                printStandardReport('jsEmpty', $command_line, $no_log);
            } else {
                printStandardReport('nothingFound', $command_line, $no_log);
            }
            $result = 'ERROR';
        }
    }

    unlink($filename);
    mysqltest();
    if ($clear == 1) {
        unset($command, $retval, $file, $count);
    }
    return $result;
}