/**
  * Parses each compatible document found and stores its extracted content.
  *
  * Runs only when the stored "direct parsing" option is identical (===) to the
  * value passed in by the calling hook; otherwise it returns without doing anything.
  * Supported MIME types are PDF, DOCX and ODT; documents of any other type, or
  * whose parser yields an empty result, are skipped. Once the loop has finished,
  * the last-update option is set to the current GMT date and time.
  *
  * @param     boolean     $direct_parsing_hook     Proceeds parsing only when this parameter and admin's choice match.
  * @return    void
  * @since     1.0.0
  */
 private function parse_documents($direct_parsing_hook)
 {
     $options = get_option(self::OPTIONS_KEY);
     // The option may not hold this key (or may not be an array at all, e.g. if it
     // was never saved); fall back to null instead of raising an undefined-index warning.
     $direct_parsing_option = isset($options[self::OPT_DIRECT_PARSING])
         ? $options[self::OPT_DIRECT_PARSING]
         : null;
     if ($direct_parsing_option !== $direct_parsing_hook) {
         return;
     }
     $upload_dir = wp_upload_dir();
     $documents = $this->get_unparsed_documents();
     foreach ($documents as $document) {
         $filepath = $upload_dir['basedir'] . DIRECTORY_SEPARATOR . $document['filename'];
         $content = null;
         // pick the parser matching the document's MIME type
         switch ($document['mime_type']) {
             case 'application/pdf':
                 $content = PdfParser::parse($filepath);
                 break;
             case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
                 $content = DocxParser::parse($filepath);
                 break;
             case 'application/vnd.oasis.opendocument.text':
                 $content = OdtParser::parse($filepath);
                 break;
             default:
                 // unsupported type: $content stays null and the document is skipped below
                 break;
         }
         if (!$content) {
             continue;
         }
         // add content to postmeta
         $this->save_doc_contents($document['post_id'], $content);
     }
     // update last parsing date
     update_option(self::OPTIONS_LAST_UPDATE_KEY, gmdate('Y-m-d H:i:s'));
 }
Пример #2
0
/**
 * Builds a Lucene document describing the PDF file at the given path.
 *
 * The file's base name is added as a Text field named "filename" and the text
 * returned by PdfParser::parseFile() as an UnStored field named "contents".
 *
 * @param string $path Path to the PDF file.
 * @return Zend_Search_Lucene_Document
 */
function createPDFDocument($path)
{
    require_once 'PdfParser.php';

    $pdfText = PdfParser::parseFile($path);

    $filenameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $contentsField = Zend_Search_Lucene_Field::UnStored('contents', $pdfText);

    $luceneDocument = new Zend_Search_Lucene_Document();
    $luceneDocument->addField($filenameField);
    $luceneDocument->addField($contentsField);

    return $luceneDocument;
}
Пример #3
0
// Plain-text extraction per file type, followed by a word-frequency count.
//  - doc:  catdoc
//          http://www.wagner.pp.ru/~vitus/software/catdoc/
//  - docx: unzip + sed
//          http://stackoverflow.com/questions/5671988/how-to-extract-just-plain-text-from-doc-docx-files-unix
//  - pdf:  PdfParser (https://gist.github.com/smalot/6183152); pdftotext is an alternative
$str = "";
$fileType = $file->getType();
if ($fileType == "doc") {
    // exec() only returns the LAST line of output, so collect every line through
    // its second argument. escapeshellarg() quotes the filename as one safe argument.
    $extractLines = [];
    exec("catdoc " . escapeshellarg($filename), $extractLines);
    $str = implode("\n", $extractLines);
} elseif ($fileType == "docx") {
    // Single-quoted PHP string: sed must receive the two characters "\n", not a raw
    // newline (a raw newline inside the s/// replacement is a sed syntax error).
    $sedScript = 's/<\/w:p>/\n/g; s/<[^>]\{1,\}>//g; s/[^[:print:]\n]\{1,\}//g';
    $extractLines = [];
    exec(
        "unzip -p " . escapeshellarg($filename) . " word/document.xml | sed -e " . escapeshellarg($sedScript),
        $extractLines
    );
    $str = implode("\n", $extractLines);
} elseif ($fileType == "pdf") {
    include "pdfparser.php";
    $parser = new PdfParser();
    $str = $parser->parseFile($filename);
    // render the PDF as a base64-encoded JPEG
    $im = new imagick($filename);
    $im->setImageFormat('jpg');
    $imdata = base64_encode($im);
}
// Split on whitespace and count how often each word occurs (word => count).
$textarr1 = preg_split("/\\s+/", $str);
if ($textarr1 === false) {
    // preg_split() failed; treat as "no words"
    $textarr1 = [];
}
$textarr = array_count_values($textarr1);
$a = $file->originalname;
Пример #4
0
 /**
  * Upload action: on POST, extracts text from the uploaded file and assigns a
  * string derived from it to the view under the name "test".
  *
  * On POST the action:
  *  1. runs the upload through PdfParser, strips characters outside a whitelist
  *     and hands the result to Pandamp_Debug::manager();
  *  2. re-reads the same file as binary: the first 0xa00 bytes are taken as a
  *     header, a text length is decoded from the four bytes at offsets
  *     0x21c-0x21f, and that many bytes are read as plain text
  *     (NOTE(review): this looks like the legacy MS Word .doc layout - confirm);
  *  3. scans the text line by line; when a line contains one of the known
  *     regulation types, $txt is reset to that type (ucwords) plus a newline.
  *
  * Nothing is assigned to the view when the request is not a POST, when no
  * upload is present, or when the uploaded file cannot be opened.
  *
  * @return void
  */
 public function docAction()
 {
     $this->_helper->layout->setLayout('layout-dms-uploader');
     $request = $this->getRequest();
     if (!$request->isPost()) {
         return;
     }
     $registry = Zend_Registry::getInstance();
     $files = $registry->get('files');
     $regType = ['peraturan pemerintah', 'peraturan presiden', 'undang-undang', 'peraturan menteri'];
     // no uploaded file: nothing to parse
     if (empty($files['uploadedFile1']['tmp_name'])) {
         return;
     }
     $uploadPath = $files['uploadedFile1']['tmp_name'];
     $pdf = new PdfParser();
     $string = $pdf->parseFile($uploadPath);
     $outtext = preg_replace("/[^a-zA-Z0-9\\s\\,\\.\\-\n\r\t@\\/\\_\\(\\)]/", "", $string);
     Pandamp_Debug::manager($outtext);
     $fh = fopen($uploadPath, 'r');
     if ($fh === false) {
         return;
     }
     $headers = fread($fh, 0xa00);
     $extracted_plaintext = '';
     // The length bytes live at 0x21c-0x21f; only decode them when the header is long enough.
     if ($headers !== false && strlen($headers) >= 0x220) {
         $n1 = ord($headers[0x21c]) - 1;
         $n2 = (ord($headers[0x21d]) - 8) * 256;
         $n3 = ord($headers[0x21e]) * 256 * 256;
         $n4 = ord($headers[0x21f]) * 256 * 256 * 256;
         $textLength = $n1 + $n2 + $n3 + $n4;
         // fread() rejects a non-positive length
         if ($textLength > 0) {
             $extracted_plaintext = (string) fread($fh, $textLength);
         }
     }
     fclose($fh);
     $c = nl2br($extracted_plaintext);
     // push the text through an in-memory stream so it can be read line by line
     $fp = fopen("php://memory", 'r+');
     fputs($fp, $c);
     rewind($fp);
     $txt = '';
     // compare against false explicitly so a falsy line such as "0" does not end the loop
     while (($line = fgets($fp)) !== false) {
         $arr = preg_split("/((\r?\n)|(\r\n?))/", $line);
         $haystack = strtolower($arr[0]);
         // A closure replaces create_function(): no eval, and the document text is
         // no longer interpolated into executable code.
         $result = array_filter($regType, function ($e) use ($haystack) {
             return strpos($haystack, $e) !== false;
         });
         if ($result) {
             $txt = array_values($result);
             $txt = ucwords($txt[0]) . "\n";
         }
         // a final line without a trailing newline has no second element
         $txt .= isset($arr[1]) ? $arr[1] : '';
     }
     fclose($fp);
     $this->view->assign('test', $txt);
 }
Пример #5
0
    if ($res === true) {
        @$zip->extractTo('uploads/ffcp/');
        $zip->close();
    }
}
$files = glob("uploads/ffcp/*");
foreach ($files as $file) {
    //echo $file;
    include_once 'PDF2TEXT.php';
    include_once 'DOCTOTEXT.php';
    $a = new SplFileInfo($file);
    $extension = $a->getExtension();
    //echo $extension;
    if ($extension == 'pdf') {
        //echo "asdsad";
        $a = new PdfParser();
        $data = $a->parseFile($file);
        $data = preg_replace('/[^A-Za-z0-9]/', '', $data);
        //echo $data;
        $handle = fopen("one.txt", 'w');
        fwrite($handle, $data);
        $fp->fingerPrint($file);
    } else {
        if ($extension == 'docx') {
            $docobj = new DocxConversion($file);
            $data = $docobj->convertToText();
            $data = preg_replace('/[^A-Za-z0-9]/', '', $data);
            $handle = fopen("one.txt", 'w');
            fwrite($handle, $data);
            $fp->fingerPrint($file);
        } else {