/**
 * Parses each unparsed, compatible document and stores its extracted content.
 *
 * Parsing proceeds only when the caller's mode is strictly identical to the
 * direct-parsing option stored in the plugin options; otherwise the method
 * returns without doing anything. After the loop, the "last update" option is
 * set to the current GMT date and time.
 *
 * @param bool $direct_parsing_hook Parsing mode of the caller; parsing proceeds only
 *                                  when it matches the admin's stored choice.
 * @return void
 * @since 1.0.0
 */
private function parse_documents($direct_parsing_hook) {
    $options = get_option(self::OPTIONS_KEY);

    // The option may be absent (get_option() then returns false) or may lack the
    // key; read it defensively so no "undefined index" / "array offset on bool"
    // warning is raised. A missing value is treated as null, as before.
    $direct_parsing_option = (is_array($options) && isset($options[self::OPT_DIRECT_PARSING]))
        ? $options[self::OPT_DIRECT_PARSING]
        : null;

    if ($direct_parsing_option !== $direct_parsing_hook) {
        return;
    }

    $upload_dir = wp_upload_dir();
    $documents = $this->get_unparsed_documents();

    foreach ($documents as $document) {
        $filepath = $upload_dir['basedir'] . DIRECTORY_SEPARATOR . $document['filename'];
        $content = null;

        // Pick the parser from the document's MIME type; unsupported types
        // leave $content null and are skipped below.
        switch ($document['mime_type']) {
            case 'application/pdf':
                $content = PdfParser::parse($filepath);
                break;
            case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
                $content = DocxParser::parse($filepath);
                break;
            case 'application/vnd.oasis.opendocument.text':
                $content = OdtParser::parse($filepath);
                break;
            default:
                break;
        }

        // Nothing extracted (unsupported type or empty result): nothing to save.
        if (!$content) {
            continue;
        }

        // add content to postmeta
        $this->save_doc_contents($document['post_id'], $content);
    }

    // update last parsing date
    update_option(self::OPTIONS_LAST_UPDATE_KEY, gmdate('Y-m-d H:i:s'));
}
/**
 * Builds a Lucene document describing the PDF at the given path.
 *
 * The returned document has two fields: "filename", holding the base name of
 * the path, and "contents", holding the text returned by
 * PdfParser::parseFile().
 *
 * @param string $path Path to the PDF document.
 * @return Zend_Search_Lucene_Document
 */
function createPDFDocument($path)
{
    require_once 'PdfParser.php';

    $pdfText = PdfParser::parseFile($path);

    $nameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $bodyField = Zend_Search_Lucene_Field::UnStored('contents', $pdfText);

    $luceneDoc = new Zend_Search_Lucene_Document();
    $luceneDoc->addField($nameField);
    $luceneDoc->addField($bodyField);

    return $luceneDoc;
}
// Plain-text extraction per file type, followed by a word-frequency count.
//
// MS Word .doc: catdoc (the original note also mentions "slsx"; no such type is handled here)
//   http://www.wagner.pp.ru/~vitus/software/catdoc/
//   http://stackoverflow.com/questions/5671988/how-to-extract-just-plain-text-from-doc-docx-files-unix
// PDF: PdfParser (pdftotext noted as an alternative)
//   https://gist.github.com/smalot/6183152
$str = "";

switch ($file->getType()) {
    case "doc":
        // escapeshellarg() quotes the whole argument; escapeshellcmd() inside
        // hand-written quotes does not protect against quote characters in the name.
        // exec() only *returns* the last output line, so collect every line instead.
        $extractOutput = [];
        exec("catdoc " . escapeshellarg($filename), $extractOutput);
        $str = implode("\n", $extractOutput);
        break;

    case "docx":
        // sed must receive the two characters "\n", hence "\\n" in this
        // double-quoted PHP string; a real newline would split the s/// command.
        $extractOutput = [];
        exec(
            "unzip -p " . escapeshellarg($filename)
            . " word/document.xml | sed -e 's/<\\/w:p>/\\n/g; s/<[^>]\\{1,\\}>//g; s/[^[:print:]\\n]\\{1,\\}//g'",
            $extractOutput
        );
        $str = implode("\n", $extractOutput);
        break;

    case "pdf":
        // include_once: running this code twice must not redeclare the parser class.
        include_once "pdfparser.php";
        $parser = new PdfParser();
        $str = $parser->parseFile($filename);

        // Load the PDF with Imagick, switch the format to JPEG and base64-encode it.
        // NOTE(review): base64_encode() relies on Imagick's string conversion here — confirm.
        $im = new imagick($filename);
        $im->setImageFormat('jpg');
        $imdata = base64_encode($im);
        break;
}

// Count how often each whitespace-separated token occurs: token => occurrences.
$textarr1 = preg_split("/\\s+/", $str);
$textarr = [];
foreach ($textarr1 as $b) {
    // Tokens are the array KEYS, so test the key; in_array() searches the
    // values (the counts) and therefore never found an already-seen token.
    if (isset($textarr[$b])) {
        $textarr[$b] += 1;
    } else {
        $textarr[$b] = 1;
    }
}

$a = $file->originalname;
/**
 * Upload action: extracts text from the uploaded file and assigns a summary to the view.
 *
 * On POST, the uploaded file ('uploadedFile1' from the 'files' registry entry) is
 * first run through PdfParser and the sanitised text is passed to
 * Pandamp_Debug::manager(). The file is then read as a binary document: a text
 * length is decoded from fixed header offsets, that many bytes are read, and the
 * text is scanned line by line for one of the known regulation types. The
 * resulting string is assigned to the view as 'test'.
 *
 * NOTE(review): the header offsets (0x21c-0x21f) and the -1 / -8 adjustments look
 * like the legacy MS Word .doc text-length field — confirm against the format.
 *
 * @return void
 */
public function docAction()
{
    $this->_helper->layout->setLayout('layout-dms-uploader');

    $request = $this->getRequest();
    if (!$request->isPost()) {
        return;
    }

    $registry = Zend_Registry::getInstance();
    $files = $registry->get('files');
    $regType = ['peraturan pemerintah', 'peraturan presiden', 'undang-undang', 'peraturan menteri'];

    $pdf = new PdfParser();
    $string = $pdf->parseFile($files['uploadedFile1']['tmp_name']);
    $outtext = preg_replace("/[^a-zA-Z0-9\\s\\,\\.\\-\n\r\t@\\/\\_\\(\\)]/", "", $string);
    // NOTE(review): debug helper left in the request path — confirm it is intended.
    Pandamp_Debug::manager($outtext);

    $fh = fopen($files['uploadedFile1']['tmp_name'], 'r');
    if ($fh === false) {
        return;
    }

    $extracted_plaintext = '';
    $headers = fread($fh, 0xa00);

    // The length field lives at 0x21c..0x21f; a shorter file cannot contain it.
    if ($headers !== false && strlen($headers) > 0x21f) {
        $n1 = ord($headers[0x21c]) - 1;
        $n2 = (ord($headers[0x21d]) - 8) * 256;
        $n3 = ord($headers[0x21e]) * 256 * 256;
        $n4 = ord($headers[0x21f]) * 256 * 256 * 256;
        $textLength = $n1 + $n2 + $n3 + $n4;

        // fread() rejects non-positive lengths, which malformed headers can yield.
        if ($textLength > 0) {
            $chunk = fread($fh, $textLength);
            if ($chunk !== false) {
                $extracted_plaintext = $chunk;
            }
        }
    }

    // The upload handle is no longer needed; it was previously never closed.
    fclose($fh);

    $c = nl2br($extracted_plaintext);

    // Replay the text through an in-memory stream so it can be read line by line.
    $fp = fopen("php://memory", 'r+');
    fputs($fp, $c);
    rewind($fp);

    $txt = '';
    // Compare against false explicitly: a line consisting of "0" is falsy.
    while (($line = fgets($fp)) !== false) {
        $arr = preg_split("/((\r?\n)|(\r\n?))/", $line);

        // Keep the regulation types that occur in the first segment. A closure
        // replaces create_function(), which was removed in PHP 8 and compiled
        // the uploaded file's text as PHP source.
        $haystack = strtolower($arr[0]);
        $result = array_filter($regType, function ($e) use ($haystack) {
            return strpos($haystack, $e) !== false;
        });

        if ($result) {
            // A match restarts the summary with the capitalised type name.
            $txt = array_values($result);
            $txt = ucwords($txt[0]) . "\n";
        }

        // The second segment is absent when the line has no line break.
        $txt .= isset($arr[1]) ? $arr[1] : '';
    }
    fclose($fp);

    $this->view->assign('test', $txt);
}
if ($res === true) { @$zip->extractTo('uploads/ffcp/'); $zip->close(); } } $files = glob("uploads/ffcp/*"); foreach ($files as $file) { //echo $file; include_once 'PDF2TEXT.php'; include_once 'DOCTOTEXT.php'; $a = new SplFileInfo($file); $extension = $a->getExtension(); //echo $extension; if ($extension == 'pdf') { //echo "asdsad"; $a = new PdfParser(); $data = $a->parseFile($file); $data = preg_replace('/[^A-Za-z0-9]/', '', $data); //echo $data; $handle = fopen("one.txt", 'w'); fwrite($handle, $data); $fp->fingerPrint($file); } else { if ($extension == 'docx') { $docobj = new DocxConversion($file); $data = $docobj->convertToText(); $data = preg_replace('/[^A-Za-z0-9]/', '', $data); $handle = fopen("one.txt", 'w'); fwrite($handle, $data); $fp->fingerPrint($file); } else {