/**
 * Extracts the text of the PDF held in $this->binary and wraps it in a <pre> element.
 *
 * Every pair of consecutive newlines is removed from the extracted text. When
 * the parser throws, or yields no text, a placeholder reporting the byte size
 * of the binary is shown instead.
 *
 * @return string HTML fragment: the HTML-escaped text enclosed in <pre> tags.
 */
public function readBinary()
{
    $parser = new \Smalot\PdfParser\Parser();

    try {
        $text = $parser->parseContent($this->binary)->getText();
        $text = str_replace("\n\n", "", $text);
    } catch (\Exception $exception) {
        // Fully qualified so the catch also matches when this class lives in a namespace.
        $text = null;
    }

    if (empty($text)) {
        $text = 'No readable text. File size: ' . strlen($this->binary) . 'B';
    }

    // The text originates from an arbitrary PDF, so it must be escaped before
    // being embedded in markup. ENT_SUBSTITUTE keeps the output non-empty even
    // if the extracted text contains invalid UTF-8 sequences.
    return '<pre>' . htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . '</pre>';
}
if (!empty($file)) { $moved = move_uploaded_file($file['tmp_name'], dirname(__FILE__) . '/../uploads/' . sha1(time()) . "-" . $file['name']); if ($moved) { return new Response(json_encode(array('message' => 'Upload Successful!')), '200'); } else { return new Response(json_encode(array('message' => 'File upload error!')), '500'); } } }); $app->get('pages/{id}', function (Silex\Application $app, $id) { // Add a parameter for an ID in the route, and it will be supplied as argument in the function if (!array_key_exists($id, $app['files'])) { $app->abort(404, 'The PDF file could not be found'); } $file = $app['files'][$id]; $parser = new \Smalot\PdfParser\Parser(); $filepath = dirname(__FILE__) . '/../uploads/' . $file; $document = $parser->parseFile($filepath); $details = $document->getDetails(); $dir = dirname(__FILE__) . '/../uploads/pages/' . $id; if (!file_exists($dir)) { $dirCreate = mkdir($dir); for ($i = 1; $i <= $details['Pages']; $i++) { $fpdi = new FPDI(); $fpdi->setSourceFile($filepath); $tpl = $fpdi->importPage($i); $size = $fpdi->getTemplateSize($tpl); $orientation = $size['h'] > $size['w'] ? 'P' : 'L'; $fpdi->AddPage($orientation); $fpdi->useTemplate($tpl, null, null, $size['w'], $size['h'], true); try {
/**
 * Returns the concatenated text of every page of a PDF file.
 *
 * @param string $filename Path of the PDF file to read.
 *
 * @return string|false The page texts joined without any separator, or false
 *                      when $filename is empty or does not exist.
 */
public static function parse($filename)
{
    if ($filename && file_exists($filename)) {
        $pdfParser = new \Smalot\PdfParser\Parser();
        $document = $pdfParser->parseFile($filename);

        // Append page after page; no separator is inserted between pages.
        $collected = "";
        foreach ($document->getPages() as $currentPage) {
            $collected .= $currentPage->getText();
        }

        return $collected;
    }

    return false;
}
/**
 * Loads a PDF, stores the first 61-character text line (spaces removed) in
 * $this->linha, and returns whatever parseLinha() produces.
 *
 * If no line is exactly 61 bytes long, $this->linha is left untouched.
 *
 * @param string $filename Path of the PDF file.
 *
 * @return mixed Result of $this->parseLinha().
 */
function fromFile($filename)
{
    $pdfParser = new \Smalot\PdfParser\Parser();
    $lines = explode("\n", $pdfParser->parseFile($filename)->getText());

    foreach ($lines as $candidate) {
        if (strlen($candidate) != 61) {
            continue;
        }

        // Only the first matching line is used.
        $this->linha = str_replace(" ", "", $candidate);
        break;
    }

    return $this->parseLinha();
}
/**
 * Checks that the text of the first page of the sample document is extracted
 * and contains the expected title, filler text and font names.
 */
public function testGetText()
{
    $samplePath = __DIR__ . '/../../../../../samples/Document1_pdfcreator_nocompressed.pdf';

    $pdfParser = new \Smalot\PdfParser\Parser();
    $allPages = $pdfParser->parseFile($samplePath)->getPages();
    $text = $allPages[0]->getText();

    $this->assert->string($text)->hasLengthGreaterThan(150);

    // Substrings that must appear in the extracted text, checked in this order.
    $expectedFragments = array(
        'Document title',
        'Lorem ipsum',
        'Calibri',
        'Arial',
        'Times',
        'Courier New',
        'Verdana',
    );
    foreach ($expectedFragments as $fragment) {
        $this->assert->string($text)->contains($fragment);
    }
}
/**
 * Reads a Word (doc/docx) or PDF document and returns its content.
 *
 * The file type is detected from the file's MIME type (via fileinfo), not from
 * its name. Word documents are loaded with PhpWord, written through
 * $this->write() and read back from "<tmp_path>/temp.html"; PDF documents are
 * returned as the text extracted by the PDF parser.
 *
 * @param string $path Path of the document to read.
 *
 * @return string|false|null Document content, false if the generated HTML file
 *                           cannot be read, or null when the MIME type is not
 *                           one of the supported types.
 */
function readDocument($path)
{
    $conf = JFactory::getConfig();
    $tmp_path = $conf->getValue('config.tmp_path');

    // Supported MIME type => short type name.
    $acceptableFiles = array(
        'application/msword' => "doc",
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => "docx",
        'application/pdf' => "pdf",
    );

    $finfo = finfo_open(FILEINFO_MIME_TYPE);
    $mimeType = finfo_file($finfo, $path);
    finfo_close($finfo);

    // Initialised up front so unsupported files return null instead of
    // reading an undefined variable.
    $contents = null;

    if (!array_key_exists($mimeType, $acceptableFiles)) {
        return $contents;
    }

    $type = $acceptableFiles[$mimeType];

    if ($type == "doc" || $type == "docx") {
        require_once 'libraries/PHPWord-master/src/PhpWord/Autoloader.php';
        \PhpOffice\PhpWord\Autoloader::register();

        $phpWord = \PhpOffice\PhpWord\IOFactory::load($path);
        $this->write($phpWord, "temp", array('HTML' => 'html'), $tmp_path);
        $contents = file_get_contents($tmp_path . '/temp.html', true);
    } else if ($type == "pdf") {
        include 'libraries/pdfparser/vendor/autoload.php';

        $parser = new \Smalot\PdfParser\Parser();
        $pdf = $parser->parseFile($path);
        $contents = $pdf->getText();
    }

    return $contents;
}
/**
 * Handles the packing-list upload: extracts one PO value per PDF page and
 * renders the result view.
 *
 * For each page the text is split into lines and the PO is taken from the part
 * after the first ':' on the line at index 10. A page that has no such line, or
 * whose line has no ':', contributes an empty string so that the page count
 * stays visible to the user instead of the request failing on an undefined
 * offset.
 *
 * @return mixed The 'parsepdf-output' view with the keys 'POs', 'totalOfPOs'
 *               and 'queryString', or the 'parsepdf-input' view with an error
 *               message when no file was submitted.
 */
public function parsePostPDF()
{
    $validator = Validator::make(Input::all(), array('packinglist' => 'required'));

    if ($validator->fails()) {
        return View::make('parsepdf-input')->with(array('response' => '<p style="color:red;">Please select a packing list pdf to parse.</p>'));
    }

    $file = Input::file('packinglist');
    $parser = new \Smalot\PdfParser\Parser();
    $pdf = $parser->parseFile($file);

    $arrayOfPos = array();
    foreach ($pdf->getPages() as $page) {
        $lines = explode('<br />', nl2br($page->getText()));

        // Guard both lookups: short pages and lines without ':' are possible.
        $poParts = isset($lines[10]) ? explode(':', $lines[10]) : array();
        $arrayOfPos[] = isset($poParts[1]) ? trim($poParts[1]) : '';
    }

    $returnPOString = '';
    foreach ($arrayOfPos as $returnPO) {
        $returnPOString .= $returnPO . '<br>';
    }

    $data = array();
    $data['POs'] = $returnPOString;
    $data['totalOfPOs'] = count($arrayOfPos);
    $data['queryString'] = $this->joinKohlsParsePO($arrayOfPos);

    return View::make('parsepdf-output', $data);
}
/**
 * Returns the text extracted from the PDF referenced by $this->submittedCV.
 *
 * @return string Text of the whole document as produced by the parser.
 */
public function extractPdf()
{
    $pdfParser = new \Smalot\PdfParser\Parser();

    return $pdfParser->parseFile($this->submittedCV)->getText();
}
/**
 * Extracts the text of a PDF file and replaces every '&', '%' and '$' with a space.
 *
 * @param string $sourcefile Path of the PDF file.
 *
 * @return string The extracted text with the three characters replaced.
 */
protected function pdfToString($sourcefile)
{
    $pdfParser = new \Smalot\PdfParser\Parser();
    $rawText = $pdfParser->parseFile($sourcefile)->getText();

    return str_replace(array('&', '%', '$'), ' ', $rawText);
}
/**
 * Returns the parsed document for $this->filename, parsing it on first use and
 * reusing the stored instance afterwards.
 *
 * @return mixed The document object returned by the parser.
 *
 * @throws ParseException When the parser throws; the original exception is
 *                        passed to the ParseException constructor.
 */
private function getDocument()
{
    if (!empty($this->document)) {
        return $this->document;
    }

    $pdfParser = new \Smalot\PdfParser\Parser();

    try {
        $this->document = $pdfParser->parseFile($this->filename);
    } catch (\Exception $parseError) {
        throw new ParseException($parseError);
    }

    return $this->document;
}
/**
 * Builds a Document from a PDF file: extracts its text, derives the
 * fingerprints through the document service and stores name, content and
 * fingerprints on a new Document instance.
 *
 * @param string $documentName Name given by the user to the file.
 * @param string $documentPath Temporary path where the document is stored.
 *
 * @return Document
 */
private function parseDocument($documentName, $documentPath)
{
    $pdfParser = new \Smalot\PdfParser\Parser();
    $content = $pdfParser->parseFile($documentPath)->getText();

    // Fingerprints are computed before the Document is created.
    $fingerprints = $this->documentService->extractFingerprint($content);

    $document = new Document();
    $document->setName($documentName);
    $document->setContent($content);
    $document->setFingerprints($fingerprints);

    return $document;
}
/**
 * Adds a PDF file to the tl_search table and hands the resulting row to
 * static::indexContent().
 *
 * The PDF is only parsed when no tl_search row with the same url and checksum
 * exists yet; otherwise the text stored on the existing result is reused.
 *
 * @param string $strFile      Path of the file, passed to \File.
 * @param array  $arrParentSet Data of the parent entry; the keys 'pid',
 *                             'language', 'protected' and 'groups' are read.
 *
 * @return false|null false when the file is not a valid PDF, when the file is
 *                    already indexed for this pid, or when parsing throws;
 *                    no explicit return value otherwise.
 */
protected static function addToPDFSearchIndex($strFile, $arrParentSet)
{
    $objFile = new \File($strFile);

    if (!Validator::isValidPDF($objFile)) {
        return false;
    }

    $objDatabase = \Database::getInstance();
    $objModel = $objFile->getModel();
    $arrMeta = \Frontend::getMetaData($objModel->meta, $arrParentSet['language']);

    // Use the file name as title if none is given
    if ($arrMeta['title'] == '') {
        $arrMeta['title'] = specialchars($objFile->basename);
    }

    // Column values of the tl_search row; 'text' is added further below.
    $arrSet = array('pid' => $arrParentSet['pid'], 'tstamp' => time(), 'title' => $arrMeta['title'], 'url' => $objFile->value, 'filesize' => \System::getReadableSize($objFile->size, 2), 'checksum' => $objFile->hash, 'protected' => $arrParentSet['protected'], 'groups' => $arrParentSet['groups'], 'language' => $arrParentSet['language'], 'mime' => $objFile->mime);

    // Look up existing entries for the same file (same url and checksum)
    $objIndex = $objDatabase->prepare("SELECT * FROM tl_search WHERE url=? AND checksum=?")->execute($arrSet['url'], $arrSet['checksum']);

    // There are already indexed entries containing this file (same checksum and filename)
    if ($objIndex->numRows) {
        // Return if the page with the file is indexed
        if (in_array($arrSet['pid'], $objIndex->fetchEach('pid'))) {
            return false;
        }

        // Reuse the stored text instead of parsing the PDF again.
        // NOTE(review): this reads from the result after fetchEach() has
        // iterated it — confirm which row the property access refers to.
        $strContent = $objIndex->text;
    } else {
        try {
            // Parse only for the first occurrence
            $parser = new \Smalot\PdfParser\Parser();
            $objPDF = $parser->parseFile($strFile);
            $strContent = $objPDF->getText();
        } catch (\Exception $e) {
            // Missing object reference #...
            return false;
        }
    }

    // Put everything together: decode entities, collapse runs of spaces, trim
    $arrSet['text'] = $strContent;
    $arrSet['text'] = trim(preg_replace('/ +/', ' ', \String::decodeEntities($arrSet['text'])));

    // Update an existing old entry, otherwise insert a new one
    // NOTE(review): the pid match was already handled by the early return
    // above, so it is unclear when this comparison can be true — confirm.
    if ($objIndex->pid == $arrSet['pid']) {
        $objDatabase->prepare("UPDATE tl_search %s WHERE id=?")->set($arrSet)->execute($objIndex->id);
        $intInsertId = $objIndex->id;
    } else {
        $objInsertStmt = $objDatabase->prepare("INSERT INTO tl_search %s")->set($arrSet)->execute();
        $intInsertId = $objInsertStmt->insertId;
    }

    static::indexContent($arrSet, $intInsertId);
}
/**
 * Tests that the 'printable/pdf/node/{node}' path returns the right content.
 *
 * Creates a Basic page, configures mPDF as the PDF tool with a fixed output
 * file name, requests the PDF version of the node, then creates a second node
 * from the text of the generated PDF and checks the rendered page for the
 * original body and the source URL.
 */
public function testCustomPageExists()
{
    global $base_url;

    // Leave only one content type so that node/add redirects to node/add/page.
    $nodeTypes = \Drupal::entityManager()->getStorage('node_type');
    $nodeTypes->load('article')->delete();
    $this->drupalGet('node/add');
    $this->assertResponse(200);
    $this->assertUrl('node/add/page');

    // Create the source node.
    $title = $this->randomMachineName(8);
    $body = $this->randomMachineName(16) . 'This is functional test which I am writing for printable module.';
    $edit = array(
        'title[0][value]' => $title,
        'body[0][value]' => $body,
    );
    $this->drupalPostForm('node/add/page', $edit, t('Save'));

    // The confirmation message must be shown and the node must be stored.
    $createdMessage = t('!post %title has been created.', array('!post' => 'Basic page', '%title' => $title));
    $this->assertRaw($createdMessage, 'Basic page created.');
    $node = $this->drupalGetNodeByTitle($title);
    $this->assertTrue($node, 'Node found in database.');

    // The node page itself must be reachable.
    $this->drupalGet('node/' . $node->id());
    $this->assertResponse(200);

    // Select the PDF generating tool and the output file name.
    $settingsPath = 'admin/config/user-interface/printable/pdf';
    $pdfSettings = array(
        'print_pdf_pdf_tool' => 'mPDF',
        'print_pdf_content_disposition' => 1,
        'print_pdf_filename' => 'modules/custom/printable/src/Tests/testPDF',
    );
    $this->drupalGet($settingsPath);
    $this->drupalPostForm(NULL, $pdfSettings, t('Submit'));
    $this->drupalGet($settingsPath);
    $this->assertResponse(200);

    // Request the PDF page, then read back the generated file.
    $this->drupalGet('printable/pdf/node/' . $node->id());
    $pdfParser = new \Smalot\PdfParser\Parser();
    $pdfText = $pdfParser->parseFile('modules/custom/printable/src/Tests/testPDF.pdf')->getText();

    // Create a second node whose body is the text extracted from the PDF.
    $this->drupalGet('node/add');
    $newTitle = $this->randomMachineName(8);
    $new_edit = array(
        'title[0][value]' => $newTitle,
        'body[0][value]' => $pdfText,
    );
    $this->drupalPostForm('node/add/page', $new_edit, t('Save'));
    $new_node = $this->drupalGetNodeByTitle($newTitle);
    $this->drupalGet('node/' . $new_node->id());
    $this->assertResponse(200);

    // Checks the presence of the original body in the page.
    $this->assertRaw($body, 'Body discovered successfully in the printable page');

    // Check if the footer with the source URL is rendered.
    $this->assertRaw($base_url . '/node/' . $node->id(), 'Source Url discovered in the printable page');
}
/**
 * Walks a fixed directory tree, parses every file with the extension "pdf"
 * and prints its base name and real path.
 *
 * The GridFS handle and the PDF details are fetched but not used any further.
 */
public function actionImport()
{
    require 'vendor/autoload.php';

    $mongo = new MongoClient("mongodb://localhost:27017");
    $grid = $mongo->selectDB('xpps')->getGridFS();

    $parser = new \Smalot\PdfParser\Parser();
    $root = new RecursiveDirectoryIterator('C:\\tmp\\klett-cotta\\daten\\www.traumaundgewalt.de');

    foreach (new RecursiveIteratorIterator($root) as $file) {
        // Skip directories and everything that is not a ".pdf" file.
        if (is_dir($file) || $file->getExtension() != "pdf") {
            continue;
        }

        $document = $parser->parseFile($file);
        $metas = $document->getDetails();

        echo basename($file) . " path: " . realpath($file) . "<br>";
    }
}
/**
 * Finds the PDF link for the current ISO week on the restaurant page and
 * returns the text of that PDF.
 *
 * A link qualifies when it contains ".pdf" (case-insensitive) and "_FMI_", and
 * when the two characters at offset 16 equal the current week number. If
 * several links qualify, the last one is used.
 *
 * @return string|null The extracted text, or null when no matching link was
 *                     found (instead of trying to parse an empty URL).
 */
function pdfToString()
{
    $links = crawl_page("http://www.betriebsrestaurant-gmbh.de/index.php?id=91");

    // Loop-invariant, so computed once.
    $weekNumber = date("W");

    $pdfLink = "";
    foreach ($links as $file) {
        $isCandidate = strpos(strtolower($file), '.pdf') !== FALSE && strpos($file, '_FMI_') !== FALSE;

        if ($isCandidate && $weekNumber === substr($file, 16, 2)) {
            $pdfLink = "http://www.betriebsrestaurant-gmbh.de/" . $file;
        }
    }

    // Don't proceed when no link was found.
    if ($pdfLink === "") {
        return null;
    }

    // Parse pdf file and build necessary objects.
    $parser = new \Smalot\PdfParser\Parser();
    $pdf = $parser->parseFile($pdfLink);

    return $pdf->getText();
}
/**
 * Parses every "*.pdf" file in "<cwd>/samples/bugs" and checks that the text
 * of the first page is a string.
 *
 * Two kinds of failures are tolerated: the "secured pdf" message and any
 * message that mentions TCPDF_PARSER. Every other exception is rethrown.
 *
 * @throws \Exception Any parser exception that is not one of the tolerated ones.
 */
public function testParseFile()
{
    $directory = getcwd() . '/samples/bugs';

    if (!is_dir($directory)) {
        return;
    }

    $parser = new \Smalot\PdfParser\Parser();

    foreach (scandir($directory) as $file) {
        if (!preg_match('/^.*\\.pdf$/i', $file)) {
            continue;
        }

        try {
            $document = $parser->parseFile($directory . '/' . $file);
            $pages = $document->getPages();
            $page = $pages[0];
            $content = $page->getText();
            $this->assert->string($content);
        } catch (\Exception $e) {
            $message = $e->getMessage();

            // strpos() returns false when the needle is absent and 0 when the
            // message starts with it; a loose "!= 0" treats both the same, so
            // the absence has to be tested with a strict comparison.
            $isSecuredPdf = $message == 'Secured pdf file are currently not supported.';
            $isTcpdfError = strpos($message, 'TCPDF_PARSER') !== false;

            if (!$isSecuredPdf && !$isTcpdfError) {
                throw $e;
            }
        }
    }
}
/**
 * Downloads the teacher list, parses "saved/teacher/teacher.pdf" and stores a
 * map of teacher name => list of short tokens in $this->output.
 *
 * Nothing is done when an error is already set or the download fails; when
 * not logged in, the error flag is set. The first text line is discarded. For
 * every other line, a space-separated token belongs to the name when it is
 * longer than 3 bytes, entirely lower case, or contains a '.'; all remaining
 * non-blank tokens go into the list.
 */
public function crawl()
{
    if ($this->hasError()) {
        return;
    }

    if (!$this->isLoggedIn()) {
        $this->setError(true);
        return;
    }

    if (!$this->downloadList()) {
        return;
    }

    $pdfParser = new \Smalot\PdfParser\Parser();
    $document = $pdfParser->parseFile(path . 'saved/teacher/teacher.pdf');

    $rows = explode(PHP_EOL, $document->getText());
    // Drop the first line.
    array_splice($rows, 0, 1);

    $result = array();
    foreach ($rows as $row) {
        $nameTokens = array();
        $shortTokens = array();

        foreach (explode(' ', $row) as $token) {
            if (ctype_space($token) || $token == '') {
                continue;
            }

            $belongsToName = strlen($token) > 3 || ctype_lower($token) || strpos($token, '.') !== false;
            if ($belongsToName) {
                $nameTokens[] = $token;
            } else {
                $shortTokens[] = $token;
            }
        }

        $result[rtrim(implode(' ', $nameTokens))] = $shortTokens;
    }

    $this->output = $result;
}
/**
 * Extracts the text of the price-list PDF with the given id, normalises it and
 * writes it to "<public>/file/<retailer>-pricelist-pdf.txt".
 *
 * Only the first row that can be written successfully is processed; the
 * method returns as soon as one file has been written.
 *
 * @param Request $request      Current request (not used).
 * @param mixed   $pdfid        Value matched against the "id" column of the "pdf" table.
 * @param string  $retailername Retailer name used to build the output file name.
 *
 * @return string HTML alert fragment reporting success or failure.
 */
public function StartExtractPdf(Request $request, $pdfid, $retailername)
{
    $pdfs = \DB::table('pdf')->where('id', '=', $pdfid)->get();

    // NOTE(review): '\\r\\n' in single quotes is the four literal characters
    // backslash-r-backslash-n, not a CRLF sequence — confirm this is intended.
    $toReplace = array('&', ',', '"', '\\r\\n', 'Price', 'Q uad', 'M icro', 'M aximus', 'D D R3');
    $with = array('-', '_', 'inch', ' ', 'Price ', 'Quad', 'Micro', 'Maximus', 'DDR3');

    // The retailer name comes from the caller; basename() strips any directory
    // components so the file cannot be written outside of "<public>/file/".
    $filename = strtolower(basename($retailername)) . "-pricelist-pdf.txt";
    $target = public_path() . "/file/" . $filename;

    foreach ($pdfs as $pdf_url) {
        $parser = new \Smalot\PdfParser\Parser();
        $pdf = $parser->parseFile($pdf_url->pricelist_file);
        $text = $pdf->getText();

        $string = str_replace($toReplace, $with, $text);
        $new = trim(preg_replace('/\\n/', ' ', $string));

        $handle = fopen($target, "w");
        if ($handle === false) {
            // Report the failure through the regular response instead of die().
            break;
        }

        $written = fwrite($handle, $new);
        fclose($handle);

        // Success depends on the write itself, not on the (closed) handle.
        if ($written !== false) {
            return '<div class="alert alert-success">successfully extract data from pdf</div>';
        }
    }

    return '<div class="alert alert-danger">failed to extract data from pdf</div>';
}
/**
 * Parses $this->rawData as a PDF and dispatches the content of every object
 * to the handler of the first known content type whose regex matches it.
 * Afterwards the gaps are filled, the labels are built from the raw labels
 * and the temporary properties are reset.
 *
 * @uses processOuterBorders
 * @uses processGridLine
 * @uses processText
 * @uses processHiddenClue
 * @throws \Exception
 */
private function parseRawData()
{
    $pdfParser = new \Smalot\PdfParser\Parser();
    $document = $pdfParser->parseContent($this->rawData);

    foreach ($document->getObjects() as $pdfObject) {
        $content = $pdfObject->getContent();
        if ('' === $content) {
            continue;
        }

        // Only the first matching content type handles the object.
        foreach (self::$knownContentTypes as $handler => $pattern) {
            if (preg_match($pattern, $content, $matches) !== 1) {
                continue;
            }

            $this->{$handler}($matches);
            break;
        }
    }

    $this->fillGaps();
    $this->labels = $this->labelFactory->getFromRaw($this->labelsRaw);
    $this->resetTempProperties();
}
/**
 * Returns, by reference, the text extracted from the PDF at $this->filename.
 *
 * @return string Text of the whole document.
 */
private function &read_pdf()
{
    $pdfParser = new \Smalot\PdfParser\Parser();

    // A variable is required because the function returns by reference.
    $contents = $pdfParser->parseFile($this->filename)->getText();

    return $contents;
}
/**
 * Extracts the PO value and the ship terms of every page of a PDF.
 *
 * The PO is the trimmed part after the first ':' on the text line at index 10
 * of the page. A page that has no such line, or whose line has no ':', yields
 * an empty PO instead of triggering an undefined-offset error.
 *
 * @param mixed $file File passed on to the PDF parser.
 *
 * @return array One entry per page with the keys 'PO' and 'shipterms'
 *               ("Ground" or "Not Ground").
 */
function getArrayOfPOs($file)
{
    $returnArray = array();

    $parser = new \Smalot\PdfParser\Parser();
    $pdf = $parser->parseFile($file);

    foreach ($pdf->getPages() as $page) {
        $text = nl2br($page->getText());
        $lines = explode('<br />', $text);

        // Guard both lookups: short pages and lines without ':' are possible.
        $poParts = isset($lines[10]) ? explode(':', $lines[10]) : array();

        // Fresh entry per page so nothing leaks from the previous iteration.
        $data = array();
        $data['PO'] = isset($poParts[1]) ? trim($poParts[1]) : '';
        $data['shipterms'] = $this->checkIfGround($text) ? "Ground" : "Not Ground";

        $returnArray[] = $data;
    }

    return $returnArray;
}
/**
 * Extracts the PO value and the ship terms of every page of a PDF.
 *
 * Each text line of a page is tested with checkIfArrayIndexContainsOrder();
 * the PO is the trimmed part after the first ':' of the last line that
 * matches. A page without a matching line, or whose matching line has no ':',
 * yields an empty PO; it no longer inherits the value of the previous page.
 *
 * @param mixed $file File passed on to the PDF parser.
 *
 * @return array One entry per page with the keys 'PO' and 'shipterms'
 *               ("Ground" or "Not Ground").
 */
function getArrayOfPOs($file)
{
    $returnArray = array();

    $parser = new \Smalot\PdfParser\Parser();
    $pdf = $parser->parseFile($file);

    foreach ($pdf->getPages() as $page) {
        $text = nl2br($page->getText());
        $lines = explode('<br />', $text);

        // Reset per page so a page without an order line cannot reuse the
        // parts found on an earlier page.
        $poParts = array();
        foreach ($lines as $line) {
            if ($this->checkIfArrayIndexContainsOrder($line)) {
                // No break: the last matching line wins.
                $poParts = explode(':', $line);
            }
        }

        $data = array();
        $data['PO'] = isset($poParts[1]) ? trim($poParts[1]) : '';
        $data['shipterms'] = $this->checkIfGround($text) ? "Ground" : "Not Ground";

        $returnArray[] = $data;
    }

    return $returnArray;
}
<?php
// Load the Composer autoloader.
include 'vendor/autoload.php';

// PDF to inspect: the first command line argument, or a default file name.
if (isset($argv[1])) {
    $filename = $argv[1];
} else {
    $filename = 'Clipping_Eletronico_ ABC_Grande_Sao_Paulo_e_Llitoral.pdf';
}

// Parse the file.
$pdfParser = new \Smalot\PdfParser\Parser();
$pdf = $pdfParser->parseFile($filename);

// Print every detail of the document; array values are joined with ", ".
$details = $pdf->getDetails();
echo "Metadata <br/>";
foreach ($details as $property => $value) {
    $printable = is_array($value) ? implode(', ', $value) : $value;
    echo $property . ' => ' . $printable . "<br/>";
}

// Print the full text.
echo "\nTexto:<br/>";
$text = $pdf->getText();
echo $text;

// Report whether the searched word occurs in the text.
echo '<br/>Texto procurado: ';
echo strpos($text, 'Assessoria') !== FALSE ? 'Encontrado' : "Não encontrado";
/**
 * Checks that the 'Kids' entry of object '3_0' in the sample document is an
 * ElementArray with exactly one element, and that this element is a Page.
 */
public function testResolveXRef()
{
    $samplePath = __DIR__ . '/../../../../../../samples/Document1_pdfcreator_nocompressed.pdf';

    $pdfParser = new \Smalot\PdfParser\Parser();
    $kids = $pdfParser->parseFile($samplePath)->getObjectById('3_0')->get('Kids');

    $this->assert->object($kids)->isInstanceOf('\\Smalot\\PdfParser\\Element\\ElementArray');
    $this->assert->array($kids->getContent())->hasSize(1);

    // reset() needs a variable, so the content is fetched into one first.
    $children = $kids->getContent();
    $this->assert->object(reset($children))->isInstanceOf('\\Smalot\\PdfParser\\Page');
}
<?php
include 'vendor/autoload.php';

/**
 * Prints key, name, type and content of every font, one font per line.
 *
 * @param array $fonts Fonts to print, indexed by key.
 */
function testFonts($fonts)
{
    foreach ($fonts as $key => $font) {
        $header = sprintf("Key: %s Font: %s Type: %s Details: ", $key, $font->getName(), $font->getType());
        echo $header;
        echo $font->getContent();
        echo "\n";
    }
}

/**
 * Placeholder without any behaviour.
 *
 * @param mixed $page Unused.
 */
function testTextRelationToFont($page)
{
}

// Parse the test document and print the unique id of each page.
$pdfParser = new \Smalot\PdfParser\Parser();
$pdf = $pdfParser->parseFile('pdf/test2.pdf');

foreach ($pdf->getPages() as $page) {
    echo $page->getUniqueId();
}
<?php
header('Content-Type: text/html; charset=UTF-8');
include 'vendor/autoload.php';

// $message holds an error text (if any); $texts holds one entry per PDF page.
$message = '';
$texts = array();

if ($_SERVER['REQUEST_METHOD'] == 'POST') {
    try {
        $content = '';

        // Source 1: a submitted http(s) URL. Source 2: an uploaded file whose
        // reported type is application/pdf.
        $hasUrl = isset($_POST['inputUrl']) && preg_match('/^https?:\\/\\//', trim($_POST['inputUrl']));
        if ($hasUrl) {
            $content = file_get_contents(trim($_POST['inputUrl']));
        } elseif (isset($_FILES['inputFile']) && $_FILES['inputFile']['type'] == 'application/pdf') {
            $content = file_get_contents($_FILES['inputFile']['tmp_name']);
        }

        if (!$content) {
            throw new Exception('Unable to retrieve content. Check if it is really a pdf file.');
        }

        // Collect the text of every page.
        $pdfParser = new \Smalot\PdfParser\Parser();
        $document = $pdfParser->parseContent($content);
        foreach ($document->getPages() as $page) {
            $texts[] = $page->getText();
        }
    } catch (Exception $e) {
        $message = $e->getMessage();
    }
}
?>
<!DOCTYPE html>
<html>
} else { if (strpos($filename, '.xls') !== false || strpos($filename, '.XLS') !== false) { $reader = PHPExcel_IOFactory::load(PROJ_ROOT . '/upload/' . $filename); $sheetData = $reader->getActiveSheet()->toArray(null, true, true, false); } else { if (stripos($filename, '.pdf') !== false) { if (Utils::getValue('template') == 'nestle' || Utils::getValue('template') == 'affinity') { $sheetData = Utils::getRemotePDFtoText(PROJ_ROOT . '/upload/' . $filename); } else { if (Utils::getValue('template') == 'claber') { $sheetData = Utils::getRemotePDFtoText(PROJ_ROOT . '/upload/' . $filename); } else { if (Utils::getValue('template') == 'monge') { $sheetData = Utils::getRemotePDFtoText(PROJ_ROOT . '/upload/' . $filename); } else { $reader = new \Smalot\PdfParser\Parser(); $pdf = $reader->parseFile('upload/' . $filename); $sheetData = $pdf->getPages(); } } } } else { if (strpos($filename, '.txt') !== false || strpos($filename, '.TXT') !== false) { $sheetData = file_get_contents('upload/' . $filename); } } } if (Utils::getValue('template') && !empty(Utils::getValue('template'))) { $className = ucfirst(Utils::getValue('template')) . 'Xml'; if (class_exists($className)) { if (Utils::getValue('template') == 'amazon') {
/**
 * Returns the text of the PDF for the current week, using the APC cache.
 *
 * The cache key is 'hungertext' followed by the week number. On a cache miss
 * the links of URL_PAGE_WITH_LINKS are scanned for one that contains ".pdf"
 * (case-insensitive) and "_FMI_" and whose two characters at offset 16 equal
 * the week number; the last such link is parsed and its text is cached for
 * two days.
 *
 * @return string|null The text, or null when no matching link was found.
 */
function pdfToString()
{
    $weekNumber = date('W');
    $cacheKey = 'hungertext' . $weekNumber;

    // Serve from cache when the current week is already stored.
    $cached = apc_fetch($cacheKey);
    if ($cached !== false) {
        return $cached;
    }

    // Otherwise look for this week's link; later matches replace earlier ones.
    $pdfLink = '';
    foreach (crawl_page(URL_PAGE_WITH_LINKS) as $file) {
        if (strpos(strtolower($file), '.pdf') === FALSE) {
            continue;
        }
        if (strpos($file, '_FMI_') === FALSE) {
            continue;
        }
        if ($weekNumber !== substr($file, 16, 2)) {
            continue;
        }

        $pdfLink = URL_MAIN . $file;
    }

    // Nothing to parse without a link.
    if (empty($pdfLink)) {
        return;
    }

    $pdfParser = new \Smalot\PdfParser\Parser();
    $text = $pdfParser->parseFile($pdfLink)->getText();

    // Keep the result for two days.
    apc_store($cacheKey, $text, 2 * 24 * 3600);

    return $text;
}
/**
 * Execute the console command.
 *
 * Parses the PDF located by parseURLPDF(), extracts the products of every
 * page and saves each one: an existing Product with the same code is filled
 * with the new data, otherwise a new Product is created.
 *
 * @return mixed
 */
public function handle()
{
    $url = $this->parseURLPDF();

    $pdfParser = new \Smalot\PdfParser\Parser();
    $document = $pdfParser->parseFile($url);

    foreach ($document->getPages() as $page) {
        $pageParser = new ParsePage($page->getArray());

        foreach ($pageParser->parseProducts() as $productData) {
            // Reuse the stored product with this code, if there is one.
            $record = Product::whereCode($productData['code'])->first();
            if (is_null($record)) {
                $record = new Product();
            }

            $record->fill($productData);
            $record->save();
        }
    }
}