/**
 * Object constructor.
 *
 * Parses the given PDF data, adds the extracted text as the "body" field and
 * adds one text field per metadata entry reported by the parser. The
 * "author" metadata key is indexed under the name "creator".
 *
 * @param string  $data         PDF content handed to the parser
 * @param boolean $storeContent When true the body is added as a Text field,
 *                              otherwise as an UnStored field
 * @throws NotIndexedException  When parsing or field creation throws; the
 *                              original exception is attached as "previous"
 */
private function __construct($data, $storeContent)
{
    //TODO check PDF >1.5 metadata extraction

    $parser = new Parser();

    try {
        // Content extraction
        $pdf  = $parser->parseContent($data);
        $body = $pdf->getText();

        // Store contents
        if ($storeContent) {
            $this->addField(Document\Field::Text('body', $body, 'UTF-8'));
        } else {
            $this->addField(Document\Field::UnStored('body', $body, 'UTF-8'));
        }

        // Store meta data properties
        foreach ($pdf->getDetails() as $key => $value) {
            $key = strtolower($key);
            if ($key === 'author') {
                $key = 'creator';
            }

            // NOTE(review): the parser appears to report some properties as
            // arrays rather than strings - confirm against the parser docs.
            // Flatten them so the field always receives a string.
            if (is_array($value)) {
                $value = implode(', ', array_filter($value, 'is_scalar'));
            }

            $this->addField(Document\Field::Text($key, $value, 'UTF-8'));
        }
    } catch (\Exception $ex) {
        // Propagate the cause's message and code instead of null values so
        // the failure reason is visible without unwrapping the exception.
        throw new NotIndexedException($ex->getMessage(), (int) $ex->getCode(), $ex);
    }
}
/**
 * Parses the PDF at $this->filename, keeping its metadata and the text of
 * every page (page numbers start at 1).
 *
 * @When I parse the pdf content
 */
public function iParseThePdfContent()
{
    $document = (new Parser())->parseFile($this->filename);

    $pageList       = $document->getPages();
    $this->metadata = $document->getDetails();

    foreach ($pageList as $offset => $singlePage) {
        // Pages are stored under offset + 1.
        $pageNumber = $offset + 1;
        $this->pages[$pageNumber] = $singlePage->getText();
    }
}
/**
 * Handle action related to mime type detection.
 * These actions can be "exclude" or "link", to handle custom content (like image, video, pdf, etc.).
 *
 * Every dynamic value placed into the generated HTML (url, mime name, PDF
 * text, response body) is HTML-escaped, since it originates from the remote
 * response.
 *
 * @param array  $mimeInfo     From getMimeActionInfo() function
 * @param string $effectiveUrl Current content url
 * @param string $body         Content from the response
 *
 * @return array|null Content information for the "link" action, null when no (or an unknown) action is set
 *
 * @throws \Exception When the action is "exclude"
 */
private function handleMimeAction($mimeInfo, $effectiveUrl, $body = '')
{
    if (!isset($mimeInfo['action'])) {
        return null;
    }

    $escapeFlags = ENT_QUOTES | ENT_SUBSTITUTE;
    $escapedUrl = htmlspecialchars((string) $effectiveUrl, $escapeFlags, 'UTF-8');
    $escapedName = htmlspecialchars((string) $mimeInfo['name'], $escapeFlags, 'UTF-8');

    $infos = array(
        'status' => 200,
        'title' => $mimeInfo['name'],
        'language' => '',
        'html' => '',
        'url' => $effectiveUrl,
        'content_type' => $mimeInfo['mime'],
        'open_graph' => array(),
    );

    switch ($mimeInfo['action']) {
        case 'exclude':
            throw new \Exception(sprintf('This is url "%s" is blocked by mime action.', $effectiveUrl));
        case 'link':
            // Default rendering: a plain download link
            $infos['html'] = '<a href="' . $escapedUrl . '">Download ' . $escapedName . '</a>';

            if ($mimeInfo['type'] == 'image') {
                $infos['html'] = '<a href="' . $escapedUrl . '"><img src="' . $escapedUrl . '" alt="' . $escapedName . '" /></a>';
            }

            if ($mimeInfo['mime'] == 'application/pdf') {
                $parser = new PdfParser();
                $pdf = $parser->parseFile($effectiveUrl);

                // tiny hack to avoid character like �
                $text = mb_convert_encoding($pdf->getText(), 'UTF-8', 'UTF-8');

                // strip away unwanted chars (that usually come from PDF extracted content)
                // @see http://www.phpwact.org/php/i18n/charsets#common_problem_areas_with_utf-8
                $cleaned = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $text);
                if (null === $cleaned) {
                    // preg_replace() reports failure with null; keep the unstripped text in that case
                    $cleaned = $text;
                }

                // Escape first, then turn line breaks into <br /> so the tags themselves survive
                $infos['html'] = nl2br(htmlspecialchars($cleaned, $escapeFlags, 'UTF-8'));

                // update title in case details are present
                $details = $pdf->getDetails();

                // Title can be a string or an array with one key
                if (isset($details['Title'])) {
                    if (is_array($details['Title']) && isset($details['Title'][0]) && '' !== trim($details['Title'][0])) {
                        $infos['title'] = $details['Title'][0];
                    } elseif (is_string($details['Title']) && '' !== trim($details['Title'])) {
                        $infos['title'] = $details['Title'];
                    }
                }
            }

            if ($mimeInfo['mime'] == 'text/plain') {
                $infos['html'] = '<pre>' . htmlspecialchars((string) $body, $escapeFlags, 'UTF-8') . '</pre>';
            }

            return $infos;
    }

    return null;
}
/**
 * Create an instance from PDF data by parsing it first.
 *
 * @param string $pdf_data PDF content passed to the parser's parseContent()
 *
 * @return self Instance constructed from the parsed document
 */
public static function analyze($pdf_data)
{
    $parsedDocument = (new PdfParser\Parser())->parseContent($pdf_data);

    return new self($parsedDocument);
}
/**
 * Store the uploaded PDF, extract the form fields from its text and persist
 * them as a PdfWork record.
 *
 * @param Request $request Request carrying the PDF in its "document" file field
 *
 * @return mixed Result of view('pages.pdfWorkInfo') populated with the extracted info
 */
public function judge(Request $request)
{
    // Without an upload there is no file on disk to parse, so fail with a
    // client error instead of letting the parser fail on a missing file.
    if (!$request->hasFile('document')) {
        abort(400, 'No PDF document was uploaded.');
    }

    $fileName = uniqid();

    // NOTE(review): the upload is moved into the current working directory;
    // confirm this location is not publicly served.
    $request->file('document')->move('./', $fileName . '.pdf');

    $parser = new Parser();
    $text = $parser->parseFile($fileName . '.pdf')->getText();

    $info = array();

    // Work name
    $info['name'] = $this->extractPdfSection($text, '作品名称', '学校全称', array('作品名称:', ' '));

    // School name
    $info['school'] = $this->extractPdfSection($text, '学校全称', '申报者姓名', array('学校全称:', ' '));

    // Applicant name
    $info['people'] = $this->extractPdfSection(
        $text,
        '申报者姓名',
        '类别',
        array('申报者姓名 (集体名称):', '申报者姓名:', ' ')
    );

    // Category
    $info['big_class'] = $this->detectPdfCategory($text);

    // Purpose of the work
    $info['aim'] = $this->extractPdfSection(
        $text,
        '作品撰写的目的和基本思路',
        '作 品 的 科 学性、先进性及独特之处',
        array('作品撰写的目的和基本思路', ' ')
    );

    // Scientific merit, advancement and uniqueness of the work.
    // The heading is listed twice on purpose: the second entry matches only
    // after the spaces inside the heading have been removed.
    $info['brief'] = $this->extractPdfSection(
        $text,
        '作 品 的 科 学性、先进性及独特之处',
        '作品的实际应用价值',
        array('作品的科学性、先进性及独特之处', ' ', '作品的科学性、先进性及独特之处')
    );

    $info['document_name'] = $fileName;

    PdfWork::create($info);

    return view('pages.pdfWorkInfo', compact('info'));
}

/**
 * Return the text between the first occurrence of two markers, with the
 * given substrings removed.
 *
 * @param string $text        Full text extracted from the PDF
 * @param string $startMarker Marker where the section begins (included before stripping)
 * @param string $endMarker   Marker where the section ends (excluded)
 * @param array  $strip       Substrings removed from the section, applied in order
 *
 * @return string The cleaned section, or '' when a marker is missing or out of order
 */
private function extractPdfSection($text, $startMarker, $endMarker, array $strip)
{
    $start = strpos($text, $startMarker);
    $end = strpos($text, $endMarker);

    if ($start === false || $end === false || $end <= $start) {
        return '';
    }

    return str_replace($strip, '', substr($text, $start, $end - $start));
}

/**
 * Find the category whose label starts exactly 4 bytes after the first "■".
 *
 * @param string $text Full text extracted from the PDF
 *
 * @return string The matching category label, or '' when none matches
 */
private function detectPdfCategory($text)
{
    $tick = strpos($text, '■');
    if ($tick === false) {
        return '';
    }

    // NOTE(review): 4 is presumably the byte length of "■" plus one
    // separator byte - confirm against a real form.
    $expectedOffset = 4;

    $labels = array(
        '自然科学类学术论文',
        '哲学社会科学类社会调查报告和学术论文',
        '科技发明制作Α类',
        '科技发明制作Β类',
    );

    $category = '';
    foreach ($labels as $label) {
        $position = strpos($text, $label);
        if ($position !== false && $position - $tick === $expectedOffset) {
            $category = $label;
        }
    }

    return $category;
}
$pdfFileLink = $linkNode->attributes['href']->value; if ($pdfFileLink === '' || stripos($pdfFileLink, '.pdf') === false) { throw new Exception('No RUNLOG PDF link found from the web page!'); } $fileName = basename($pdfFileLink); if ($fileName !== '' && $fileName === trim(file_get_contents(__DIR__ . '/filename.txt'))) { continue; } file_put_contents(__DIR__ . '/filename.txt', $fileName); // Download the file $file = file_put_contents(LOCAL_PDF_FILENAME, file_get_contents($pdfFileLink)); if ($file === false) { throw new Exception('Failed to download the RUNLOG PDF file!'); } // Parse pdf file and build necessary objects. $parser = new Parser(); $pdf = $parser->parseFile(LOCAL_PDF_FILENAME); $text = $pdf->getText(); if (($pos = strpos($text, $config['searchName'])) === false) { throw new Exception('Could not find your name in the file! Did you really run?'); } // My name with the run count at the end $me = substr($text, $pos, strpos($text, PHP_EOL, $pos) - $pos); // Get the run count $runCount = intval(substr($me, 8)); // Post the count to Numerous. // NUMEROUS HAS SHUT DOWN!!! // $response = Request::post("https://api.numerousapp.com/v2/metrics/{$config['numerousMetricId']}/events") // ->sendsJson() // tell it we're sending (Content-Type) JSON... // ->authenticateWith($config['numerousApiKey'], '') // authenticate with basic auth... // ->body('{"value":"' . $runCount . '"}') // attach a body/payload...
/**
 * Handle action related to mime type detection.
 * These actions can be "exclude" or "link", to handle custom content (like image, video, pdf, etc.).
 *
 * Every dynamic value placed into the generated HTML (url, mime name, PDF
 * text, response body) is HTML-escaped, since it originates from the remote
 * response.
 *
 * @param array  $mimeInfo      From getMimeActionInfo() function
 * @param string $effective_url Current content url
 * @param string $body          Content from the response
 *
 * @return array|null Content information for the "link" action, null when no (or an unknown) action is set
 *
 * @throws \Exception When the action is "exclude"
 */
private function handleMimeAction($mimeInfo, $effective_url, $body = '')
{
    if (!isset($mimeInfo['action'])) {
        return null;
    }

    $escapeFlags = ENT_QUOTES | ENT_SUBSTITUTE;
    $escapedUrl = htmlspecialchars((string) $effective_url, $escapeFlags, 'UTF-8');
    $escapedName = htmlspecialchars((string) $mimeInfo['name'], $escapeFlags, 'UTF-8');

    $infos = array(
        'status' => 200,
        'title' => $mimeInfo['name'],
        'language' => '',
        'html' => '',
        'url' => $effective_url,
        'content_type' => $mimeInfo['mime'],
        'open_graph' => array(),
    );

    switch ($mimeInfo['action']) {
        case 'exclude':
            throw new \Exception(sprintf('This is url "%s" is blocked by mime action.', $effective_url));
        case 'link':
            // Default rendering: a plain download link
            $infos['html'] = '<a href="' . $escapedUrl . '">Download ' . $escapedName . '</a>';

            if ($mimeInfo['type'] == 'image') {
                $infos['html'] = '<a href="' . $escapedUrl . '"><img src="' . $escapedUrl . '" alt="' . $escapedName . '" /></a>';
            }

            if ($mimeInfo['mime'] == 'application/pdf') {
                $parser = new PdfParser();
                $pdf = $parser->parseFile($effective_url);

                // Convert to UTF-8 before escaping, then turn line breaks
                // into <br /> so the tags themselves are not escaped.
                $text = Encoding::toUTF8($pdf->getText());
                $infos['html'] = nl2br(htmlspecialchars($text, $escapeFlags, 'UTF-8'));

                // update title in case details are present
                $details = $pdf->getDetails();

                // Title can be a string or an array with one key
                if (isset($details['Title'])) {
                    if (is_array($details['Title']) && isset($details['Title'][0]) && '' !== trim($details['Title'][0])) {
                        $infos['title'] = $details['Title'][0];
                    } elseif (is_string($details['Title']) && '' !== trim($details['Title'])) {
                        $infos['title'] = $details['Title'];
                    }
                }
            }

            if ($mimeInfo['mime'] == 'text/plain') {
                $infos['html'] = '<pre>' . htmlspecialchars((string) $body, $escapeFlags, 'UTF-8') . '</pre>';
            }

            return $infos;
    }

    return null;
}
/**
 * Extract the text of every page of a PDF file.
 *
 * @param string $file Path handed to the parser's parseFile()
 *
 * @return string Text of all pages, each page preceded by a single space
 */
public function read_file_pdf($file)
{
    $document = (new \Smalot\PdfParser\Parser())->parseFile($file);

    $collected = '';

    // Append page by page; every page's text is prefixed with one space.
    foreach ($document->getPages() as $singlePage) {
        $collected .= ' ' . $singlePage->getText();
    }

    return $collected;
}
/**
 * Extract content from resource.
 *
 * Parses the resource's response body with the configured PDF parser and
 * passes the extracted text through stripBinaryContent().
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return string
 */
public function extractContentFromResource(Resource $resource)
{
    $responseBody = $resource->getResponse()->getBody(true);
    $document = $this->pdfParser->parseContent($responseBody);
    $extractedText = $document->getText();

    return $this->stripBinaryContent($extractedText);
}