Пример #1
0
 /**
  * Object constructor.
  *
  * Parses the given PDF content, adds the extracted text as the "body" field
  * and adds one text field per metadata entry returned by the parser
  * (the "author" entry is added under the name "creator").
  *
  * @param string  $data         PDF content handed to Parser::parseContent()
  * @param boolean $storeContent When true the body is added with Field::Text(),
  *                              otherwise with Field::UnStored()
  * @throws NotIndexedException  When parsing or field creation throws; the
  *                              original exception is attached as "previous"
  */
 private function __construct($data, $storeContent)
 {
     //TODO check PDF >1.5 metadata extraction
     $parser = new Parser();
     try {
         // do the content extraction
         $pdf = $parser->parseContent($data);
         $body = $pdf->getText();
         // Store contents
         if ($storeContent) {
             $this->addField(Document\Field::Text('body', $body, 'UTF-8'));
         } else {
             $this->addField(Document\Field::UnStored('body', $body, 'UTF-8'));
         }
         // Store meta data properties
         foreach ($pdf->getDetails() as $key => $value) {
             $key = strtolower($key);
             if ($key === 'author') {
                 $key = 'creator';
             }
             // A metadata entry may hold several values; flatten it so the
             // field always receives a string instead of an array.
             if (is_array($value)) {
                 $value = implode(', ', array_filter($value, 'is_scalar'));
             }
             $this->addField(Document\Field::Text($key, $value, 'UTF-8'));
         }
     } catch (\Exception $ex) {
         // Keep the cause's message instead of passing null for message/code.
         throw new NotIndexedException($ex->getMessage(), 0, $ex);
     }
 }
Пример #2
0
 /**
  * Parses the file referenced by $this->filename, keeps the document details
  * in $this->metadata and the text of every page in $this->pages, keyed by
  * the page's position plus one.
  *
  * @When I parse the pdf content
  */
 public function iParseThePdfContent()
 {
     $pdfParser = new Parser();
     $document = $pdfParser->parseFile($this->filename);
     $pageList = $document->getPages();
     $this->metadata = $document->getDetails();
     foreach ($pageList as $index => $page) {
         // Page numbers exposed to the steps start at 1, not 0.
         $pageNumber = $index + 1;
         $this->pages[$pageNumber] = $page->getText();
     }
 }
Пример #3
0
 /**
  * Handle action related to mime type detection.
  * These actions can be "exclude" or "link", to handle custom content (like image, video, pdf, etc.).
  *
  * For the "link" action an HTML fragment is built from the url, the mime
  * name, the PDF text or the plain-text body. All of these are escaped
  * before being placed into markup, since they originate from the fetched
  * resource.
  *
  * @param array  $mimeInfo     From getMimeActionInfo() function
  * @param string $effectiveUrl Current content url
  * @param string $body         Content from the response
  *
  * @return array|null Content information for the "link" action, null when
  *                    there is no action or the action is not handled
  *
  * @throws \Exception When the action is "exclude"
  */
 private function handleMimeAction($mimeInfo, $effectiveUrl, $body = '')
 {
     if (!isset($mimeInfo['action'])) {
         return null;
     }
     $infos = array('status' => 200, 'title' => $mimeInfo['name'], 'language' => '', 'html' => '', 'url' => $effectiveUrl, 'content_type' => $mimeInfo['mime'], 'open_graph' => array());
     switch ($mimeInfo['action']) {
         case 'exclude':
             throw new \Exception(sprintf('This is url "%s" is blocked by mime action.', $effectiveUrl));
         case 'link':
             $escapeFlags = ENT_QUOTES | ENT_SUBSTITUTE;
             // Escaped copies are used only inside markup; 'url' and 'title' keep the raw values.
             $safeUrl = htmlspecialchars($effectiveUrl, $escapeFlags, 'UTF-8');
             $safeName = htmlspecialchars($mimeInfo['name'], $escapeFlags, 'UTF-8');
             $infos['html'] = '<a href="' . $safeUrl . '">Download ' . $safeName . '</a>';
             if (isset($mimeInfo['type']) && 'image' === $mimeInfo['type']) {
                 $infos['html'] = '<a href="' . $safeUrl . '"><img src="' . $safeUrl . '" alt="' . $safeName . '" /></a>';
             }
             if ('application/pdf' === $mimeInfo['mime']) {
                 $parser = new PdfParser();
                 $pdf = $parser->parseFile($effectiveUrl);
                 // tiny hack to avoid character like �
                 $text = mb_convert_encoding($pdf->getText(), 'UTF-8', 'UTF-8');
                 // the extracted text is plain text: escape it before turning newlines into <br />
                 $html = nl2br(htmlspecialchars($text, $escapeFlags, 'UTF-8'));
                 // strip away unwanted chars (that usually come from PDF extracted content)
                 // @see http://www.phpwact.org/php/i18n/charsets#common_problem_areas_with_utf-8
                 $cleaned = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $html);
                 // preg_replace() returns null on failure; keep the uncleaned html in that case
                 $infos['html'] = null === $cleaned ? $html : $cleaned;
                 // update title in case details are present
                 $details = $pdf->getDetails();
                 // Title can be a string or an array with one key
                 if (isset($details['Title'])) {
                     if (is_array($details['Title']) && isset($details['Title'][0]) && '' !== trim($details['Title'][0])) {
                         $infos['title'] = $details['Title'][0];
                     } elseif (is_string($details['Title']) && '' !== trim($details['Title'])) {
                         $infos['title'] = $details['Title'];
                     }
                 }
             }
             if ('text/plain' === $mimeInfo['mime']) {
                 $infos['html'] = '<pre>' . htmlspecialchars($body, $escapeFlags, 'UTF-8') . '</pre>';
             }
             return $infos;
     }
     return null;
 }
Пример #4
0
 /**
  * Creates an instance from PDF data.
  *
  * The data is handed unchanged to the parser's parseContent() and the
  * parsed document is passed to the constructor.
  *
  * @param mixed $pdf_data PDF data accepted by Parser::parseContent()
  *
  * @return self
  */
 public static function analyze($pdf_data)
 {
     $pdfParser = new PdfParser\Parser();
     $document = $pdfParser->parseContent($pdf_data);

     return new self($document);
 }
Пример #5
0
 /**
  * Saves the uploaded "document" file as a PDF, extracts its text and cuts
  * the submission details out of it using the form headings as delimiters.
  * The collected details are persisted through PdfWork::create() and shown
  * with the "pages.pdfWorkInfo" view.
  *
  * All offsets are byte offsets (strpos/substr), so they are only meaningful
  * relative to each other.
  *
  * @param Request $request Request that may carry the "document" upload
  *
  * @return mixed Result of view('pages.pdfWorkInfo', ...)
  */
 public function judge(Request $request)
 {
     $fileName = uniqid();
     $pdfName = $fileName . '.pdf';
     // Store the upload in the current directory when the request carries one.
     // NOTE(review): without an upload the parser below is still pointed at a
     // file this method never wrote — confirm whether that case can occur.
     if ($request->hasFile('document')) {
         $request->file('document')->move('./', $pdfName);
     }
     $parser = new Parser();
     $text = $parser->parseFile($pdfName)->getText();

     // Position of every heading that delimits a section of the form.
     $workNameAt = strpos($text, '作品名称');
     $schoolAt = strpos($text, '学校全称');
     $applicantAt = strpos($text, '申报者姓名');
     $categoryAt = strpos($text, '类别');
     $checkedBoxAt = strpos($text, '■');
     $aimAt = strpos($text, '作品撰写的目的和基本思路');
     $briefAt = strpos($text, '作 品 的 科 学性、先进性及独特之处');
     $valueAt = strpos($text, '作品的实际应用价值');

     // Cuts the text between two offsets and removes the given fragments from it.
     $section = function ($from, $to, array $strip) use ($text) {
         return str_replace($strip, '', substr($text, $from, $to - $from));
     };

     // Work name
     $workName = $section($workNameAt, $schoolAt, array('作品名称:', ' '));
     // School name
     $schoolName = $section($schoolAt, $applicantAt, array('学校全称:', ' '));
     // Applicant name
     $peopleName = $section($applicantAt, $categoryAt, array('申报者姓名 (集体名称):', '申报者姓名:', ' '));
     // Purpose of the work
     $aim = $section($aimAt, $briefAt, array('作品撰写的目的和基本思路', ' '));
     // Scientific merit, novelty and uniqueness of the work.
     // The heading is listed twice on purpose: str_replace applies the entries
     // in order, so the second pass catches the heading once the spaces are gone.
     $brief = $section($briefAt, $valueAt, array('作品的科学性、先进性及独特之处', ' ', '作品的科学性、先进性及独特之处'));

     // The selected category is the one whose label starts 4 bytes after the
     // checked-box mark; when several match, the last one in this list wins.
     $categories = array(
         '自然科学类学术论文',
         '哲学社会科学类社会调查报告和学术论文',
         '科技发明制作Α类',
         '科技发明制作Β类',
     );
     $class = '';
     foreach ($categories as $category) {
         if (strpos($text, $category) - $checkedBoxAt === 4) {
             $class = $category;
         }
     }

     $info = array(
         'name' => $workName,
         'school' => $schoolName,
         'people' => $peopleName,
         'big_class' => $class,
         'aim' => $aim,
         'brief' => $brief,
         'document_name' => $fileName,
     );
     PdfWork::create($info);

     return view('pages.pdfWorkInfo', array('info' => $info));
 }
Пример #6
0
 // Take the link target of the current node; it must point at a PDF.
 $pdfFileLink = $linkNode->attributes['href']->value;
 if ($pdfFileLink === '' || stripos($pdfFileLink, '.pdf') === false) {
     throw new Exception('No RUNLOG PDF link found from the web page!');
 }
 // filename.txt remembers the last file name seen; skip a link that was seen already.
 $fileName = basename($pdfFileLink);
 $lastFileStore = __DIR__ . '/filename.txt';
 // On the very first run the store does not exist yet: treat that as "nothing seen".
 $lastFileName = is_file($lastFileStore) ? trim((string) file_get_contents($lastFileStore)) : '';
 if ($fileName !== '' && $fileName === $lastFileName) {
     continue;
 }
 file_put_contents($lastFileStore, $fileName);
 // Download the file. The read is checked on its own: a failed read returns
 // false, which would otherwise be written as an empty file and go unnoticed.
 $pdfContents = file_get_contents($pdfFileLink);
 $file = $pdfContents === false ? false : file_put_contents(LOCAL_PDF_FILENAME, $pdfContents);
 if ($file === false) {
     throw new Exception('Failed to download the RUNLOG PDF file!');
 }
 // Parse pdf file and build necessary objects.
 $parser = new Parser();
 $pdf = $parser->parseFile(LOCAL_PDF_FILENAME);
 $text = $pdf->getText();
 if (($pos = strpos($text, $config['searchName'])) === false) {
     throw new Exception('Could not find your name in the file! Did you really run?');
 }
 // My name with the run count at the end: everything from the name up to the
 // end of its line, or up to the end of the text when no line break follows.
 $lineEnd = strpos($text, PHP_EOL, $pos);
 $me = $lineEnd === false ? substr($text, $pos) : substr($text, $pos, $lineEnd - $pos);
 // Get the run count.
 // NOTE(review): the offset 8 presumably skips the name itself — confirm it matches $config['searchName'].
 $runCount = intval(substr($me, 8));
 // Post the count to Numerous.
 // NUMEROUS HAS SHUT DOWN!!!
 // $response = Request::post("https://api.numerousapp.com/v2/metrics/{$config['numerousMetricId']}/events")
 //     ->sendsJson()                                       // tell it we're sending (Content-Type) JSON...
 //     ->authenticateWith($config['numerousApiKey'], '')   // authenticate with basic auth...
 //     ->body('{"value":"' . $runCount . '"}')             // attach a body/payload...
Пример #7
0
 /**
  * Handle action related to mime type detection.
  * These actions can be "exclude" or "link", to handle custom content (like image, video, pdf, etc.).
  *
  * For the "link" action an HTML fragment is built from the url, the mime
  * name, the PDF text or the plain-text body. All of these are escaped
  * before being placed into markup, since they originate from the fetched
  * resource.
  *
  * @param array  $mimeInfo      From getMimeActionInfo() function
  * @param string $effective_url Current content url
  * @param string $body          Content from the response
  *
  * @return array|null Content information for the "link" action, null when
  *                    there is no action or the action is not handled
  *
  * @throws \Exception When the action is "exclude"
  */
 private function handleMimeAction($mimeInfo, $effective_url, $body = '')
 {
     if (!isset($mimeInfo['action'])) {
         return null;
     }
     $infos = array('status' => 200, 'title' => $mimeInfo['name'], 'language' => '', 'html' => '', 'url' => $effective_url, 'content_type' => $mimeInfo['mime'], 'open_graph' => array());
     switch ($mimeInfo['action']) {
         case 'exclude':
             throw new \Exception(sprintf('This is url "%s" is blocked by mime action.', $effective_url));
         case 'link':
             $escapeFlags = ENT_QUOTES | ENT_SUBSTITUTE;
             // Escaped copies are used only inside markup; 'url' and 'title' keep the raw values.
             $safeUrl = htmlspecialchars($effective_url, $escapeFlags, 'UTF-8');
             $safeName = htmlspecialchars($mimeInfo['name'], $escapeFlags, 'UTF-8');
             $infos['html'] = '<a href="' . $safeUrl . '">Download ' . $safeName . '</a>';
             if (isset($mimeInfo['type']) && 'image' === $mimeInfo['type']) {
                 $infos['html'] = '<a href="' . $safeUrl . '"><img src="' . $safeUrl . '" alt="' . $safeName . '" /></a>';
             }
             if ('application/pdf' === $mimeInfo['mime']) {
                 $parser = new PdfParser();
                 $pdf = $parser->parseFile($effective_url);
                 // Normalise the extracted text to UTF-8 first, then escape it
                 // (it is plain text) before turning newlines into <br />.
                 $text = Encoding::toUTF8($pdf->getText());
                 $infos['html'] = nl2br(htmlspecialchars($text, $escapeFlags, 'UTF-8'));
                 // update title in case details are present
                 $details = $pdf->getDetails();
                 // Title can be a string or an array with one key
                 if (isset($details['Title'])) {
                     if (is_array($details['Title']) && isset($details['Title'][0]) && '' !== trim($details['Title'][0])) {
                         $infos['title'] = $details['Title'][0];
                     } elseif (is_string($details['Title']) && '' !== trim($details['Title'])) {
                         $infos['title'] = $details['Title'];
                     }
                 }
             }
             if ('text/plain' === $mimeInfo['mime']) {
                 $infos['html'] = '<pre>' . htmlspecialchars($body, $escapeFlags, 'UTF-8') . '</pre>';
             }
             return $infos;
     }
     return null;
 }
Пример #8
0
 /**
  * Extracts the text of every page of a PDF file.
  *
  * @param string $file Path handed to the parser's parseFile()
  *
  * @return string Text of all pages, each page preceded by a single space
  *                (an empty string when the document has no pages)
  */
 public function read_file_pdf($file)
 {
     $pdfParser = new \Smalot\PdfParser\Parser();
     $document = $pdfParser->parseFile($file);

     // Prefix every page with a space, then glue the pieces together.
     $chunks = array_map(function ($page) {
         return ' ' . $page->getText();
     }, $document->getPages());

     return implode('', $chunks);
 }
Пример #9
0
 /**
  * Extract content from resource.
  *
  * The response body of the resource is parsed as PDF content and the
  * extracted text is passed through stripBinaryContent() before it is
  * returned.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string
  */
 public function extractContentFromResource(Resource $resource)
 {
     $rawBody = $resource->getResponse()->getBody(true);
     $document = $this->pdfParser->parseContent($rawBody);
     $text = $document->getText();

     return $this->stripBinaryContent($text);
 }