/**
 * Extract the positioned text of the PDF into a per-file SQLite database.
 *
 * The database lives at <pdf_cache_path>/<file_name>.sq3 and is rebuilt when
 * it is missing or older than the PDF. Work is coordinated between concurrent
 * requests through the "pdflog" database: a row in its "files" table marks a
 * conversion in progress. If another request holds that row, this method
 * waits up to 30 seconds (60 x 0.5 s) for it to finish and never starts a
 * second conversion of the same file.
 *
 * Each <text> element of the pdftohtml XML output becomes one row in the
 * "texts" table; its position and size are stored as percentages of the
 * page height/width.
 *
 * Failures are reported through sendError(). The method returns no value.
 */
public function extractXMLText() {

    // Temporary XML output. The converter is given this base name and is
    // expected to produce "<base>.xml".
    $temp_xml = $this->temp_path . DIRECTORY_SEPARATOR . $this->file_name;
    $xml_file = $temp_xml . '.xml';

    // SQLite storage.
    $temp_db = $this->pdf_cache_path . DIRECTORY_SEPARATOR . $this->file_name . '.sq3';

    // Name under which a running conversion is registered in the log.
    $log_entry = $this->file_name . '.sq3';

    $cache_path = $this->pdf_cache_path;
    $pdf_path = $this->pdf_full_path;

    // True when the database is missing or older than the PDF. The stat cache
    // is cleared first, otherwise a database rebuilt by another process while
    // we were waiting would still report its old modification time.
    $is_stale = function () use ($temp_db, $pdf_path) {
        clearstatcache();
        return !file_exists($temp_db) || filemtime($temp_db) < filemtime($pdf_path);
    };

    // Removes our entry from the conversion log so later requests are not
    // blocked by it.
    $release_log = function () use ($cache_path, $log_entry) {
        $logHandle = database_connect($cache_path, 'pdflog');
        $file_q = $logHandle->quote($log_entry);
        $logHandle->exec("DELETE FROM files WHERE file={$file_q}");
        $logHandle = null;
    };

    // Database not found. Check the log whether conversion is running. If yes, delay.
    if ($is_stale()) {

        for ($i = 1; $i <= 60; $i++) {

            // Stop polling as soon as no conversion is registered any more.
            if (!$this->checkPDFLog($log_entry)) {
                break;
            }

            usleep(500000);
        }
    }

    // Somebody else produced a fresh database in the meantime. Nothing to do.
    if (!$is_stale()) {
        return;
    }

    // Write to log file.
    $logHandle = database_connect($cache_path, 'pdflog');
    $logHandle->exec("CREATE TABLE IF NOT EXISTS files (file TEXT PRIMARY KEY)");
    $file_q = $logHandle->quote($log_entry);
    $insert = $logHandle->exec("INSERT OR IGNORE INTO files VALUES({$file_q})");
    $logHandle = null;

    // IMPORTANT: If no insert due to unique constraint, exit.
    if ($insert === false || $insert === 0) {
        return;
    }

    // Delete stale database. This happens only after the log entry has been
    // acquired, so a request that lost the race cannot remove the database
    // the winning request is currently writing.
    if (file_exists($temp_db)) {
        unlink($temp_db);
    }

    // XML output file not found. Create one.
    // NOTE(review): the paths are interpolated inside double quotes and are
    // assumed to contain no shell metacharacters. escapeshellarg() would be
    // safer, but it can drop non-ASCII bytes depending on the locale - confirm
    // where file_name and pdf_full_path come from before changing this.
    if (!file_exists($xml_file)) {
        system(select_pdftohtml() . ' -q -enc UTF-8 -nomerge -i -hidden -xml "' . $this->pdf_full_path . '" "' . $temp_xml . '"');
    }

    if (!file_exists($xml_file)) {

        // Release the log entry first; sendError() is presumed to end the
        // request, which would otherwise leave the entry behind forever.
        $release_log();
        sendError('PDF to XML conversion failed.');
        return;
    }

    $dbHandle = database_connect($cache_path, $this->file_name);

    $dbHandle->exec("CREATE TABLE IF NOT EXISTS texts ("
            . "id INTEGER PRIMARY KEY, "
            . "top TEXT NOT NULL DEFAULT '', "
            . "left TEXT NOT NULL DEFAULT '', "
            . "height TEXT NOT NULL DEFAULT '', "
            . "width TEXT NOT NULL DEFAULT '', "
            . "text TEXT NOT NULL DEFAULT '', "
            . "page_number INTEGER NOT NULL DEFAULT '')");

    // Try to repair some malformed files. file_get_contents() returns false
    // on failure and preg_replace() with the /u modifier returns null for
    // invalid UTF-8; both cases end up as a load failure below.
    $string = file_get_contents($xml_file);

    if (is_string($string)) {
        $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}\\x{10000}-\\x{10FFFF}]+/u', ' ', $string);
    }

    if (is_string($string)) {
        $string = preg_replace('/\\s{2,}/ui', ' ', $string);
    }

    if (is_string($string)) {
        $string = str_ireplace('<!doctype pdf2xml system "pdf2xml.dtd">', '<!DOCTYPE pdf2xml SYSTEM "pdf2xml.dtd">', $string);
    }

    // Load XML file into object.
    $xml = is_string($string) ? @simplexml_load_string($string) : false;

    if ($xml === false) {

        // NOTE(review): as before, the (empty) database created above stays
        // in place after a parse failure, so the file is not re-parsed on
        // every call - confirm that this is the intended behaviour.
        $dbHandle = null;
        $release_log();
        sendError('Invalid XML encoding.');
        return;
    }

    $dbHandle->beginTransaction();

    // One prepared statement for all rows; values are bound, not interpolated.
    $insert_row = $dbHandle->prepare("INSERT INTO texts (top,left,height,width,text,page_number) "
            . "VALUES(?, ?, ?, ?, ?, ?)");

    // Iterate XML page by page.
    foreach ($xml->page as $page) {

        // Get page number and size.
        $page_number = (string) $page['number'];
        $page_height = (float) $page['height'];
        $page_width = (float) $page['width'];

        // Positions are stored relative to the page size. A page without a
        // usable size cannot be processed (and would divide by zero).
        if ($page_height <= 0 || $page_width <= 0) {
            continue;
        }

        // Get info on each text element.
        foreach ($page->text as $text) {

            // Text content without inline markup such as <b> or <i>.
            $row = strip_tags($text->asXML());

            // Geometry as a percentage of the page. A missing attribute
            // counts as 0.
            $row_top = 100 * round((float) $text['top'] / $page_height, 3);
            $row_left = 100 * round((float) $text['left'] / $page_width, 3);
            $row_height = 100 * round((float) $text['height'] / $page_height, 3);
            $row_width = 100 * round((float) $text['width'] / $page_width, 3);

            $insert_row->execute(array(
                $row_top,
                $row_left,
                $row_height,
                $row_width,
                $row,
                $page_number
            ));
        }
    }

    $insert_row = null;

    $dbHandle->commit();

    $dbHandle->exec("CREATE INDEX IF NOT EXISTS ind_pages ON texts(page_number)");

    $dbHandle = null;

    // We are done, delete from log.
    $release_log();

    // Delete XML file.
    unlink($xml_file);
}
exec(select_pdftotext() . ' test.pdf "' . $temp_dir . DIRECTORY_SEPARATOR . 'test.txt"'); if (file_exists($temp_dir . DIRECTORY_SEPARATOR . 'test.txt')) { unlink($temp_dir . DIRECTORY_SEPARATOR . 'test.txt'); die('OK'); } else { die; } } elseif ($_GET['binary'] == 'pdfinfo') { exec(select_pdfinfo() . ' test.pdf', $output); if (!empty($output)) { die('OK'); } else { die; } } elseif ($_GET['binary'] == 'pdftohtml') { exec(select_pdftohtml() . ' -q -noframes -enc UTF-8 -nomerge -c -xml test.pdf "' . $temp_dir . DIRECTORY_SEPARATOR . 'test"'); if (file_exists($temp_dir . DIRECTORY_SEPARATOR . 'test.xml')) { unlink($temp_dir . DIRECTORY_SEPARATOR . 'test.xml'); die('OK'); } else { die; } } elseif ($_GET['binary'] == 'ghostscript') { exec(select_ghostscript() . ' -sDEVICE=png16m -r15 -dTextAlphaBits=1 -dGraphicsAlphaBits=1 -dFirstPage=1 -dLastPage=1 -o "' . $temp_dir . DIRECTORY_SEPARATOR . 'test.png" test.pdf'); if (file_exists($temp_dir . DIRECTORY_SEPARATOR . 'test.png')) { unlink($temp_dir . DIRECTORY_SEPARATOR . 'test.png'); die('OK'); } else { die; } } elseif ($_GET['binary'] == 'pdftk') {
$file = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'library' . DIRECTORY_SEPARATOR . $file_name; if (!file_exists($file)) { die('Error! PDF does not exist!'); } } else { die('Error! No PDF provided!'); } $page = ''; if (!empty($_GET['page'])) { $page = intval($_GET['page']); } else { die('Error! No page number provided!'); } $temp_xml = $temp_dir . DIRECTORY_SEPARATOR . $file_name; if (!file_exists($temp_xml . $page . '.xml') || filemtime($temp_xml . $page . '.xml') < filemtime($file)) { system(select_pdftohtml() . ' -q -noframes -enc UTF-8 -nomerge -c -i -xml -f ' . $page . ' -l ' . $page . ' "' . $file . '" "' . $temp_xml . $page . '"'); } if (file_exists($temp_xml . $page . '.xml')) { $string = file_get_contents($temp_xml . $page . '.xml'); $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string); $string = preg_replace('/\\s{2,}/ui', ' ', $string); $string = str_ireplace('<!doctype pdf2xml system "pdf2xml.dtd">', '<!DOCTYPE pdf2xml SYSTEM "pdf2xml.dtd">', $string); $xml = @simplexml_load_string($string); if (!$xml) { die('{"Error":"Invalid XML encoding!"}'); } foreach ($xml->page->attributes() as $a => $b) { if ($a == 'height') { $page_height = $b; } if ($a == 'width') {
exec(select_pdftotext() . ' -enc UTF-8 test.pdf "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.txt"'); if (file_exists(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.txt')) { unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.txt'); die('OK'); } else { die; } } elseif ($_GET['binary'] == 'pdfinfo') { exec(select_pdfinfo() . ' test.pdf', $output); if (!empty($output)) { die('OK'); } else { die; } } elseif ($_GET['binary'] == 'pdftohtml') { exec(select_pdftohtml() . ' -q -noframes -enc UTF-8 -nomerge -c -xml test.pdf "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test"'); if (file_exists(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.xml')) { unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.xml'); die('OK'); } else { die; } } elseif ($_GET['binary'] == 'pdfdetach') { exec(select_pdfdetach() . ' -saveall -o "' . IL_TEMP_PATH . '" test.pdf'); if (is_readable(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.odt') && filesize(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.odt') > 0) { unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.odt'); die('OK'); } else { die; } } elseif ($_GET['binary'] == 'ghostscript') {