Ejemplo n.º 1
0
 /**
  * Build the per-PDF SQLite text cache from pdftohtml's XML output.
  *
  * The cache file is "<pdf_cache_path>/<file_name>.sq3". It is rebuilt when
  * it is missing or older than the PDF. A row in the "pdflog" database acts
  * as a lock so that only one request converts a given PDF at a time; other
  * requests wait up to ~30 seconds (60 polls, 0.5 s apart) for it to finish.
  *
  * Each <text> element of every <page> is stored in table "texts" with its
  * position and size expressed as a percentage of the page dimensions.
  *
  * On failure the lock row is removed before sendError() is called, so a
  * failed conversion does not block later attempts.
  * NOTE(review): sendError() is defined elsewhere and presumably terminates
  * the request; this method returns right after it in case it does not.
  *
  * @return void
  */
 public function extractXMLText()
 {
     // Temporary XML output; pdftohtml is given the name without extension.
     $temp_xml = $this->temp_path . DIRECTORY_SEPARATOR . $this->file_name;
     $xml_file = $temp_xml . '.xml';
     // SQLite storage.
     $temp_db = $this->pdf_cache_path . DIRECTORY_SEPARATOR . $this->file_name . '.sq3';
     // Key identifying this conversion in the pdflog lock table.
     $log_key = $this->file_name . '.sq3';
     $cache_path = $this->pdf_cache_path;
     $pdf_path = $this->pdf_full_path;

     // True when the cache database is missing or older than the PDF.
     $is_stale = function () use ($temp_db, $pdf_path) {
         // File status results are cached by PHP; drop them so that a
         // database written by another request while we waited is seen.
         clearstatcache();
         return !file_exists($temp_db) || filemtime($temp_db) < filemtime($pdf_path);
     };

     // Removes this file's row from the pdflog lock table.
     $release_log = function () use ($cache_path, $log_key) {
         $logHandle = database_connect($cache_path, 'pdflog');
         $file_q = $logHandle->quote($log_key);
         $logHandle->exec("DELETE FROM files WHERE file={$file_q}");
         $logHandle = null;
     };

     // Database not usable. Check the log whether conversion is running. If yes, delay.
     if ($is_stale()) {
         for ($i = 1; $i <= 60; $i++) {
             if (!$this->checkPDFLog($log_key)) {
                 // No conversion is logged as running; stop polling.
                 break;
             }
             usleep(500000);
         }
     }

     // Another request may have produced a fresh database while we waited.
     if (!$is_stale()) {
         return;
     }

     // Delete stale database.
     if (file_exists($temp_db)) {
         unlink($temp_db);
     }

     // Write to log file. The PRIMARY KEY plus INSERT OR IGNORE makes this an
     // atomic "take the lock" operation.
     $logHandle = database_connect($cache_path, 'pdflog');
     $logHandle->exec("CREATE TABLE IF NOT EXISTS files (file TEXT PRIMARY KEY)");
     $file_q = $logHandle->quote($log_key);
     $insert = $logHandle->exec("INSERT OR IGNORE INTO files VALUES({$file_q})");
     $logHandle = null;

     // IMPORTANT: If no insert due to unique constraint, exit. The loose
     // comparison is deliberate: a failed exec() (false) also exits here.
     if ($insert == 0) {
         return;
     }

     // XML output file not found. Create one.
     // NOTE(review): the paths are only wrapped in double quotes, not shell
     // escaped. Confirm that they can never contain quotes, "$" or backticks.
     if (!file_exists($xml_file)) {
         system(select_pdftohtml() . ' -q -enc UTF-8 -nomerge -i -hidden -xml "' . $this->pdf_full_path . '" "' . $temp_xml . '"');
     }

     clearstatcache();

     if (!file_exists($xml_file)) {
         $release_log();
         sendError('PDF to XML conversion failed.');
         return;
     }

     // Try to repair some malformed files. Both file_get_contents() and
     // preg_replace() can return a non-string on failure (the /u modifier
     // rejects invalid UTF-8), so every step is guarded.
     $xml = false;
     $string = file_get_contents($xml_file);

     if (is_string($string)) {
         // Replace characters that are not allowed in XML 1.0.
         $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}\\x{10000}-\\x{10FFFF}]+/u', ' ', $string);
     }

     if (is_string($string)) {
         // Collapse whitespace runs.
         $string = preg_replace('/\\s{2,}/ui', ' ', $string);
     }

     if (is_string($string)) {
         $string = str_ireplace('<!doctype pdf2xml system "pdf2xml.dtd">', '<!DOCTYPE pdf2xml SYSTEM "pdf2xml.dtd">', $string);
         // Load XML file into object.
         $xml = @simplexml_load_string($string);
     }

     if ($xml === false) {
         // Drop the unusable XML so that a later call converts the PDF again
         // instead of re-reading the same broken file.
         unlink($xml_file);
         $release_log();
         sendError('Invalid XML encoding.');
         return;
     }

     // The cache database is only created once the XML is known to be
     // parsable, so a failed parse never leaves an empty database behind.
     $dbHandle = database_connect($cache_path, $this->file_name);
     $dbHandle->exec("CREATE TABLE IF NOT EXISTS texts ("
         . "id INTEGER PRIMARY KEY, "
         . "top TEXT NOT NULL DEFAULT '', "
         . "left TEXT NOT NULL DEFAULT '', "
         . "height TEXT NOT NULL DEFAULT '', "
         . "width TEXT NOT NULL DEFAULT '', "
         . "text TEXT NOT NULL DEFAULT '', "
         . "page_number INTEGER NOT NULL DEFAULT '')");

     $dbHandle->beginTransaction();

     // Iterate XML page by page.
     foreach ($xml->page as $page) {
         // Get page number and size. Attributes are read per page so that
         // nothing is carried over from the previous page.
         $page_number = (string) $page['number'];
         $page_height = (float) (string) $page['height'];
         $page_width = (float) (string) $page['width'];

         // Relative coordinates cannot be computed without page dimensions.
         if ($page_height <= 0 || $page_width <= 0) {
             continue;
         }

         // Sanitize db input.
         $page_number_q = $dbHandle->quote($page_number);

         // Get info on each text element.
         foreach ($page->text as $row) {
             // Geometry as a percentage of the page size; a missing
             // attribute counts as 0.
             $row_top = 100 * round((float) (string) $row['top'] / $page_height, 3);
             $row_left = 100 * round((float) (string) $row['left'] / $page_width, 3);
             $row_height = 100 * round((float) (string) $row['height'] / $page_height, 3);
             $row_width = 100 * round((float) (string) $row['width'] / $page_width, 3);

             // Text content without inline markup such as <b> or <i>.
             $row_text = strip_tags($row->asXML());

             // Sanitize db input.
             $row_top_q = $dbHandle->quote($row_top);
             $row_left_q = $dbHandle->quote($row_left);
             $row_height_q = $dbHandle->quote($row_height);
             $row_width_q = $dbHandle->quote($row_width);
             $row_q = $dbHandle->quote($row_text);

             $dbHandle->exec("INSERT INTO texts (top,left,height,width,text,page_number) " . "VALUES({$row_top_q}, {$row_left_q}, {$row_height_q}, {$row_width_q}, {$row_q}, {$page_number_q})");
         }
     }

     $dbHandle->commit();
     $dbHandle->exec("CREATE INDEX IF NOT EXISTS ind_pages ON texts(page_number)");
     $dbHandle = null;

     // We are done, delete from log.
     $release_log();

     // Delete XML file.
     unlink($xml_file);
 }
Ejemplo n.º 2
0
    exec(select_pdftotext() . ' test.pdf "' . $temp_dir . DIRECTORY_SEPARATOR . 'test.txt"');
    if (file_exists($temp_dir . DIRECTORY_SEPARATOR . 'test.txt')) {
        unlink($temp_dir . DIRECTORY_SEPARATOR . 'test.txt');
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdfinfo') {
    exec(select_pdfinfo() . ' test.pdf', $output);
    if (!empty($output)) {
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdftohtml') {
    exec(select_pdftohtml() . ' -q -noframes -enc UTF-8 -nomerge -c -xml test.pdf "' . $temp_dir . DIRECTORY_SEPARATOR . 'test"');
    if (file_exists($temp_dir . DIRECTORY_SEPARATOR . 'test.xml')) {
        unlink($temp_dir . DIRECTORY_SEPARATOR . 'test.xml');
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'ghostscript') {
    exec(select_ghostscript() . ' -sDEVICE=png16m -r15 -dTextAlphaBits=1 -dGraphicsAlphaBits=1 -dFirstPage=1 -dLastPage=1 -o "' . $temp_dir . DIRECTORY_SEPARATOR . 'test.png" test.pdf');
    if (file_exists($temp_dir . DIRECTORY_SEPARATOR . 'test.png')) {
        unlink($temp_dir . DIRECTORY_SEPARATOR . 'test.png');
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdftk') {
Ejemplo n.º 3
0
    $file = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'library' . DIRECTORY_SEPARATOR . $file_name;
    if (!file_exists($file)) {
        die('Error! PDF does not exist!');
    }
} else {
    die('Error! No PDF provided!');
}
$page = '';
if (!empty($_GET['page'])) {
    $page = intval($_GET['page']);
} else {
    die('Error! No page number provided!');
}
$temp_xml = $temp_dir . DIRECTORY_SEPARATOR . $file_name;
if (!file_exists($temp_xml . $page . '.xml') || filemtime($temp_xml . $page . '.xml') < filemtime($file)) {
    system(select_pdftohtml() . ' -q -noframes -enc UTF-8 -nomerge -c -i -xml -f ' . $page . ' -l ' . $page . ' "' . $file . '" "' . $temp_xml . $page . '"');
}
if (file_exists($temp_xml . $page . '.xml')) {
    $string = file_get_contents($temp_xml . $page . '.xml');
    $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
    $string = preg_replace('/\\s{2,}/ui', ' ', $string);
    $string = str_ireplace('<!doctype pdf2xml system "pdf2xml.dtd">', '<!DOCTYPE pdf2xml SYSTEM "pdf2xml.dtd">', $string);
    $xml = @simplexml_load_string($string);
    if (!$xml) {
        die('{"Error":"Invalid XML encoding!"}');
    }
    foreach ($xml->page->attributes() as $a => $b) {
        if ($a == 'height') {
            $page_height = $b;
        }
        if ($a == 'width') {
Ejemplo n.º 4
0
    exec(select_pdftotext() . ' -enc UTF-8 test.pdf "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.txt"');
    if (file_exists(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.txt')) {
        unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.txt');
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdfinfo') {
    exec(select_pdfinfo() . ' test.pdf', $output);
    if (!empty($output)) {
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdftohtml') {
    exec(select_pdftohtml() . ' -q -noframes -enc UTF-8 -nomerge -c -xml test.pdf "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test"');
    if (file_exists(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.xml')) {
        unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.xml');
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdfdetach') {
    exec(select_pdfdetach() . ' -saveall -o "' . IL_TEMP_PATH . '" test.pdf');
    if (is_readable(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.odt') && filesize(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.odt') > 0) {
        unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.odt');
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'ghostscript') {