/**
 * ProcessFile
 *
 * Takes a PDF, renders every page to a TIFF in the temporary directory,
 * optionally splits the PDF into several documents (either on pages that
 * carry only the separator QR code, or after a fixed number of pages),
 * runs OCR on each content page and finally hands every resulting document
 * to AddDocument().
 *
 * Side effects: writes pageNNN.tif (and OCR output) plus, when splitting,
 * subdocNNN.pdf files into the temporary directory. When the input is split
 * into more than one document, the original file is deleted.
 *
 * @param sFile  The complete filename (inc. path) to be processed
 * @param iSplit Split PDF after X pages, zero means auto detect
 *
 * @return 0 on success,
 *         1 if file could not be accessed
 *         2 if unable to split file into multiple
 *         3 if OCR fails
 *         4 if unable to add document to database
 *         5 if no pages were found (or every page was a separator page)
 */
function ProcessFile($sFile, $iSplit = 0)
{
    $objDM = new DataMining();

    if (!file_exists($sFile) || !is_readable($sFile)) {
        printf("Unable to access \"%s\"\n", $sFile);
        return 1;
    }

    printf("Processing %s...\n", basename($sFile));
    flush();

    // TODO: Use identify to see if a picture has content,
    // convert <filename> -colorspace Gray - | identify -format "%[standard-deviation.r]\n%[max.r]\n%[min.r]\n" -
    // std dev / max - min == %, if % < XX then no content (also make sure OCR concurs!)

    printf(" Pass 1: Extract PDF");
    flush();

    // File paths are passed through escapeshellarg() so that spaces and shell
    // metacharacters in them cannot break (or inject into) the command line.
    // Tool paths and the OCR language come from Config and are inserted as-is.
    // NOTE(review): escapeshellarg() is locale-sensitive for multibyte file
    // names - confirm LC_CTYPE is set appropriately where this runs.
    $sTmpDir = Config::GetPath("tmp");

    // Start by splitting the file and counting pages. Pages are extracted one
    // at a time until the converter fails, which marks the end of the PDF.
    $sConvert = Config::GetTool("convert");
    $iPageCount = 0;
    do {
        $sCmd = sprintf(
            "%s 2>&1 -density 300 -depth 8 %s %s",
            $sConvert,
            escapeshellarg(sprintf("%s[%d]", $sFile, $iPageCount)),
            escapeshellarg(sprintf("%s/page%03d.tif", $sTmpDir, $iPageCount))
        );

        // exec() appends to the output array, so start from a clean one
        $aOutput = array();
        exec($sCmd, $aOutput, $iResult);
        if ($iResult == 0) {
            $iPageCount++;
        }
    } while ($iResult == 0);

    printf(", Found %d pages\n", $iPageCount);
    flush();

    if ($iPageCount == 0) {
        printf("Error! No pages found, aborting\n");
        return 5;
    }

    printf(" Pass 2: Search for multiple documents");
    flush();

    $aDocuments = array(); // List of documents, each a list of page indexes
    $aDocument = array();  // Pages collected for the document being built

    if ($iSplit == 0) {
        // This is a cool thing, if we detect the special QR code, we can split
        // the document into multiple PDFs :)
        $sZbar = Config::GetTool("zbar");

        for ($i = 0; $i < $iPageCount; $i++) {
            $sCmd = sprintf(
                "%s 2>&1 %s",
                $sZbar,
                escapeshellarg(sprintf("%s/page%03d.tif", $sTmpDir, $i))
            );

            $aResult = array();
            exec($sCmd, $aResult, $iResult);

            $bSeparator = false;
            if ($iResult == 0) {
                // So, we found SOMETHING, lets see what exactly (we're very picky!)
                $sSeparator = sprintf("QR-Code:%s", $objDM->GetSeparatorText());
                $iBarCodes = 0; // Separator codes seen on this page
                $iBarTotal = 0; // Total reported by the scanner; reset per page
                                // so a previous page's tally can never leak in

                foreach ($aResult as $sResult) {
                    if ($sResult === $sSeparator) {
                        $iBarCodes++;
                    }
                    if (preg_match('/scanned ([0-9]+) barcode/', $sResult, $aCount)) {
                        $iBarTotal = intval($aCount[1]);
                    }
                }

                // A separator page carries at least one separator code and
                // nothing but separator codes.
                $bSeparator = ($iBarCodes > 0 && $iBarCodes === $iBarTotal);
            }

            if ($bSeparator) {
                // Good stuff! Separator! Close the current document (if any);
                // the separator page itself is dropped.
                if (!empty($aDocument)) {
                    $aDocuments[] = $aDocument;
                }
                $aDocument = array();
            } else {
                // No barcode at all, or not a separator: regular content page
                $aDocument[] = $i;
            }
        }
    } else {
        // Split after every X page
        $c = 0;
        for ($i = 0; $i < $iPageCount; $i++) {
            $aDocument[] = $i;
            $c++;
            if ($c == $iSplit) {
                $c = 0;
                $aDocuments[] = $aDocument;
                $aDocument = array();
            }
        }
    }

    // Add potential straggler
    if (!empty($aDocument)) {
        $aDocuments[] = $aDocument;
    }

    // Every page was a separator: there is nothing to OCR or store
    if (empty($aDocuments)) {
        printf("\nError! No content pages found, aborting\n");
        return 5;
    }

    $aFiles = array(); // Maps PDF filename => list of page indexes it contains
    if (count($aDocuments) > 1) {
        printf(", Found %d\n", count($aDocuments));
        flush();
        printf(" Pass 2b: Splitting...\n");
        flush();

        $sPdftk = Config::GetTool("pdftk");
        $iDocument = 0;
        foreach ($aDocuments as $aPages) {
            printf(" Document %d: ", $iDocument + 1);
            flush();

            $sCmd = sprintf("%s %s cat", $sPdftk, escapeshellarg($sFile));
            foreach ($aPages as $iPage) {
                // pdftk page numbers are 1-based, ours are 0-based
                $sCmd .= " " . (intval($iPage) + 1);
            }
            $sTmpFile = sprintf("%s/subdoc%03d.pdf", $sTmpDir, $iDocument);
            $sCmd .= sprintf(" output %s", escapeshellarg($sTmpFile));

            // Check the exit status as well as the file: a leftover file from
            // an earlier run must not hide a failed split.
            $aOutput = array();
            exec($sCmd, $aOutput, $iResult);
            if ($iResult != 0 || !file_exists($sTmpFile)) {
                printf("Unable to split document (\"%s\")\n", $sCmd);
                return 2;
            }

            $aFiles[$sTmpFile] = $aPages;
            printf("OK\n");
            $iDocument++;
        }

        // We can now delete the original (since we split it)
        unlink($sFile);
    } else {
        // Use original, since there isn't any multiples
        $aFiles = array($sFile => $aDocuments[0]);
        // Add linebreak so it looks nice :)
        printf("\n");
        flush();
    }

    // Now, OCR the pages and get going
    printf(" Pass 3: OCR the pages\n");
    flush();

    $sOcr = Config::GetTool("ocr");
    $sLanguage = Config::GetLanguage();
    foreach ($aFiles as $aPages) {
        foreach ($aPages as $iPage) {
            $sPageBase = sprintf("%s/page%03d", $sTmpDir, $iPage);
            // The language is passed as an argument to sprintf (not glued into
            // the format string) so a '%' in it cannot corrupt the command.
            $sCmd = sprintf(
                "%s 2>&1 %s %s -psm 1 -l %s",
                $sOcr,
                escapeshellarg($sPageBase . ".tif"),
                escapeshellarg($sPageBase),
                $sLanguage
            );

            printf(" Page %d: ", $iPage + 1);
            flush();

            // Fresh array per page so only this page's OCR output is shown
            $aResult = array();
            exec($sCmd, $aResult, $iResult);
            print_r($aResult);
            if ($iResult != 0) {
                printf("Failed OCR: \"%s\"\n", $sCmd);
                return 3;
            }
            print "OK\n";
            flush();
        }
    }

    printf(" Pass 4: Adding document(s) to database\n");
    flush();

    $iDocNo = 1;
    foreach ($aFiles as $sDocFile => $aPages) {
        printf(" Document %d: ", $iDocNo++);
        flush();
        if (!AddDocument($sDocFile, $aPages, $objDM)) {
            print "Failed\n";
            return 4;
        }
        print "OK\n";
        flush();
    }

    print "Done!\n";
    flush();
    return 0;
}