Example #1
0
/**
 * ProcessFile
 *
 * Takes a PDF, renders every page to a TIFF in the temporary directory,
 * optionally splits it into several documents, runs OCR on every content
 * page and finally hands each resulting document to AddDocument().
 *
 * Progress and error messages are printed to standard output while running.
 *
 * Side effects visible in this function:
 *  - writes pageNNN.tif, the OCR output for pageNNN and (when splitting)
 *    subdocNNN.pdf into Config::GetPath("tmp")
 *  - deletes the original file once it has been split into more than one
 *    document
 *
 * @param string $sFile  The complete filename (inc. path) to be processed
 * @param int    $iSplit Split PDF after X pages, zero means auto detect
 *                       (pages holding only the separator QR code act as
 *                       document boundaries and are dropped)
 *
 * @return int 0 on success,
 *             1 if file could not be accessed
 *             2 if unable to split file into multiple
 *             3 if OCR fails
 *             4 if unable to add document to database
 *             5 if no pages were found (also returned when every page
 *               turned out to be a separator page)
 */
function ProcessFile($sFile, $iSplit = 0)
{
    $objDM = new DataMining();

    // is_file() rather than file_exists(): a directory is not a valid input
    if (!is_file($sFile) || !is_readable($sFile)) {
        printf("Unable to access \"%s\"\n", $sFile);
        return 1;
    }

    $sTmpPath = Config::GetPath("tmp");

    printf("Processing %s...\n", basename($sFile));
    flush();

    // TODO: Use identify to see if a picture has content,
    // convert <filename> -colorspace Gray - | identify -format "%[standard-deviation.r]\n%[max.r]\n%[min.r]\n" -
    // std dev / max - min == %, if % < XX then no content (also make sure OCR concurs!)

    printf("  Pass 1: Extract PDF");
    flush();

    // Render one page at a time; the first page index the converter rejects
    // tells us how many pages there are.
    // NOTE(review): escapeshellarg() is locale dependent, make sure LC_CTYPE
    // is a UTF-8 locale if file names may contain non-ASCII characters.
    $iPageCount = 0;
    do {
        $sCmd = sprintf(
            "%s 2>&1 -density 300 -depth 8 %s %s",
            Config::GetTool("convert"),
            escapeshellarg(sprintf("%s[%d]", $sFile, $iPageCount)),
            escapeshellarg(sprintf("%s/page%03d.tif", $sTmpPath, $iPageCount))
        );
        // exec() appends to the output array, so start fresh every time
        $aOutput = array();
        exec($sCmd, $aOutput, $iResult);
        if ($iResult === 0) {
            $iPageCount++;
        }
    } while ($iResult === 0);

    printf(", Found %d pages\n", $iPageCount);
    flush();

    if ($iPageCount == 0) {
        printf("Error! No pages found, aborting\n");
        return 5;
    }

    printf("  Pass 2: Search for multiple documents");
    flush();

    $aDocuments = array();
    $aDocument = array();

    if ($iSplit == 0) {
        // This is a cool thing, if we detect the special QR code, we can split
        // the document into multiple PDFs :)
        $sSeparator = sprintf("QR-Code:%s", $objDM->GetSeparatorText());

        for ($i = 0; $i < $iPageCount; $i++) {
            $sCmd = sprintf(
                "%s 2>&1 %s",
                Config::GetTool("zbar"),
                escapeshellarg(sprintf("%s/page%03d.tif", $sTmpPath, $i))
            );
            $aResult = array();
            exec($sCmd, $aResult, $iResult);

            $bSeparator = false;
            if ($iResult === 0) {
                // So, we found SOMETHING, lets see what exactly (we're very picky!)
                $iBarCodes = 0;
                // Reset per page so a missing summary line can never reuse
                // the total of a previous page
                $iBarTotal = 0;

                foreach ($aResult as $sResult) {
                    if ($sResult === $sSeparator) {
                        $iBarCodes++;
                    }
                    if (preg_match('/scanned ([0-9]+) barcode/', $sResult, $aCount)) {
                        $iBarTotal = intval($aCount[1]);
                    }
                }

                // A separator page holds at least one separator code and
                // nothing but separator codes
                $bSeparator = ($iBarCodes > 0 && $iBarCodes == $iBarTotal);
            }

            if ($bSeparator) {
                // Good stuff! Separator! Close the current document (if any)
                if (!empty($aDocument)) {
                    $aDocuments[] = $aDocument;
                }
                $aDocument = array();
            } else {
                // No barcode at all, or not a separator: regular content page
                $aDocument[] = $i;
            }
        }
    } else {
        // Split after every X page
        $c = 0;
        for ($i = 0; $i < $iPageCount; $i++) {
            $aDocument[] = $i;
            $c++;
            if ($c == $iSplit) {
                $c = 0;
                $aDocuments[] = $aDocument;
                $aDocument = array();
            }
        }
    }

    // Add potential straggler
    if (!empty($aDocument)) {
        $aDocuments[] = $aDocument;
    }

    // Every page was a separator page, there is nothing left to process
    if (empty($aDocuments)) {
        printf("\nError! No pages found, aborting\n");
        return 5;
    }

    $aFiles = array();
    if (count($aDocuments) > 1) {
        printf(", Found %d\n", count($aDocuments));
        flush();
        printf("  Pass 2b: Splitting...\n");
        flush();

        $iDocument = 0;
        foreach ($aDocuments as $aPages) {
            printf("    Document %d: ", $iDocument + 1);
            flush();

            $sTmpFile = sprintf("%s/subdoc%03d.pdf", $sTmpPath, $iDocument);

            // Remove leftovers from an earlier run, otherwise the existence
            // check below would succeed even though the split failed
            if (file_exists($sTmpFile)) {
                unlink($sTmpFile);
            }

            $sCmd = sprintf("%s %s cat", Config::GetTool("pdftk"), escapeshellarg($sFile));
            foreach ($aPages as $iPage) {
                // Internal page numbers are zero based, pdftk counts from one
                $sCmd .= " " . (intval($iPage) + 1);
            }
            $sCmd .= sprintf(" output %s", escapeshellarg($sTmpFile));
            exec($sCmd);

            if (!file_exists($sTmpFile)) {
                printf("Unable to split document (\"%s\")\n", $sCmd);
                return 2;
            }

            $aFiles[$sTmpFile] = $aPages;
            printf("OK\n");
            $iDocument++;
        }

        // We can now delete the original (since we split it)
        unlink($sFile);
    } else {
        // Use original, since there isn't any multiples
        $aFiles = array($sFile => $aDocuments[0]);
        // Add linebreak so it looks nice :)
        printf("\n");
        flush();
    }

    // Now, OCR the pages and get going
    printf("  Pass 3: OCR the pages\n");
    flush();

    foreach ($aFiles as $aPages) {
        foreach ($aPages as $iPage) {
            $sPageBase = sprintf("%s/page%03d", $sTmpPath, $iPage);
            $sCmd = sprintf(
                "%s 2>&1 %s %s -psm 1 -l %s",
                Config::GetTool("ocr"),
                escapeshellarg($sPageBase . ".tif"),
                escapeshellarg($sPageBase),
                Config::GetLanguage()
            );
            printf("    Page %d: ", $iPage + 1);
            flush();

            // Fresh array so only the output of this page is shown
            $aResult = array();
            exec($sCmd, $aResult, $iResult);
            print_r($aResult);
            if ($iResult !== 0) {
                printf("Failed OCR: \"%s\"\n", $sCmd);
                return 3;
            }
            print "OK\n";
            flush();
        }
    }

    printf("  Pass 4: Adding document(s) to database\n");
    flush();

    $iDocument = 1;
    foreach ($aFiles as $sDocFile => $aPages) {
        printf("    Document %d: ", $iDocument++);
        flush();
        if (!AddDocument($sDocFile, $aPages, $objDM)) {
            print "Failed\n";
            return 4;
        }
        print "OK\n";
        flush();
    }

    print "Done!\n";
    flush();
    return 0;
}