Example #1
0
/**
 * Files a scanned PDF into the archive tree and registers it in the database.
 *
 * Steps, in order:
 *  1. Make sure a per-day directory exists below Config::GetPath("dest").
 *  2. Pick a free name of the form "HH.MM.SS_document<N>.pdf" in that directory.
 *  3. Connect to the database, move $sFile to that name and (best effort)
 *     apply the configured mode, group and owner.
 *  4. Insert a row into `documents`, store the lower-cased text of all pages
 *     in `rawtext`, and record the page count.
 *  5. Ask $objDM for the document's original date and store it when one is found.
 *  6. Score every category's keywords against the text and assign the
 *     category with the most hits.
 *
 * Failures are reported with printf() and signalled by returning FALSE. Nothing
 * is rolled back: a failure after the move leaves the file (and any rows
 * already written) in place.
 *
 * @param string $sFile  Path of the PDF to move into the archive.
 * @param array  $aPages Page numbers; for each number N the text is read from
 *                       "page%03d.txt" inside Config::GetPath("tmp").
 * @param object $objDM  Object providing GuessOriginalDate($sText). Its result is
 *                       passed to FROM_UNIXTIME(); FALSE or 0 means "no date found".
 *
 * @return bool TRUE on success (also when the category list cannot be read, in
 *              which case the document stays unclassified), FALSE on failure.
 */
function AddDocument($sFile, $aPages, $objDM)
{
    // We need a UNIQUE name and directory for this file. Both are derived from
    // a single clock reading so they cannot end up on different sides of midnight.
    $iNow = time();
    $sDestDir = Config::GetPath("dest") . strftime("%F/", $iNow);
    $sBaseName = strftime("%H.%M.%S_document", $iNow);

    // Create directory (if not already there)
    if (!is_dir($sDestDir)) {
        if (!mkdir($sDestDir, Config::GetPermission("dirmask"), false)) {
            printf("Failed to create directory \"%s\"\n", $sDestDir);
            return FALSE;
        }
        // Adjust the rights. Best effort: warnings are silenced because
        // chgrp/chown commonly fail when not running as a privileged user.
        $iOld = error_reporting(E_ERROR);
        chmod($sDestDir, Config::GetPermission("dirmask"));
        chgrp($sDestDir, Config::GetPermission("group"));
        chown($sDestDir, Config::GetPermission("user"));
        error_reporting($iOld);
    }

    // Make sure we don't collide with existing filename
    // TODO: NOT THREAD SAFE! (another process may claim the name between
    // this check and the rename below)
    $i = 1;
    while (file_exists($sDestDir . $sBaseName . $i . ".pdf")) {
        $i++;
    }
    // Final destination is...
    $sDest = $sDestDir . $sBaseName . $i . ".pdf";

    // Connect before touching the file, so a database outage leaves the
    // source file where it was.
    $db = mysql_pconnect(Config::getDB("host"), Config::getDB("username"), Config::getDB("password"));
    if (!$db) {
        printf("Cannot connect to database\n");
        return FALSE;
    }
    if (!mysql_select_db(Config::getDB("database"), $db)) {
        printf("Cannot open scanner database\n");
        return FALSE;
    }

    // Time to move the file AND add that into the database
    if (!rename($sFile, $sDest)) {
        printf("Couldn't move \"%s\" into \"%s\"!\n", $sFile, $sDest);
        return FALSE;
    }
    // Adjust the rights (best effort, see above)
    $iOld = error_reporting(E_ERROR);
    chmod($sDest, Config::GetPermission("filemask"));
    chgrp($sDest, Config::GetPermission("group"));
    chown($sDest, Config::GetPermission("user"));
    error_reporting($iOld);

    // The path is escaped like every other string sent to the server; a quote
    // in the configured destination would otherwise break the statement.
    $SQL = sprintf("INSERT INTO documents (filename) VALUES ('%s')", mysql_real_escape_string($sDest, $db));
    if (mysql_query($SQL, $db) === FALSE) {
        printf("Unable to add document to database: %s\n", mysql_error($db));
        return FALSE;
    }
    $iID = mysql_insert_id($db);

    // Finally, using the ID from the document, we add ALL the pages to it.
    // Pages are lower-cased and joined with a unique splitter string that is
    // stored next to the text, so the page boundaries can be recovered later.
    $sAllData = "";
    $sSplitter = uniqid(Config::GetSplitter(), true);
    $iPages = 0;
    foreach ($aPages as $iPage) {
        $sPageFile = Config::GetPath("tmp") . sprintf("page%03d.txt", $iPage);
        $sContent = file_get_contents($sPageFile);
        if ($sContent === FALSE) {
            printf("Unable to load contents of \"%s\" into memory\n", $sPageFile);
            return FALSE;
        }
        $sAllData .= strtolower($sContent) . $sSplitter;
        $iPages++;
    }

    // Save this data
    $SQL = sprintf(
        "INSERT INTO rawtext VALUES (%d, '%s', '%s')",
        $iID,
        mysql_real_escape_string($sAllData, $db),
        mysql_real_escape_string($sSplitter, $db)
    );
    if (mysql_query($SQL, $db) === FALSE) {
        printf("Failed to insert data document %d: %s\n", $iID, mysql_error($db));
        return FALSE;
    }

    // Also update pagecount
    $SQL = sprintf("UPDATE documents SET pagecount = %d WHERE id = %d", $iPages, $iID);
    if (mysql_query($SQL, $db) === FALSE) {
        printf("Failed to update pagecount for %d: %s\n", $iID, mysql_error($db));
        return FALSE;
    }

    // Reformat the data: page boundaries become plain newlines for analysis
    $sAllData = str_replace($sSplitter, "\n", $sAllData);

    // Now for some magic,
    // we try and guess the original date of the data
    $iDate = $objDM->GuessOriginalDate($sAllData);
    if ($iDate !== FALSE && $iDate != 0) {
        $SQL = sprintf("UPDATE documents SET dated = FROM_UNIXTIME(%d) WHERE id = %d", $iDate, $iID);
        $res = mysql_query($SQL, $db);
        if ($res === FALSE) {
            printf("ERROR! Failed to update document date: %s\n", mysql_error($db));
            return FALSE;
        }
    }

    // Finally, using the complete contents, we try and apply a category
    $res = mysql_query("SELECT * FROM categories", $db);
    if ($res === FALSE) {
        printf("WARNING: Was unable to get categories, defaults to unclassified\n");
        return TRUE;
    }
    $aResult = array();
    while (($aRes = mysql_fetch_array($res)) !== FALSE) {
        if (trim($aRes["keywords"]) == "") {
            continue;
        }
        // Keywords are separated by whitespace and/or commas; a phrase in
        // single or double quotes is kept together as one keyword.
        $aWords = preg_split("/[\\s,]*\\\"([^\\\"]+)\\\"[\\s,]*|" . "[\\s,]*'([^']+)'[\\s,]*|" . "[\\s,]+/", $aRes["keywords"], 0, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // Count the occurrences of the keywords in the document (case insensitive)
        $iHits = 0;
        foreach ($aWords as $sWord) {
            // A leading '-' marks a keyword that must NOT appear
            $bNegative = false;
            if ($sWord[0] == '-') {
                $bNegative = true;
                $sWord = substr($sWord, 1);
            }
            // A bare "-" leaves nothing to search for (substr() yields "" or
            // FALSE depending on the PHP version); substr_count() rejects an
            // empty needle, so such an entry is ignored.
            if ((string) $sWord === "") {
                continue;
            }
            // The document text was lower-cased on load, so the keyword has to
            // be lower-cased as well for the match to be case insensitive.
            $iHit = substr_count($sAllData, strtolower($sWord));
            // If _ANY_ keyword fails, this isn't considered a hit at all:
            // a required keyword that is missing, or a forbidden one that is present.
            if (($iHit > 0) == $bNegative) {
                $iHits = 0;
                break;
            }
            $iHits += $iHit;
        }
        if ($iHits != 0) {
            array_push($aResult, array("id" => $aRes["id"], "hits" => $iHits));
        }
    }
    mysql_free_result($res);

    if (!empty($aResult)) {
        // Pick the category with the most hits; on a tie the first one wins.
        // Every entry has hits > 0, so $aChosen is always set after the loop.
        $iHighest = 0;
        $aChosen = FALSE;
        foreach ($aResult as $aEntry) {
            if ($aEntry["hits"] > $iHighest) {
                $iHighest = $aEntry["hits"];
                $aChosen = $aEntry;
            }
        }
        $SQL = sprintf("UPDATE documents SET category = %d WHERE id = %d", $aChosen["id"], $iID);
        $res = mysql_query($SQL, $db);
        if ($res === FALSE) {
            printf("ERROR! Failed to update document category: %s\n", mysql_error($db));
            return FALSE;
        }
    }
    return TRUE;
}