/**
 * Files a document: moves it into a dated destination directory, registers it
 * in the database, stores the text of its pages, and tries to derive a date
 * and a category from that text.
 *
 * Steps, in order:
 *   1. Ensure the per-day directory below Config::GetPath("dest") exists.
 *   2. Pick a non-colliding "<time>_document<N>.pdf" name (not race-free).
 *   3. Connect to MySQL, move $sFile to the destination, adjust its rights.
 *   4. Insert a row into `documents`, then the lower-cased page texts joined
 *      by a unique splitter token into `rawtext`, then update the page count.
 *   5. Ask $objDM for a date guess and store it when one is returned.
 *   6. Score every row of `categories` by keyword hits and store the best one.
 *
 * Note: the file is moved before the database rows are written, so a database
 * failure after the move leaves the file at its destination.
 *
 * @param string $sFile  Path of the file to move into the destination tree.
 * @param array  $aPages Page numbers; each is read from "pageNNN.txt" below
 *                       Config::GetPath("tmp").
 * @param object $objDM  Object providing GuessOriginalDate($text); a result of
 *                       FALSE or 0 is treated as "no date found".
 *
 * @return bool TRUE on success (also when the categories query fails and the
 *              document stays unclassified), FALSE on any other failure. Error
 *              details are printed to standard output.
 */
function AddDocument($sFile, $aPages, $objDM)
{
    // We need a UNIQUE name and directory for this file
    $sDir = strftime("%F/");
    $sDest = strftime("%H.%M.%S_document");
    $sDestDir = Config::GetPath("dest") . $sDir;

    // Create directory (if not already there)
    if (!is_dir($sDestDir)) {
        if (!mkdir($sDestDir, Config::GetPermission("dirmask"), false)) {
            printf("Failed to create directory \"%s\"\n", $sDestDir);
            return FALSE;
        }

        // Adjust the rights; failures here are deliberately best-effort, so
        // warnings are silenced while the calls run.
        $iOld = error_reporting(E_ERROR);
        chmod($sDestDir, Config::GetPermission("dirmask"));
        chgrp($sDestDir, Config::GetPermission("group"));
        chown($sDestDir, Config::GetPermission("user"));
        error_reporting($iOld);
    }

    // Make sure we don't collide with existing filename
    // TODO: NOT THREAD SAFE!
    $i = 1;
    while (file_exists($sDestDir . $sDest . $i . ".pdf")) {
        $i++;
    }

    // Final destination is...
    $sDest = $sDestDir . $sDest . $i . ".pdf";

    // Time to talk to mysql about this whole thing :)
    $db = mysql_pconnect(Config::getDB("host"), Config::getDB("username"), Config::getDB("password"));
    if (!$db) {
        printf("Cannot connect to database\n");
        return FALSE;
    }
    if (!mysql_select_db(Config::getDB("database"), $db)) {
        printf("Cannot open scanner database\n");
        return FALSE;
    }

    // Time to move the file AND add that into the database
    if (!rename($sFile, $sDest)) {
        printf("Couldn't move \"%s\" into \"%s\"!\n", $sFile, $sDest);
        return FALSE;
    }

    // Adjust the rights (best-effort, warnings silenced)
    $iOld = error_reporting(E_ERROR);
    chmod($sDest, Config::GetPermission("filemask"));
    chgrp($sDest, Config::GetPermission("group"));
    chown($sDest, Config::GetPermission("user"));
    error_reporting($iOld);

    // The path is escaped like every other string value: a quote or backslash
    // in the configured destination would otherwise corrupt the statement.
    $SQL = sprintf(
        "INSERT INTO documents (filename) VALUES ('%s')",
        mysql_real_escape_string($sDest, $db)
    );
    if (mysql_query($SQL, $db) === FALSE) {
        printf("Unable to add document to database: %s\n", mysql_error($db));
        return FALSE;
    }
    $iID = mysql_insert_id($db);

    // Finally, using the ID from the document, we add ALL the pages to it
    $sAllData = "";
    $sSplitter = uniqid(Config::GetSplitter(), true);
    $iPages = 0;
    foreach ($aPages as $iPage) {
        $sPagePath = Config::GetPath("tmp") . sprintf("page%03d.txt", $iPage);
        $sContent = file_get_contents($sPagePath);
        if ($sContent === FALSE) {
            printf("Unable to load contents of \"%s\" into memory\n", $sPagePath);
            return FALSE;
        }

        // Pages are stored lower-cased and terminated by the splitter token
        // so that they can be told apart again later.
        $sAllData .= strtolower($sContent) . $sSplitter;
        $iPages++;
    }

    // Save this data
    $SQL = sprintf(
        "INSERT INTO rawtext VALUES (%d, '%s', '%s')",
        $iID,
        mysql_real_escape_string($sAllData, $db),
        mysql_real_escape_string($sSplitter, $db)
    );
    if (mysql_query($SQL, $db) === FALSE) {
        printf("Failed to insert data document %d: %s\n", $iID, mysql_error($db));
        return FALSE;
    }

    // Also update pagecount
    $SQL = sprintf("UPDATE documents SET pagecount = %d WHERE id = %d", $iPages, $iID);
    if (mysql_query($SQL, $db) === FALSE) {
        printf("Failed to update pagecount for %d: %s\n", $iID, mysql_error($db));
        return FALSE;
    }

    // Reformat the data: one page per line for the analysis below
    $sAllData = str_replace($sSplitter, "\n", $sAllData);

    // Now for some magic,
    // we try and guess the original date of the data
    $iDate = $objDM->GuessOriginalDate($sAllData);
    if ($iDate !== FALSE && $iDate != 0) {
        $SQL = sprintf("UPDATE documents SET dated = FROM_UNIXTIME(%d) WHERE id = %d", $iDate, $iID);
        if (mysql_query($SQL, $db) === FALSE) {
            printf("ERROR! Failed to update document date: %s\n", mysql_error($db));
            return FALSE;
        }
    }

    // Finally, using the complete contents, we try and apply a category
    $res = mysql_query("SELECT * FROM categories", $db);
    if ($res === FALSE) {
        printf("WARNING: Was unable to get categories, defaults to unclassified\n");
        return TRUE;
    }

    $aResult = array();
    while (($aRes = mysql_fetch_array($res)) !== FALSE) {
        if (trim($aRes["keywords"]) == "") {
            continue;
        }

        // Keywords are separated by whitespace and/or commas; a phrase in
        // single or double quotes is captured as one keyword.
        $aWords = preg_split(
            "/[\\s,]*\\\"([^\\\"]+)\\\"[\\s,]*|" .
            "[\\s,]*'([^']+)'[\\s,]*|" .
            "[\\s,]+/",
            $aRes["keywords"],
            0,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );

        // Count the occurrence of the keywords in the document (case insensitive)
        $iHits = 0;
        foreach ($aWords as $sWord) {
            // A leading '-' marks a keyword that must NOT occur
            $bNegative = false;
            if ($sWord[0] == '-') {
                $bNegative = true;
                $sWord = substr($sWord, 1);
            }

            // A lone "-" leaves nothing to look for; substr_count() rejects
            // an empty needle, so skip it.
            if ((string)$sWord === "") {
                continue;
            }

            // The document text was lower-cased above, so the keyword has to
            // be lower-cased too for the match to be case insensitive.
            $iHit = substr_count($sAllData, strtolower($sWord));

            // If _ANY_ keyword fails, this isn't considered a hit at all:
            // a required keyword that is absent, or a forbidden one present.
            if (($iHit > 0) == $bNegative) {
                $iHits = 0;
                break;
            }
            $iHits += $iHit;
        }

        if ($iHits != 0) {
            array_push($aResult, array("id" => $aRes["id"], "hits" => $iHits));
        }
    }

    if (!empty($aResult)) {
        // Pick the category with the most hits; the first one wins a tie.
        // Every entry has hits > 0, so $aChosen is always assigned.
        $iHighest = 0;
        $aChosen = FALSE;
        foreach ($aResult as $aEntry) {
            if ($aEntry["hits"] > $iHighest) {
                $iHighest = $aEntry["hits"];
                $aChosen = $aEntry;
            }
        }

        $SQL = sprintf("UPDATE documents SET category = %d WHERE id = %d", $aChosen["id"], $iID);
        if (mysql_query($SQL, $db) === FALSE) {
            printf("ERROR! Failed to update document category: %s\n", mysql_error($db));
            return FALSE;
        }
    }

    return TRUE;
}