Пример #1
0
/**
 * Re-extracts the text of every attachment whose `text` column is NULL.
 *
 * Each matching file is converted with DocumentToText and the extracted text,
 * together with its MD5 sum, is written back to the `attachment` table.
 * Progress messages and a final success/fail tally are printed.
 *
 * The legacy mysql_* functions are called without a link identifier, so the
 * most recently opened connection is used.
 *
 * @return void
 */
function rebuild_old_docs()
{
    include_once './lib/DocumentToText.php';

    $result = mysql_query('SELECT * FROM `attachment` WHERE `text` IS NULL');
    if (!$result) {
        /* Without a result set there is nothing to iterate over. */
        print 'DB error: ' . mysql_error() . "\n";
        return;
    }

    $countOK = 0;
    $countError = 0;
    while ($attachment = mysql_fetch_object($result)) {
        /* directory_name is concatenated as-is, exactly as before. */
        $path = 'attachments/' . $attachment->directory_name . $attachment->stored_filename;

        $doc2txt = new DocumentToText();
        $doc2txt->convert($path, $doc2txt->getDocumentType($path));
        if ($doc2txt->isError()) {
            $countError++;
            print 'Error while converting ' . $attachment->stored_filename . " file\n";
            continue;
        }

        $extractedText = $doc2txt->getString();

        /* Escape with the connection-aware function (addslashes() does not
         * know the connection character set) and force the ID to an integer.
         */
        $sql = 'UPDATE `attachment` SET `text` = \'' . mysql_real_escape_string($extractedText)
            . '\', `md5_sum_text` = \'' . md5($extractedText)
            . '\'  WHERE `attachment_id` = ' . (int) $attachment->attachment_id;

        if (!mysql_query($sql)) {
            $countError++;
            print 'DB error: ' . mysql_error() . "\n";
            continue;
        }

        /* Only report success once the row has really been updated. */
        $countOK++;
        print 'File ' . $attachment->stored_filename . " reindexed.\n";
    }
    mysql_free_result($result);

    print 'Success/Fail counters:' . $countOK . '/' . $countError;
}
Пример #2
0
 /**
  * Creates an attachment to the specified data item. This will also pass
  * the attachment along for text extraction and indexing if requested. This
  * method supports the above createFrom*() methods; however it may also be
  * called directly.
  *
  * On failure the error / duplicate state is recorded in this object's
  * properties (_isError, _error, _duplicatesOccurred, _isTextExtractionError,
  * _textExtractionError). On success _extractedText, _attachmentID,
  * _newFilePath and _containingDirectory are populated.
  *
  * @param flag Data Item type flag.
  * @param integer Data Item ID.
  * @param boolean Is this a profile image attachment?
  * @param boolean Attempt to extract, store, and index the attachment's
  *                text?
  * @param string Attachment title, or boolean false to create a title
  *               automatically from the attachment's filename.
  * @param string The filename an attachment originally before any renaming.
  * @param string The temporary location where the file is currently stored
  *               on the system where CATS is located.
  * @param string MIME content type (or '' if unknown).
  * @param string File's contents if a file is being created from text /
  *               contents. Specify false if not creating a file by its
  *               contents.
  * @param boolean Does this file actually exist? If true, the file will be
  *                moved to the created attachment directory automatically.
  *                If false, the caller is responsible. This has no effect
  *                if $fileContents is not false.
  * @return boolean Was the attachment created successfully? False is also
  *                 returned when a duplicate is detected. Nothing (null) is
  *                 returned if the CREATE_ATTACHMENT_FINISHED hook evaluates
  *                 to a false value.
  */
 public function createGeneric($dataItemType, $dataItemID, $isProfileImage, $extractText, $title, $originalFilename, $tempFilename, $contentType, $fileContents, $fileExists)
 {
     /* NOTE: local variable names are kept stable on purpose; the hook code
      * evaluated at the end of this method runs in this scope.
      */

     /* Make a 'safe' filename with only standard ASCII characters. */
     $storedFilename = FileUtility::makeSafeFilename($originalFilename);

     /* Create an attachment title: honour the caller's title when one was
      * supplied, otherwise derive it from the original filename.
      */
     if ($title === false || $title === null || $title === '') {
         $attachmentTitle = FileUtility::getFileWithoutExtension($originalFilename);
     } else {
         $attachmentTitle = $title;
     }

     /* Make attachment searchable. */
     $extractedText = '';
     if ($extractText) {
         $documentToText = new DocumentToText();
         $documentType = $documentToText->getDocumentType($storedFilename, $contentType);

         if ($fileContents !== false) {
             /* If we're creating a file from text contents, we can skip
              * extracting because we already know the text contents. For
              * non-text contents no extraction is attempted at all.
              */
             if ($documentType == DOCUMENT_TYPE_TEXT) {
                 $extractedText = $fileContents;
             }
         } elseif ($fileExists) {
             /* Text can only be extracted from a file that exists. */
             $documentToText->convert($tempFilename, $documentType);
             if ($documentToText->isError()) {
                 $this->_isTextExtractionError = true;
                 $this->_textExtractionError = $documentToText->getError();
             } else {
                 $extractedText = $documentToText->getString();
             }

             /* If we are adding a bulk resume, and parsing fails, consider it
              * a fatal error.
              */
             if ($dataItemType == DATA_ITEM_BULKRESUME && $this->_isTextExtractionError) {
                 $this->_isError = true;
                 $this->_error = $this->_textExtractionError;
                 return false;
             }
         }
     }

     $attachments = new Attachments($this->_siteID);

     /* We can only check for duplicates right now if the file actually
      * exists. We'll do it again later below. The strict comparison keeps
      * this branch consistent with the "$fileContents !== false" checks
      * elsewhere ('' and '0' are valid contents, not "no contents").
      */
     if ($fileExists && $fileContents === false) {
         /* We store file size in KB, rounded to nearest KB. */
         $fileSize = round(@filesize($tempFilename) / 1024);

         /* The md5sum is stored for duplicate checking. */
         $md5sum = @md5_file($tempFilename);

         /* Check for duplicates. */
         $duplicates = $attachments->getMatching($dataItemType, $dataItemID, $fileSize, $md5sum, $extractedText);

         /* Duplicate attachments are never added, but this is not a fatal
          * error. We will set a property to notify the caller that a
          * duplicate occurred.
          */
         if (!empty($duplicates)) {
             $this->_duplicatesOccurred = true;
             if (file_exists($tempFilename)) {
                 unlink($tempFilename);
             }
             return false;
         }
     } else {
         $fileSize = 0;
         $md5sum = '';
     }

     /* Add the attachment record. At this point, there is no actual
      * associated directory / full file path.
      */
     $attachmentID = $attachments->add($dataItemType, $dataItemID, $attachmentTitle, $originalFilename, $storedFilename, $contentType, $extractText, $extractedText, $isProfileImage, '', $fileSize, $md5sum);

     /* Were we successful? */
     if (!$attachmentID) {
         $this->_isError = true;
         $this->_error = 'Error adding attachment to the database.';
         @unlink($tempFilename);
         return false;
     }

     /* Store the extracted text and attachment ID in properties for later
      * access.
      */
     $this->_extractedText = $extractedText;
     $this->_attachmentID = $attachmentID;

     /* Create the attachment directory. */
     $uniqueDirectory = $this->_createDirectory($attachmentID, $storedFilename);
     if (!$uniqueDirectory) {
         $attachments->delete($attachmentID, false);
         return false;
     }

     /* Create the full path name to the file. */
     $newFileFullPath = $uniqueDirectory . $storedFilename;

     /* Are we creating a new file from file contents, or are we moving a
      * temporary file?
      */
     if ($fileContents !== false) {
         /* file_put_contents() returns the byte count, which is 0 for empty
          * contents; only a strict false signals failure.
          */
         $status = @file_put_contents($newFileFullPath, $fileContents);
         if ($status === false) {
             $this->_isError = true;
             $this->_error = sprintf('Cannot create file %s.', $newFileFullPath);
             $attachments->delete($attachmentID, false);
             /* Remove any partially written file, then the directory
              * (unlink() cannot remove directories).
              */
             @unlink($newFileFullPath);
             @rmdir($uniqueDirectory);
             return false;
         }

         /* We store file size in KB, rounded to nearest KB. */
         $fileSize = round(@filesize($newFileFullPath) / 1024);

         /* The md5sum is stored for duplicate checking. */
         $md5sum = @md5_file($newFileFullPath);

         /* Check for duplicates. */
         $duplicates = $attachments->getMatching($dataItemType, $dataItemID, $fileSize, $md5sum, $extractedText);

         /* Duplicate attachments are never added, but this is not a fatal
          * error. We will set a property to notify the caller that a
          * duplicate occurred.
          */
         if (!empty($duplicates)) {
             $this->_duplicatesOccurred = true;
             $attachments->delete($attachmentID, false);
             @unlink($newFileFullPath);
             @rmdir($uniqueDirectory);
             return false;
         }
     } elseif ($fileExists) {
         /* Copy the temp file to the new path. */
         if (!@copy($tempFilename, $newFileFullPath)) {
             $this->_isError = true;
             $this->_error = sprintf('Cannot copy temporary file %s to %s.', $tempFilename, $newFileFullPath);
             $attachments->delete($attachmentID, false);
             @unlink($newFileFullPath);
             @rmdir($uniqueDirectory);
             return false;
         }

         /* Try to remove the temp file; if it fails it doesn't matter. */
         @unlink($tempFilename);
     }

     /* Store path to the file (inside the attachments directory) in this
      * object.
      */
     $this->_newFilePath = $newFileFullPath;
     $this->_containingDirectory = $uniqueDirectory;

     /* Update the database with the new directory name. */
     $attachments->setDirectoryName($attachmentID, str_replace('./attachments/', '', $uniqueDirectory));

     /* NOTE(review): the hook source is executed with eval(); it must only
      * ever come from trusted code.
      */
     if (!eval(Hooks::get('CREATE_ATTACHMENT_FINISHED'))) {
         return;
     }

     return true;
 }
Пример #3
0
 /**
  * Converts the HTML posted in $_POST['resumeText'] to plain text, trims it
  * down to the resume itself, stores the result in the session and echoes
  * the ID under which it was stored.
  *
  * The resume is taken to start on the line after the first line containing
  * 'RESUME' or 'CV' together with '^BACK_TO_TOP', and to end on the line
  * before the last "back to top" marker found after that header. When no
  * header is found the text starts at the first line; when no footer marker
  * is found it runs to the last line.
  *
  * @return void
  */
 private function storeMonsterResumeText()
 {
     $this->_authenticate();
     if (!isset($_POST['resumeText'])) {
         $this->fatal('No resume.');
     }
     $resumeText = $_POST['resumeText'];

     /* The toolbar inputs the BODY of the monster page.  First, we convert
      * the HTML of the BODY into text with html2text...
      */
     $temporaryFile = FileUtility::makeRandomTemporaryFilePath() . '.html';
     if (file_put_contents($temporaryFile, $resumeText) === false) {
         $this->fatal('Failed to save data for parsing.');
     }
     $documentToText = new DocumentToText();
     $documentType = $documentToText->getDocumentType($temporaryFile, 'text/html');
     $documentToText->convert($temporaryFile, $documentType);
     if ($documentToText->isError()) {
         $this->_isTextExtractionError = true;
         $this->_textExtractionError = $documentToText->getError();
         $parsedText = '';
     } else {
         $parsedText = $documentToText->getString();
     }
     @unlink($temporaryFile);

     /* Now, we have to determine where the resume begins and ends and cut out the
      * top and bottom of the resume...
      */
     $parsedTextArray = explode("\n", $parsedText);
     $endOfText = count($parsedTextArray) - 1;
     $firstLine = 0;
     $lastLine = $endOfText;
     foreach ($parsedTextArray as $line => $data) {
         $hasTopMarker = strpos($data, '^BACK_TO_TOP') !== false;
         $hasTitle = strpos($data, 'RESUME') !== false || strpos($data, 'CV') !== false;

         if ($hasTitle && $hasTopMarker && $firstLine == 0) {
             /* Find first line. The header line also carries a "back to top"
              * marker, so it must not be treated as the footer as well, and
              * any marker seen above it cannot be the end of the resume.
              */
             $firstLine = $line + 1;
             $lastLine = $endOfText;
         } elseif ($hasTopMarker || strpos($data, 'Back_to_top') !== false || strpos($data, 'Back to top') !== false) {
             /* Find last line (the last marker wins). */
             $lastLine = $line - 1;
         }

         /* TODO:  Look for more keywords present at the bottom of this page
          * in case Back_top_top goes away
          */

         /* Remove the back to top links from the resume to prevent indexing.
          * The underscore form is the one detected above; the spaced form is
          * still removed as before.
          */
         if ($hasTopMarker) {
             $data = str_replace(array('^BACK_TO_TOP', '^BACK TO TOP'), '', $data);
         }

         /* Convert bullet points into - symbols. */
         $data = str_replace('%u2022', '-', $data);
         $parsedTextArray[$line] = $data;
     }

     /* A negative length would make array_slice() count from the end of the
      * array instead of returning an empty slice.
      */
     $length = max(0, $lastLine - $firstLine + 1);
     $parsedTextArray = array_slice($parsedTextArray, $firstLine, $length);
     $parsedText = implode("\n", $parsedTextArray);

     /* Remember the output in the session and return to the toolbar
      * the ID number of the data.
      */
     $storedID = $_SESSION['CATS']->storeData($parsedText);
     echo $storedID;
     flush();
 }
Пример #4
0
/* Re-index resume attachments whose stored text is empty or NULL and print
 * the number of attachments that were updated.
 *
 * Once the INSTALL_BLOCK file exists, the script refuses to run for users
 * below site-administrator level or when the 'asp' module is present.
 */
if (file_exists('INSTALL_BLOCK') && ($_SESSION['CATS']->getAccessLevel() < ACCESS_LEVEL_SA || ModuleUtility::moduleExists('asp'))) {
    die('No permision.');
}

$db = DatabaseConnection::getInstance();

/* Start from zero so that "nothing reindexed" prints 0 rather than an
 * undefined variable.
 */
$reindexed = 0;

/* AND binds tighter than OR in SQL; the parentheses make the resume = 1
 * restriction apply to both the empty-string and the NULL case.
 */
$rs = $db->getAllAssoc('SELECT site_id, attachment_id, directory_name, stored_filename FROM attachment WHERE (text = "" OR isnull(text)) AND resume = 1');

foreach ($rs as $data) {
    /* Attempt to reindex file. */
    $storedFilename = './attachments/' . $data['directory_name'] . '/' . $data['stored_filename'];
    $documentToText = new DocumentToText();
    $documentType = $documentToText->getDocumentType($storedFilename);

    /* Plain-text documents need no conversion: their contents are the text.
     * Only read the file when it is such a document.
     */
    $extractedText = false;
    if ($documentType == DOCUMENT_TYPE_TEXT) {
        $extractedText = @file_get_contents($storedFilename);
    }

    if ($extractedText === false) {
        /* Can't extract text from a file that doesn't exist. */
        if (!file_exists($storedFilename)) {
            continue;
        }

        $documentToText->convert($storedFilename, $documentType);
        if ($documentToText->isError()) {
            continue;
        }
        $extractedText = $documentToText->getString();
    }

    /* Store the text for converted and plain-text files alike. */
    $db->query('UPDATE attachment SET text = ' . $db->makeQueryString($extractedText) . ' WHERE attachment_id = ' . (int) $data['attachment_id']);
    $reindexed++;
}

echo $reindexed;