/**
 * Check if the uploaded file has a similar name to an existing
 * file which would then be a candidate for a revised file.
 *
 * Each candidate's original file name is compared with the uploaded
 * file's name via similar_text(). A candidate becomes the current
 * match when its similarity percentage is strictly greater than the
 * best percentage seen so far (initially
 * SUBMISSION_MIN_SIMILARITY_OF_REVISION) and _onlyNumbersDiffer()
 * returns a falsy value for the two transliterated names.
 *
 * @param $uploadedFile SubmissionFile
 * @param $submissionFiles array a list of submission files to
 *  check the uploaded file against.
 * @return integer the id of the possibly revised file or null
 *  if no matches were found.
 */
function &_checkForRevision(&$uploadedFile, &$submissionFiles) {
	// Get the file name.
	$uploadedFileName = $uploadedFile->getOriginalFileName();

	// Start with the minimal required similarity.
	$minPercentage = SUBMISSION_MIN_SIMILARITY_OF_REVISION;

	// The transcoder and the transliterated name of the uploaded file
	// are identical for every candidate, so they are created only once
	// (lazily, on the first candidate that actually gets compared).
	// NOTE(review): assumes Transcoder::trans() is stateless -- confirm.
	$transcoder = null;
	$uploadedAsciiName = null;

	// Find out whether one of the files belonging to the current
	// file stage matches the given file name.
	$possibleRevisedFileId = null;
	$matchedPercentage = 0;
	foreach ($submissionFiles as $submissionFile) { /* @var $submissionFile SubmissionFile */
		// Do not consider the uploaded file itself.
		if ($uploadedFile->getFileId() == $submissionFile->getFileId()) {
			continue;
		}

		// Do not consider files from different publication formats.
		if ($uploadedFile->getAssocType() == ASSOC_TYPE_REPRESENTATION &&
			$submissionFile->getAssocType() == ASSOC_TYPE_REPRESENTATION &&
			$uploadedFile->getAssocId() != $submissionFile->getAssocId()) {
			continue;
		}

		// Test whether the current submission file is similar
		// to the uploaded file. (Transliterate to ASCII -- the
		// similar_text function can't handle UTF-8.)
		if ($transcoder === null) {
			import('lib.pkp.classes.core.Transcoder');
			$transcoder = new Transcoder('UTF-8', 'ASCII', true);
			$uploadedAsciiName = $transcoder->trans($uploadedFileName);
		}
		$candidateAsciiName = $transcoder->trans($submissionFile->getOriginalFileName());

		// similar_text() writes the similarity (in percent) into its third argument.
		similar_text($uploadedAsciiName, $candidateAsciiName, $matchedPercentage);
		if ($matchedPercentage > $minPercentage && !$this->_onlyNumbersDiffer($uploadedAsciiName, $candidateAsciiName)) {
			// We found a file that might be a possible revision.
			$possibleRevisedFileId = $submissionFile->getFileId();

			// Reset the min percentage to this comparison's percentage
			// so that only better matches will be considered from now on.
			$minPercentage = $matchedPercentage;
		}
	}

	// Return the id of the file that we found similar.
	return $possibleRevisedFileId;
}
/**
 * Normalize a string in an unknown (non-UTF8) encoding into a valid UTF-8 sequence
 *
 * The encoding is guessed with mb_detect_encoding() when the mbstring
 * check succeeds, otherwise with an iconv-based CP-1252 heuristic, and
 * finally defaults to ISO-8859-1. Strings detected as ISO-8859-1 or
 * CP-1252 are first transcoded to HTML entities; the result is then
 * transcoded from the detected encoding to UTF-8.
 *
 * @param $str string input string
 * @return string
 */
static function utf8_normalize($str) {
	import('lib.pkp.classes.core.Transcoder');

	if (String::hasMBString()) {
		// NB: CP-1252 often segfaults; we've left it out here but it will detect as 'ISO-8859-1'
		$mb_encoding_order = 'UTF-8, UTF-7, ASCII, ISO-8859-1, EUC-JP, SJIS, eucJP-win, SJIS-win, JIS, ISO-2022-JP';
		$detected_encoding = mb_detect_encoding($str, $mb_encoding_order, false);
		if ($detected_encoding === false) {
			// mb_detect_encoding() is documented to return false when it
			// cannot detect the encoding; fall back to the same default as
			// the non-mbstring path rather than handing false to Transcoder.
			$detected_encoding = 'ISO-8859-1';
		}
	} elseif (function_exists('iconv') && strlen(iconv('CP1252', 'UTF-8', $str)) != strlen(iconv('ISO-8859-1', 'UTF-8', $str))) {
		// use iconv to detect CP-1252, assuming default ISO-8859-1:
		// the two conversions yielding different byte lengths is taken
		// as the sign that the input is CP-1252
		$detected_encoding = 'CP1252';
	} else {
		// assume ISO-8859-1, PHP default
		$detected_encoding = 'ISO-8859-1';
	}

	// transcode CP-1252/ISO-8859-1 into HTML entities; this works because CP-1252 is mapped onto ISO-8859-1
	if ('ISO-8859-1' == $detected_encoding || 'CP1252' == $detected_encoding) {
		$trans = new Transcoder('CP1252', 'HTML-ENTITIES');
		$str = $trans->trans($str);
	}

	// transcode from detected encoding to UTF-8
	$trans = new Transcoder($detected_encoding, 'UTF-8');
	$str = $trans->trans($str);

	return $str;
}