Example #1
0
/**
 * Convert a string into valid UTF-8. This function is quite slow.
 *
 * When invalid byte subsequences are encountered, they will be replaced with
 * U+FFFD, the Unicode replacement character. Strings which already pass
 * phutil_is_utf8() are returned unchanged.
 *
 * NOTE: A NUL byte is not part of the accepted single-byte range (0x01-0x7F),
 * so in a string that needs repair it is replaced with U+FFFD as well.
 *
 * NOTE: Overlong 3-byte and 4-byte representations incorrectly survive
 * this function.
 *
 * @param   string  String to convert to valid UTF-8.
 * @return  string  String with invalid UTF-8 byte subsequences replaced with
 *                  U+FFFD.
 */
function phutil_utf8ize($string)
{
    if (phutil_is_utf8($string)) {
        return $string;
    }
    // There is no function to do this in iconv, mbstring or ICU, so do it
    // (very very slowly) in pure PHP.
    // TODO: Provide an optional fast C implementation ala fb_utf8ize() if this
    // ever shows up in profiles?
    $result = array();
    // The byte ranges are written as escape sequences rather than as raw
    // bytes so that the pattern survives any re-encoding of this source file.
    // Group 1 matches one structurally valid UTF-8 sequence (1 to 4 bytes);
    // group 2 matches any other single byte. The pattern deliberately does NOT
    // use the "u" modifier: it must operate on raw bytes.
    $regex = '/([\x01-\x7F]'
        . '|[\xC2-\xDF][\x80-\xBF]'
        . '|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]'
        . '|[\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF])'
        . '|(.)/';
    // Unicode replacement character, U+FFFD, encoded as UTF-8.
    $replacement = "\xEF\xBF\xBD";
    $offset = 0;
    $matches = null;
    while (preg_match($regex, $string, $matches, 0, $offset)) {
        // preg_match() omits trailing groups which did not participate, so
        // group 2 is only set when the fallback "(.)" branch matched.
        if (!isset($matches[2])) {
            $result[] = $matches[1];
        } else {
            $result[] = $replacement;
        }
        $offset += strlen($matches[0]);
    }
    return implode('', $result);
}
 /**
  * Require that a string is valid UTF-8 before it is used to build a query.
  *
  * The string is accepted when phutil_is_utf8() reports it as valid;
  * otherwise query construction is aborted with an exception. Binary data
  * should be escaped with the `%B` conversion instead of being passed here.
  *
  * NOTE(review): only phutil_is_utf8() is consulted. Whether characters
  * outside of the BMP are rejected depends on that function, which is not
  * visible from this method -- confirm.
  *
  * @param   string  String about to be embedded in a query.
  * @return  void
  * @throws  AphrontCharacterSetQueryException  If the string is not valid
  *                                             UTF-8.
  */
 protected function validateUTF8String($string)
 {
     $is_valid = phutil_is_utf8($string);
     if (!$is_valid) {
         $message = pht('Attempting to construct a query using a non-utf8 string when ' . 'utf8 is expected. Use the `%%B` conversion to escape binary ' . 'strings data.');
         throw new AphrontCharacterSetQueryException($message);
     }
 }
Example #3
0
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// An optional first command-line argument selects the environment name.
if ($argc > 1) {
    $_SERVER['PHABRICATOR_ENV'] = $argv[1];
}
// The project root is three directory levels above this script. Load the
// script bootstrap and the bundled MIME parser from there.
$root = dirname(dirname(dirname(__FILE__)));
require_once $root . '/scripts/__init_script__.php';
require_once $root . '/externals/mimemailparser/MimeMailParser.class.php';
// The raw mail message is read in full from stdin.
$parser = new MimeMailParser();
$parser->setText(file_get_contents('php://stdin'));
$text_body = $parser->getMessageBody('text');
$text_body_headers = $parser->getMessageBodyHeaders('text');
$content_type = idx($text_body_headers, 'content-type');
// If the text body is not already valid UTF-8 and the content-type header
// names a charset (quoted or bare), convert the body from that charset.
// When no charset can be found the body is left untouched.
if (!phutil_is_utf8($text_body) && (preg_match('/charset="(.*?)"/', $content_type, $matches) || preg_match('/charset=(\\S+)/', $content_type, $matches))) {
    $text_body = phutil_utf8_convert($text_body, "UTF-8", $matches[1]);
}
// Decode MIME "encoded-word" subject and from headers into UTF-8.
// NOTE(review): both headers are indexed unconditionally; presumably the
// parser always provides them -- confirm for messages lacking a subject.
$headers = $parser->getHeaders();
$headers['subject'] = iconv_mime_decode($headers['subject'], 0, "UTF-8");
$headers['from'] = iconv_mime_decode($headers['from'], 0, "UTF-8");
// Assemble the received-mail object from the decoded headers and both body
// variants. The HTML body is passed through without charset conversion.
$received = new PhabricatorMetaMTAReceivedMail();
$received->setHeaders($headers);
$received->setBodies(array('text' => $text_body, 'html' => $parser->getMessageBody('html')));
$attachments = array();
foreach ($parser->getAttachments() as $attachment) {
    if (preg_match('@text/(plain|html)@', $attachment->getContentType()) && $attachment->getContentDisposition() == 'inline') {
        // If this is an "inline" attachment with some sort of text content-type,
        // do not treat it as a file for attachment. MimeMailParser already picked
        // it up in the getMessageBody() call above. We still want to treat 'inline'
        // attachments with other content types (e.g., images) as attachments.
Example #4
0
 /**
  * Build the list of changes for the diff being created.
  *
  * For a raw diff source, the diff text is read from stdin or from the
  * output of the 'raw-command' argument, parsed, and returned immediately
  * with "message" changes removed. Otherwise the diff is obtained through the
  * repository API (Subversion, Git or Mercurial) and is then checked for an
  * excessive number of changes, for oversized files, and for hunks which are
  * not valid UTF-8, prompting the user where a decision is needed, before
  * files for the changes are uploaded.
  *
  * @return  array  Change objects as produced by the diff parser.
  * @throws  ArcanistUsageException  On mismatched SVN base revisions, an
  *                                  empty diff, or when the user declines a
  *                                  confirmation prompt.
  * @throws  Exception  If the raw diff source or the repository API is not
  *                     one of the handled kinds.
  */
 protected function generateChanges()
 {
     $parser = $this->newDiffParser();
     $is_raw = $this->isRawDiffSource();
     if ($is_raw) {
         // Raw mode: the diff text comes from stdin or from an external
         // command, never from the working copy.
         if ($this->getArgument('raw')) {
             fwrite(STDERR, pht('Reading diff from stdin...') . "\n");
             $raw_diff = file_get_contents('php://stdin');
         } else {
             if ($this->getArgument('raw-command')) {
                 list($raw_diff) = execx('%C', $this->getArgument('raw-command'));
             } else {
                 throw new Exception(pht('Unknown raw diff source.'));
             }
         }
         $changes = $parser->parseDiff($raw_diff);
         foreach ($changes as $key => $change) {
             // Remove "message" changes, e.g. from "git show".
             if ($change->getType() == ArcanistDiffChangeType::TYPE_MESSAGE) {
                 unset($changes[$key]);
             }
         }
         // Raw diffs skip every size, encoding and upload step below.
         return $changes;
     }
     $repository_api = $this->getRepositoryAPI();
     if ($repository_api instanceof ArcanistSubversionAPI) {
         $paths = $this->generateAffectedPaths();
         $this->primeSubversionWorkingCopyData($paths);
         // Check to make sure the user is diffing from a consistent base revision.
         // This is mostly just an abuse sanity check because it's silly to do this
         // and makes the code more difficult to effectively review, but it also
         // affects patches and makes them nonportable.
         $bases = $repository_api->getSVNBaseRevisions();
         // Remove all files with baserev "0"; these files are new.
         foreach ($bases as $path => $baserev) {
             if ($bases[$path] <= 0) {
                 unset($bases[$path]);
             }
         }
         if ($bases) {
             // The first remaining base revision is the reference every other
             // path is compared against.
             $rev = reset($bases);
             $revlist = array();
             foreach ($bases as $path => $baserev) {
                 $revlist[] = '    ' . pht('Revision %s, %s', $baserev, $path);
             }
             $revlist = implode("\n", $revlist);
             foreach ($bases as $path => $baserev) {
                 if ($baserev !== $rev) {
                     throw new ArcanistUsageException(pht("Base revisions of changed paths are mismatched. Update all " . "paths to the same base revision before creating a diff: " . "\n\n%s", $revlist));
                 }
             }
             // If you have a change which affects several files, all of which are
             // at a consistent base revision, treat that revision as the effective
             // base revision. The use case here is that you made a change to some
             // file, which updates it to HEAD, but want to be able to change it
             // again without updating the entire working copy. This is a little
             // sketchy but it arises in Facebook Ops workflows with config files and
             // doesn't have any real material tradeoffs (e.g., these patches are
             // perfectly applyable).
             $repository_api->overrideSVNBaseRevisionNumber($rev);
         }
         $changes = $parser->parseSubversionDiff($repository_api, $paths);
     } else {
         if ($repository_api instanceof ArcanistGitAPI) {
             $diff = $repository_api->getFullGitDiff($repository_api->getBaseCommit(), $repository_api->getHeadCommit());
             if (!strlen($diff)) {
                 throw new ArcanistUsageException(pht('No changes found. (Did you specify the wrong commit range?)'));
             }
             $changes = $parser->parseDiff($diff);
         } else {
             if ($repository_api instanceof ArcanistMercurialAPI) {
                 $diff = $repository_api->getFullMercurialDiff();
                 if (!strlen($diff)) {
                     throw new ArcanistUsageException(pht('No changes found. (Did you specify the wrong commit range?)'));
                 }
                 $changes = $parser->parseDiff($diff);
             } else {
                 throw new Exception(pht('Repository API is not supported.'));
             }
         }
     }
     // More than 250 changes requires explicit confirmation from the user.
     if (count($changes) > 250) {
         $message = pht('This diff has a very large number of changes (%s). Differential ' . 'works best for changes which will receive detailed human review, ' . 'and not as well for large automated changes or bulk checkins. ' . 'See %s for information about reviewing big checkins. Continue anyway?', phutil_count($changes), 'https://secure.phabricator.com/book/phabricator/article/' . 'differential_large_changes/');
         if (!phutil_console_confirm($message)) {
             throw new ArcanistUsageException(pht('Aborted generation of gigantic diff.'));
         }
     }
     // Per-change size limit: 4 MiB summed over the corpus of all its hunks.
     $limit = 1024 * 1024 * 4;
     foreach ($changes as $change) {
         $size = 0;
         foreach ($change->getHunks() as $hunk) {
             $size += strlen($hunk->getCorpus());
         }
         if ($size > $limit) {
             $byte_warning = pht("Diff for '%s' with context is %s bytes in length. " . "Generally, source changes should not be this large.", $change->getCurrentPath(), new PhutilNumber($size));
             if (!$this->getArgument('less-context')) {
                 $byte_warning .= ' ' . pht("If this file is a huge text file, try using the '%s' flag.", '--less-context');
             }
             // Subversion users are told how to mark the file binary and the
             // workflow stops; for other repositories the user may convert the
             // change to binary interactively.
             if ($repository_api instanceof ArcanistSubversionAPI) {
                 throw new ArcanistUsageException($byte_warning . ' ' . pht("If the file is not a text file, mark it as binary with:" . "\n\n  \$ %s\n", 'svn propset svn:mime-type application/octet-stream <filename>'));
             } else {
                 $confirm = $byte_warning . ' ' . pht("If the file is not a text file, you can mark it 'binary'. " . "Mark this file as 'binary' and continue?");
                 if (phutil_console_confirm($confirm)) {
                     $change->convertToBinaryChange($repository_api);
                 } else {
                     throw new ArcanistUsageException(pht('Aborted generation of gigantic diff.'));
                 }
             }
         }
     }
     // An explicit 'encoding' argument takes precedence; otherwise the
     // repository encoding is looked up lazily, the first time a non-UTF-8,
     // non-binary hunk is seen.
     $try_encoding = nonempty($this->getArgument('encoding'), null);
     $utf8_problems = array();
     foreach ($changes as $change) {
         foreach ($change->getHunks() as $hunk) {
             $corpus = $hunk->getCorpus();
             if (!phutil_is_utf8($corpus)) {
                 // If this corpus is heuristically binary, don't try to convert it.
                 // mb_check_encoding() and mb_convert_encoding() are both very very
                 // liberal about what they're willing to process.
                 $is_binary = ArcanistDiffUtils::isHeuristicBinaryFile($corpus);
                 if (!$is_binary) {
                     if (!$try_encoding) {
                         try {
                             $try_encoding = $this->getRepositoryEncoding();
                         } catch (ConduitClientException $e) {
                             // A bad arcanist project is reported but tolerated;
                             // any other Conduit failure is fatal.
                             if ($e->getErrorCode() == 'ERR-BAD-ARCANIST-PROJECT') {
                                 echo phutil_console_wrap(pht('Lookup of encoding in arcanist project failed: %s', $e->getMessage()) . "\n");
                             } else {
                                 throw $e;
                             }
                         }
                     }
                     if ($try_encoding) {
                         $corpus = phutil_utf8_convert($corpus, 'UTF-8', $try_encoding);
                         $name = $change->getCurrentPath();
                         if (phutil_is_utf8($corpus)) {
                             $this->writeStatusMessage(pht("Converted a '%s' hunk from '%s' to UTF-8.\n", $name, $try_encoding));
                             $hunk->setCorpus($corpus);
                             // Conversion succeeded; move on to the next hunk.
                             continue;
                         }
                     }
                 }
                 // Record the change once and stop scanning its remaining hunks.
                 $utf8_problems[] = $change;
                 break;
             }
         }
     }
     // If there are non-binary files which aren't valid UTF-8, warn the user
     // and treat them as binary changes. See D327 for discussion of why Arcanist
     // has this behavior.
     if ($utf8_problems) {
         $utf8_warning = sprintf("%s\n\n%s\n\n    %s\n", pht('This diff includes %s file(s) which are not valid UTF-8 (they ' . 'contain invalid byte sequences). You can either stop this ' . 'workflow and fix these files, or continue. If you continue, ' . 'these files will be marked as binary.', phutil_count($utf8_problems)), pht("You can learn more about how Phabricator handles character " . "encodings (and how to configure encoding settings and detect and " . "correct encoding problems) by reading 'User Guide: UTF-8 and " . "Character Encoding' in the Phabricator documentation."), pht('%s AFFECTED FILE(S)', phutil_count($utf8_problems)));
         $confirm = pht('Do you want to mark these %s file(s) as binary and continue?', phutil_count($utf8_problems));
         echo phutil_console_format("**%s**\n", pht('Invalid Content Encoding (Non-UTF8)'));
         echo phutil_console_wrap($utf8_warning);
         $file_list = mpull($utf8_problems, 'getCurrentPath');
         $file_list = '    ' . implode("\n    ", $file_list);
         echo $file_list;
         if (!phutil_console_confirm($confirm, $default_no = false)) {
             throw new ArcanistUsageException(pht('Aborted workflow to fix UTF-8.'));
         } else {
             foreach ($utf8_problems as $change) {
                 $change->convertToBinaryChange($repository_api);
             }
         }
     }
     $this->uploadFilesForChanges($changes);
     return $changes;
 }
Example #5
0
/**
 * Explain, in human-readable form, why a value can not be JSON-encoded.
 *
 * Only UTF-8 validity is examined: array keys and string values (at any
 * nesting depth) must pass phutil_is_utf8(). The first offending key or
 * value encountered is reported; nested locations are described as a
 * " > "-separated key path. Values which are not null, booleans, integers,
 * floats, arrays or strings are not inspected and yield no explanation.
 *
 * @param wild Value to validate.
 * @param string Path within the object to provide context.
 * @return string|null Explanation of why it can't be encoded, or null.
 */
function phutil_validate_json($value, $path = '')
{
    // Null, booleans and numbers never carry invalid text.
    if ($value === null || is_bool($value) || is_int($value) || is_float($value)) {
        return null;
    }

    if (is_array($value)) {
        foreach ($value as $key => $subvalue) {
            // Keys of nested containers are reported relative to the parent.
            $prefix = strlen($path) ? $path . ' > ' : '';

            if (!phutil_is_utf8($key)) {
                // Sanitize the key so the message itself is printable.
                $shown_key = $prefix . phutil_utf8ize($key);
                return pht('Dictionary key "%s" is not valid UTF8, and can not be JSON encoded.', $shown_key);
            }

            $problem = phutil_validate_json($subvalue, $prefix . $key);
            if ($problem !== null) {
                return $problem;
            }
        }
        return null;
    }

    if (!is_string($value) || phutil_is_utf8($value)) {
        return null;
    }

    // Show at most the first 256 bytes of the bad string, made printable.
    $display = phutil_utf8ize(substr($value, 0, 256));

    if (strlen($path)) {
        return pht('Dictionary value at key "%s" is not valid UTF8, and can not be ' . 'JSON encoded: %s', $path, $display);
    }

    return pht('String value is not valid UTF8, and can not be JSON encoded: %s', $display);
}
Example #6
0
 /**
  * Build the list of changes for the diff being created.
  *
  * For a raw diff source, the diff text is read from stdin or from the
  * output of the 'raw-command' argument, parsed, and returned immediately
  * with "message" changes removed. Otherwise the diff is obtained through the
  * repository API (Subversion, Git or Mercurial) and is then checked for an
  * excessive number of changes, for oversized files, and for hunks which are
  * not valid UTF-8, prompting the user where a decision is needed. Finally,
  * changes which need synthetic Git hunks are augmented with textual data and
  * the old and new contents of binary changes are uploaded.
  *
  * @return  array  Change objects as produced by the diff parser.
  * @throws  ArcanistUsageException  On mismatched SVN base revisions, an
  *                                  empty diff, a missing mbstring extension
  *                                  when conversion is required, or when the
  *                                  user declines a confirmation prompt.
  * @throws  Exception  If the raw diff source or the repository API is not
  *                     one of the handled kinds.
  */
 protected function generateChanges()
 {
     $parser = $this->newDiffParser();
     $is_raw = $this->isRawDiffSource();
     if ($is_raw) {
         // Raw mode: the diff text comes from stdin or from an external
         // command, never from the working copy.
         if ($this->getArgument('raw')) {
             file_put_contents('php://stderr', "Reading diff from stdin...\n");
             $raw_diff = file_get_contents('php://stdin');
         } else {
             if ($this->getArgument('raw-command')) {
                 // The first argument of execx() is a format pattern. Pass the
                 // user's command through the raw "%C" conversion so that any
                 // "%" it contains is not interpreted as a conversion.
                 list($raw_diff) = execx('%C', $this->getArgument('raw-command'));
             } else {
                 throw new Exception("Unknown raw diff source.");
             }
         }
         $changes = $parser->parseDiff($raw_diff);
         foreach ($changes as $key => $change) {
             // Remove "message" changes, e.g. from "git show".
             if ($change->getType() == ArcanistDiffChangeType::TYPE_MESSAGE) {
                 unset($changes[$key]);
             }
         }
         // Raw diffs skip every size, encoding and upload step below.
         return $changes;
     }
     $repository_api = $this->getRepositoryAPI();
     if ($repository_api instanceof ArcanistSubversionAPI) {
         $paths = $this->generateAffectedPaths();
         $this->primeSubversionWorkingCopyData($paths);
         // Check to make sure the user is diffing from a consistent base revision.
         // This is mostly just an abuse sanity check because it's silly to do this
         // and makes the code more difficult to effectively review, but it also
         // affects patches and makes them nonportable.
         $bases = $repository_api->getSVNBaseRevisions();
         // Remove all files with baserev "0"; these files are new.
         foreach ($bases as $path => $baserev) {
             if ($bases[$path] <= 0) {
                 unset($bases[$path]);
             }
         }
         if ($bases) {
             // The first remaining base revision is the reference every other
             // path is compared against.
             $rev = reset($bases);
             $revlist = array();
             foreach ($bases as $path => $baserev) {
                 $revlist[] = "    Revision {$baserev}, {$path}";
             }
             $revlist = implode("\n", $revlist);
             foreach ($bases as $path => $baserev) {
                 if ($baserev !== $rev) {
                     throw new ArcanistUsageException("Base revisions of changed paths are mismatched. Update all " . "paths to the same base revision before creating a diff: " . "\n\n" . $revlist);
                 }
             }
             // If you have a change which affects several files, all of which are
             // at a consistent base revision, treat that revision as the effective
             // base revision. The use case here is that you made a change to some
             // file, which updates it to HEAD, but want to be able to change it
             // again without updating the entire working copy. This is a little
             // sketchy but it arises in Facebook Ops workflows with config files and
             // doesn't have any real material tradeoffs (e.g., these patches are
             // perfectly applyable).
             $repository_api->overrideSVNBaseRevisionNumber($rev);
         }
         $changes = $parser->parseSubversionDiff($repository_api, $paths);
     } else {
         if ($repository_api instanceof ArcanistGitAPI) {
             $diff = $repository_api->getFullGitDiff();
             if (!strlen($diff)) {
                 throw new ArcanistUsageException("No changes found. (Did you specify the wrong commit range?)");
             }
             $changes = $parser->parseDiff($diff);
         } else {
             if ($repository_api instanceof ArcanistMercurialAPI) {
                 $diff = $repository_api->getFullMercurialDiff();
                 if (!strlen($diff)) {
                     throw new ArcanistUsageException("No changes found. (Did you specify the wrong commit range?)");
                 }
                 $changes = $parser->parseDiff($diff);
             } else {
                 throw new Exception("Repository API is not supported.");
             }
         }
     }
     // More than 250 changes requires explicit confirmation from the user.
     if (count($changes) > 250) {
         $count = number_format(count($changes));
         $message = "This diff has a very large number of changes ({$count}). " . "Differential works best for changes which will receive detailed " . "human review, and not as well for large automated changes or " . "bulk checkins. Continue anyway?";
         if (!phutil_console_confirm($message)) {
             throw new ArcanistUsageException("Aborted generation of gigantic diff.");
         }
     }
     // Per-change size limit: 4 MiB summed over the corpus of all its hunks.
     $limit = 1024 * 1024 * 4;
     foreach ($changes as $change) {
         $size = 0;
         foreach ($change->getHunks() as $hunk) {
             $size += strlen($hunk->getCorpus());
         }
         if ($size > $limit) {
             $file_name = $change->getCurrentPath();
             $change_size = number_format($size);
             $byte_warning = "Diff for '{$file_name}' with context is {$change_size} bytes in " . "length. Generally, source changes should not be this large.";
             if (!$this->getArgument('less-context')) {
                 $byte_warning .= " If this file is a huge text file, try using the " . "'--less-context' flag.";
             }
             // Subversion users are told how to mark the file binary and the
             // workflow stops; for other repositories the user may convert the
             // change to binary interactively.
             if ($repository_api instanceof ArcanistSubversionAPI) {
                 throw new ArcanistUsageException("{$byte_warning} If the file is not a text file, mark it as " . "binary with:" . "\n\n" . "  \$ svn propset svn:mime-type application/octet-stream <filename>" . "\n");
             } else {
                 $confirm = "{$byte_warning} If the file is not a text file, you can " . "mark it 'binary'. Mark this file as 'binary' and continue?";
                 if (phutil_console_confirm($confirm)) {
                     $change->convertToBinaryChange();
                 } else {
                     throw new ArcanistUsageException("Aborted generation of gigantic diff.");
                 }
             }
         }
     }
     // An explicit 'encoding' argument takes precedence; otherwise the
     // repository encoding is looked up lazily, the first time a non-UTF-8,
     // non-binary hunk is seen.
     $try_encoding = nonempty($this->getArgument('encoding'), null);
     $utf8_problems = array();
     foreach ($changes as $change) {
         foreach ($change->getHunks() as $hunk) {
             $corpus = $hunk->getCorpus();
             if (!phutil_is_utf8($corpus)) {
                 // If this corpus is heuristically binary, don't try to convert it.
                 // mb_check_encoding() and mb_convert_encoding() are both very very
                 // liberal about what they're willing to process.
                 $is_binary = ArcanistDiffUtils::isHeuristicBinaryFile($corpus);
                 if (!$is_binary) {
                     if (!$try_encoding) {
                         try {
                             $try_encoding = $this->getRepositoryEncoding();
                         } catch (ConduitClientException $e) {
                             // A bad arcanist project is reported but tolerated;
                             // any other Conduit failure is fatal.
                             if ($e->getErrorCode() == 'ERR-BAD-ARCANIST-PROJECT') {
                                 echo phutil_console_wrap("Lookup of encoding in arcanist project failed\n" . $e->getMessage());
                             } else {
                                 throw $e;
                             }
                         }
                     }
                     // Converting from UTF-8 to UTF-8 can not repair anything,
                     // so such a corpus goes straight to the problem list.
                     if ($try_encoding && $try_encoding != 'UTF-8') {
                         if (!function_exists('mb_convert_encoding')) {
                             throw new ArcanistUsageException("This diff includes a file encoded in '{$try_encoding}', " . "but you don't have the PHP mbstring extension installed " . "so it can't be converted to UTF-8. Install mbstring.");
                         }
                         $corpus = mb_convert_encoding($corpus, 'UTF-8', $try_encoding);
                         $name = $change->getCurrentPath();
                         if (phutil_is_utf8($corpus)) {
                             $this->writeStatusMessage("Converted a '{$name}' hunk from '{$try_encoding}' " . "to UTF-8.\n");
                             $hunk->setCorpus($corpus);
                             // Conversion succeeded; move on to the next hunk.
                             continue;
                         }
                     }
                 }
                 // Record the change once and stop scanning its remaining hunks.
                 $utf8_problems[] = $change;
                 break;
             }
         }
     }
     // If there are non-binary files which aren't valid UTF-8, warn the user
     // and treat them as binary changes. See D327 for discussion of why Arcanist
     // has this behavior.
     if ($utf8_problems) {
         // The "AFFECTED FILE(S)" header is part of the warning text: it must be
         // concatenated onto it so that it is printed just above the file list.
         $utf8_warning = pht("This diff includes file(s) which are not valid UTF-8 (they contain " . "invalid byte sequences). You can either stop this workflow and " . "fix these files, or continue. If you continue, these files will " . "be marked as binary.", count($utf8_problems)) . "\n\n" . "You can learn more about how Phabricator handles character encodings " . "(and how to configure encoding settings and detect and correct " . "encoding problems) by reading 'User Guide: UTF-8 and Character " . "Encoding' in the Phabricator documentation.\n\n" . "    " . pht('AFFECTED FILE(S)', count($utf8_problems)) . "\n";
         $confirm = pht('Do you want to mark these files as binary and continue?', count($utf8_problems));
         echo phutil_console_format("**Invalid Content Encoding (Non-UTF8)**\n");
         echo phutil_console_wrap($utf8_warning);
         $file_list = mpull($utf8_problems, 'getCurrentPath');
         $file_list = '    ' . implode("\n    ", $file_list);
         echo $file_list;
         if (!phutil_console_confirm($confirm, $default_no = false)) {
             throw new ArcanistUsageException("Aborted workflow to fix UTF-8.");
         } else {
             foreach ($utf8_problems as $change) {
                 $change->convertToBinaryChange();
             }
         }
     }
     foreach ($changes as $change) {
         $path = $change->getCurrentPath();
         // Certain types of changes (moves and copies) don't contain change data
         // when expressed in raw "git diff" form. Augment any such diffs with
         // textual data.
         if ($change->getNeedsSyntheticGitHunks()) {
             $diff = $repository_api->getRawDiffText($path, $moves = false);
             $parser = $this->newDiffParser();
             $raw_changes = $parser->parseDiff($diff);
             foreach ($raw_changes as $raw_change) {
                 // Only the first parsed change for this exact path is used.
                 if ($raw_change->getCurrentPath() == $path) {
                     $change->setFileType($raw_change->getFileType());
                     foreach ($raw_change->getHunks() as $hunk) {
                         $change->addHunk($hunk);
                     }
                     break;
                 }
             }
             $change->setNeedsSyntheticGitHunks(false);
         }
         // Everything below applies to binary changes only.
         if ($change->getFileType() != ArcanistDiffChangeType::FILE_BINARY) {
             continue;
         }
         // Upload the old and new file contents and record the resulting
         // identifier (when one was returned), size and MIME type on the change.
         $name = basename($path);
         $old_file = $repository_api->getOriginalFileData($path);
         $old_dict = $this->uploadFile($old_file, $name, 'old binary');
         if ($old_dict['guid']) {
             $change->setMetadata('old:binary-phid', $old_dict['guid']);
         }
         $change->setMetadata('old:file:size', $old_dict['size']);
         $change->setMetadata('old:file:mime-type', $old_dict['mime']);
         $new_file = $repository_api->getCurrentFileData($path);
         $new_dict = $this->uploadFile($new_file, $name, 'new binary');
         if ($new_dict['guid']) {
             $change->setMetadata('new:binary-phid', $new_dict['guid']);
         }
         $change->setMetadata('new:file:size', $new_dict['size']);
         $change->setMetadata('new:file:mime-type', $new_dict['mime']);
         // A binary change whose new content has an image MIME type is
         // reclassified as an image.
         if (preg_match('@^image/@', $new_dict['mime'])) {
             $change->setFileType(ArcanistDiffChangeType::FILE_IMAGE);
         }
     }
     return $changes;
 }
 /**
  * Choose the encoding label to record for a string.
  *
  * @param   string  String to inspect.
  * @return  string|null  'utf8' when phutil_is_utf8() accepts the string,
  *                       otherwise null.
  */
 protected function detectEncodingForStorage($string)
 {
     if (phutil_is_utf8($string)) {
         return 'utf8';
     }

     return null;
 }
 /**
  * Parse consecutive "@@ ... @@" hunks at the current line cursor and attach
  * them to a change.
  *
  * Any hunks already on the change are dropped first. Each hunk header is
  * parsed for its old/new offsets and lengths, the following lines are
  * consumed while counting additions and deletions, and the collected text
  * becomes the hunk corpus. When binary detection is enabled and a corpus is
  * not valid UTF-8 (and can not be converted), the change is marked binary
  * instead of receiving the hunk. Parsing continues for as long as the next
  * line starts another hunk header.
  *
  * If, instead of a hunk header, an empty line followed by a
  * "Property changes on:" line is found, parsing is delegated to
  * parsePropertyHunk() and its result is returned.
  *
  * @param   ArcanistDiffChange  Change which receives the parsed hunks.
  * @return  wild  Result of parsePropertyHunk() in the property-change case;
  *                otherwise nothing is returned explicitly.
  */
 protected function parseChangeset(ArcanistDiffChange $change)
 {
     // If a diff includes two sets of changes to the same file, let the
     // second one win. In particular, this occurs when adding subdirectories
     // in Subversion that contain files: the file text will be present in
     // both the directory diff and the file diff. See T5555. Dropping the
     // hunks lets whichever one shows up later win instead of showing changes
     // twice.
     $change->dropHunks();
     // NOTE(review): $all_changes is never read again in this method.
     $all_changes = array();
     do {
         $hunk = new ArcanistDiffHunk();
         $line = $this->getLineTrimmed();
         // Raw lines belonging to this hunk, joined into the corpus below.
         $real = array();
         // In the case where only one line is changed, the length is omitted.
         // The final group is for git, which appends a guess at the function
         // context to the diff.
         $matches = null;
         $ok = preg_match('/^@@ -(\\d+)(?:,(\\d+))? \\+(\\d+)(?:,(\\d+))? @@(?: .*?)?$/U', $line, $matches);
         if (!$ok) {
             // It's possible we hit the style of an svn1.7 property change.
             // This is a 4-line Index block, followed by an empty line, followed
             // by a "Property changes on:" section similar to svn1.6.
             if ($line == '') {
                 $line = $this->nextNonemptyLine();
                 $ok = preg_match('/^Property changes on:/', $line);
                 if (!$ok) {
                     $this->didFailParse(pht('Confused by empty line'));
                 }
                 $line = $this->nextLine();
                 return $this->parsePropertyHunk($change);
             }
             $this->didFailParse(pht("Expected hunk header '%s'.", '@@ -NN,NN +NN,NN @@'));
         }
         $hunk->setOldOffset($matches[1]);
         $hunk->setNewOffset($matches[3]);
         // Cover for the cases where length wasn't present (implying one line).
         $old_len = idx($matches, 2);
         if (!strlen($old_len)) {
             $old_len = 1;
         }
         $new_len = idx($matches, 4);
         if (!strlen($new_len)) {
             $new_len = 1;
         }
         $hunk->setOldLength($old_len);
         $hunk->setNewLength($new_len);
         // From here on $old_len / $new_len count DOWN the old-side and new-side
         // lines still expected; both must reach zero for a well-formed hunk.
         $add = 0;
         $del = 0;
         // Set when the cursor already rests on a line that belongs to whatever
         // follows this hunk, so it must not be advanced again afterwards.
         $hit_next_hunk = false;
         while (($line = $this->nextLine()) !== null) {
             if (strlen(rtrim($line, "\r\n"))) {
                 $char = $line[0];
             } else {
                 // Normally, we do not encounter empty lines in diffs, because
                 // unchanged lines have an initial space. However, in Git, with
                 // the option `diff.suppress-blank-empty` set, unchanged blank lines
                 // emit as completely empty. If we encounter a completely empty line,
                 // treat it as a ' ' (i.e., unchanged empty line) line.
                 $char = ' ';
             }
             switch ($char) {
                 case '\\':
                     if (!preg_match('@\\ No newline at end of file@', $line)) {
                         $this->didFailParse(pht("Expected '\\ No newline at end of file'."));
                     }
                     // While new-side lines are still expected the marker is
                     // attributed to the old side; otherwise to the new side.
                     if ($new_len) {
                         $real[] = $line;
                         $hunk->setIsMissingOldNewline(true);
                     } else {
                         $real[] = $line;
                         $hunk->setIsMissingNewNewline(true);
                     }
                     // With no new-side lines left, the marker ends the hunk.
                     if (!$new_len) {
                         break 2;
                     }
                     break;
                 case '+':
                     ++$add;
                     --$new_len;
                     $real[] = $line;
                     break;
                 case '-':
                     if (!$old_len) {
                         // In this case, we've hit "---" from a new file. So don't
                         // advance the line cursor.
                         $hit_next_hunk = true;
                         break 2;
                     }
                     ++$del;
                     --$old_len;
                     $real[] = $line;
                     break;
                 case ' ':
                     // A context line after both counters are exhausted is not
                     // part of this hunk.
                     if (!$old_len && !$new_len) {
                         break 2;
                     }
                     --$old_len;
                     --$new_len;
                     $real[] = $line;
                     break;
                 default:
                     // We hit something, likely another hunk.
                     $hit_next_hunk = true;
                     break 2;
             }
         }
         if ($old_len || $new_len) {
             $this->didFailParse(pht('Found the wrong number of hunk lines.'));
         }
         $corpus = implode('', $real);
         $is_binary = false;
         if ($this->detectBinaryFiles) {
             // Anything that is not valid UTF-8 is initially presumed binary.
             $is_binary = !phutil_is_utf8($corpus);
             $try_encoding = $this->tryEncoding;
             if ($is_binary && $try_encoding) {
                 // With a candidate encoding configured, fall back to the binary
                 // heuristic and try to convert text that merely uses another
                 // encoding.
                 $is_binary = ArcanistDiffUtils::isHeuristicBinaryFile($corpus);
                 if (!$is_binary) {
                     $corpus = phutil_utf8_convert($corpus, 'UTF-8', $try_encoding);
                     if (!phutil_is_utf8($corpus)) {
                         throw new Exception(pht("Failed to convert a hunk from '%s' to UTF-8. " . "Check that the specified encoding is correct.", $try_encoding));
                     }
                 }
             }
         }
         if ($is_binary) {
             // SVN happily treats binary files which aren't marked with the right
             // mime type as text files. Detect that junk here and mark the file
             // binary. We'll catch stuff with unicode too, but that's verboten
             // anyway. If there are too many false positives with this we might
             // need to make it threshold-triggered instead of triggering on any
             // unprintable byte.
             $change->setFileType(ArcanistDiffChangeType::FILE_BINARY);
         } else {
             $hunk->setCorpus($corpus);
             $hunk->setAddLines($add);
             $hunk->setDelLines($del);
             $change->addHunk($hunk);
         }
         // Only advance when the cursor is not already on the next section.
         if (!$hit_next_hunk) {
             $line = $this->nextNonemptyLine();
         }
     } while (preg_match('/^@@ /', $line));
 }
 /**
  * Check UTF-8 validation and BMP-only detection against a table of inputs.
  *
  * Each table entry maps an input byte string to a tuple of: the expected
  * phutil_is_utf8() result, the expected
  * phutil_is_utf8_with_only_bmp_characters() result, and a label used in the
  * assertion messages.
  *
  * Inputs are spelled with "\x" byte escapes so that every sequence stays
  * byte-exact. Literal invalid bytes in a source file are liable to be
  * rewritten as U+FFFD (itself a valid character) by text tooling, which
  * merges distinct array keys and makes the "invalid" inputs valid.
  */
 public function testUTF8BMP()
 {
     $tests = array(
         "" => array(true, true, "empty string"),
         "a" => array(true, true, "a"),
         // "a" followed by two U+0360 combining marks.
         "a\xCD\xA0\xCD\xA0" => array(true, true, "a with combining"),
         // U+2603.
         "\xE2\x98\x83" => array(true, true, "snowman"),
         // U+FFFF, the last code point in the BMP.
         "\xEF\xBF\xBF" => array(true, true, "U+FFFF"),
         // 0xC0 is not a continuation byte.
         "\xEF\xBF\xC0" => array(false, false, "Invalid, byte range."),
         // U+10000, the first code point above the BMP.
         "\xF0\x90\x80\x80" => array(true, false, "U+10000"),
         // U+1D11E.
         "\xF0\x9D\x84\x9E" => array(true, false, "gclef"),
         "musical \xF0\x9D\x84\x9E g-clef" => array(true, false, "gclef text"),
         // A 4-byte sequence missing its final byte.
         "\xF0\x9D\x84" => array(false, false, "Invalid, truncated."),
         // Overlong 3-byte encoding.
         "\xE0\x80\x80" => array(false, false, "Nonminimal 3-byte character."),
         // Lead bytes with too few continuation bytes.
         "\xCD" => array(false, false, "Partial 2-byte character."),
         "\xE0\xA0" => array(false, false, "Partial BMP 0xE0 character."),
         "\xE2\x98" => array(false, false, "Partial BMP cahracter."),
     );
     foreach ($tests as $input => $test) {
         list($expect_utf8, $expect_bmp, $test_name) = $test;
         $this->assertEqual($expect_utf8, phutil_is_utf8($input), pht('is_utf(%s)', $test_name));
         $this->assertEqual($expect_bmp, phutil_is_utf8_with_only_bmp_characters($input), pht('is_utf_bmp(%s)', $test_name));
     }
 }
 /**
  * Build the markup used to display a file's contents.
  *
  * Depending on the path and the selected view this produces one of:
  *  - an inline <img> tag using a base64 data: URI (image path, no view
  *    selected);
  *  - a centered "View ..." button linking to "?view=raw" (document path with
  *    no view selected, or data that is not valid UTF-8);
  *  - a <textarea> holding the raw data ('plain') or blame-annotated lines
  *    ('plainblame');
  *  - a syntax-highlighted table wrapped in a <div> (any other view).
  *
  * @param   mixed   Selected view name; a falsy value means none was chosen.
  * @param   object  Query object providing getRawData() and getBlameData().
  * @param   mixed   Passed through to buildDisplayRows().
  * @param   mixed   Passed through to buildDisplayRows().
  * @param   string  Path of the file being displayed.
  * @param   string  File contents.
  * @return  string  Rendered markup.
  */
 private function buildCorpus($selected, $file_query, $needs_blame, $drequest, $path, $data)
 {
     // Images are shown inline unless a specific view was requested.
     $image_mime = $this->getImageType($path);
     if ($image_mime && !$selected) {
         $data_uri = 'data:' . $image_mime . ';base64,' . base64_encode($data);
         return phutil_render_tag('img', array('style' => 'padding-bottom: 10px', 'src' => $data_uri));
     }

     // Documents (when no view was requested) and anything that is not valid
     // UTF-8 get a button linking to the raw view instead of inline content.
     $doc_kind = $this->getDocumentType($path);
     $is_unselected_document = $doc_kind && !$selected;
     if ($is_unselected_document || !phutil_is_utf8($data)) {
         // NOTE(review): the fetched raw data is not used below; the call is
         // kept as-is in case it has side effects -- confirm.
         $data = $file_query->getRawData();
         if ($doc_kind) {
             $button_label = $doc_kind;
         } else {
             $button_label = 'binary';
         }
         $button = phutil_render_tag('a', array('href' => '?view=raw', 'class' => 'button'), "View {$button_label}");
         return phutil_render_tag('p', array('style' => 'text-align: center;'), $button);
     }

     // TODO: blame of blame.
     $textarea_style = "margin: 1em 2em; width: 90%; height: 80em; font-family: monospace";

     // Loose comparison is deliberate: it matches the semantics of a switch.
     if ($selected == 'plain') {
         $escaped = phutil_escape_html($file_query->getRawData());
         return phutil_render_tag('textarea', array('style' => $textarea_style), $escaped);
     }

     if ($selected == 'plainblame') {
         list($blame_lines, $blame_revs, $blame_info) = $file_query->getBlameData();
         $annotated = array();
         foreach ($blame_lines as $index => $text) {
             $revision = $blame_revs[$index];
             // Prefer the resolved handle's name over the raw author string.
             if (isset($blame_info[$revision]['handle'])) {
                 $who = $blame_info[$revision]['handle']->getName();
             } else {
                 $who = $blame_info[$revision]['author'];
             }
             $annotated[] = sprintf("%-10s %-20s %s", substr($revision, 0, 7), $who, $text);
         }
         $escaped = phutil_escape_html(implode("\n", $annotated));
         return phutil_render_tag('textarea', array('style' => $textarea_style), $escaped);
     }

     // 'highlighted', 'blame' and every other value share the highlighted view.
     require_celerity_resource('syntax-highlighting-css');
     list($blame_lines, $blame_revs, $blame_info) = $file_query->getBlameData();
     // Highlight the file as a single string, then split it back into lines.
     $joined_source = implode("\n", $blame_lines);
     $highlighted = PhabricatorSyntaxHighlighter::highlightWithFilename($path, $joined_source);
     $highlighted_lines = explode("\n", $highlighted);
     $display_rows = $this->buildDisplayRows($highlighted_lines, $blame_revs, $blame_info, $needs_blame, $drequest, $file_query, $selected);
     $table = phutil_render_tag('table', array('class' => "diffusion-source remarkup-code PhabricatorMonospaced"), implode("\n", $display_rows));
     return phutil_render_tag('div', array('style' => 'padding: 0pt 2em;'), $table);
 }
Example #11
0
/**
 * Report whether each file is valid UTF-8 and print the offending lines.
 *
 * For every file this prints "OKAY" or "FAIL" followed by the file's name.
 * For a failing file it also prints each line that is not valid UTF-8 (after
 * passing it through show_problems()) together with up to $context
 * neighbouring lines on each side. Every printed line is prefixed with its
 * 1-based line number, and non-adjacent groups of lines are separated by a
 * blank line.
 *
 * @param   array  Files to check; each is passed to read() and name().
 * @param   int    Number of context lines to show around each bad line.
 * @return  int    Always 0.
 */
function show(array $files, $context)
{
    foreach ($files as $file) {
        $data = read($file);
        $ok = phutil_is_utf8($data);
        if ($ok) {
            echo "OKAY";
        } else {
            echo "FAIL";
        }
        echo "  " . name($file) . "\n";
        if ($ok) {
            continue;
        }

        $lines = explode("\n", $data);
        $len = count($lines);
        // $bad marks the invalid lines; $map marks every line to print (the
        // invalid lines plus their surrounding context).
        $map = array();
        $bad = array();
        foreach ($lines as $n => $line) {
            if (phutil_is_utf8($line)) {
                continue;
            }
            $bad[$n] = true;
            $first = max(0, $n - $context);
            $limit = min($len, $n + 1 + $context);
            for ($jj = $first; $jj < $limit; $jj++) {
                $map[$jj] = true;
            }
        }

        // Line numbers are printed 1-based, so size the gutter for the largest
        // printed number rather than the largest 0-based index (index 9 is
        // printed as "10"). $map can be empty, e.g. for a negative $context.
        $last_index = $map ? max(array_keys($map)) : 0;
        $width = strlen((string)($last_index + 1));

        // Set $last such that we print a newline on the first iteration
        // through the loop.
        $last = -2;
        foreach ($map as $idx => $ignored) {
            if ($idx != $last + 1) {
                printf("\n");
            }
            $last = $idx;
            $line = $lines[$idx];
            if (!empty($bad[$idx])) {
                $line = show_problems($line);
            }
            printf("  % {$width}d  %s\n", $idx + 1, $line);
        }
        echo "\n";
    }
    return 0;
}
Example #12
0
 /**
  * Parse consecutive unified-diff hunks from the current line into a change.
  *
  * Starting at the current line, each iteration reads a hunk header of the
  * form "@@ -NN,NN +NN,NN @@", consumes the hunk body, and either adds the
  * resulting hunk to $change or, when binary detection flags the body, marks
  * the change as a binary file. Parsing continues for as long as the line
  * following a hunk begins with "@@ ".
  *
  * If the current line is empty instead of a hunk header, the block is
  * treated as a "Property changes on:" section and handed to
  * parsePropertyHunk().
  *
  * @param   ArcanistDiffChange  Change to attach parsed hunks to.
  * @return  mixed  The result of parsePropertyHunk() for a property block;
  *                 otherwise nothing.
  * @throws  Exception  If converting the hunk from the configured encoding
  *                     does not yield valid UTF-8.
  */
 protected function parseChangeset(ArcanistDiffChange $change)
 {
     do {
         $hunk = new ArcanistDiffHunk();
         $line = $this->getLine();
         $real = array();
         // In the case where only one line is changed, the length is omitted.
         // The final group is for git, which appends a guess at the function
         // context to the diff.
         $matches = null;
         $ok = preg_match('/^@@ -(\\d+)(?:,(\\d+))? \\+(\\d+)(?:,(\\d+))? @@(?: .*?)?$/U', $line, $matches);
         if (!$ok) {
             // It's possible we hit the style of an svn1.7 property change.
             // This is a 4-line Index block, followed by an empty line, followed
             // by a "Property changes on:" section similar to svn1.6.
             if ($line == '') {
                 $line = $this->nextNonemptyLine();
                 $ok = preg_match('/^Property changes on:/', $line);
                 if (!$ok) {
                     $this->didFailParse("Confused by empty line");
                 }
                 $line = $this->nextLine();
                 return $this->parsePropertyHunk($change);
             }
             $this->didFailParse("Expected hunk header '@@ -NN,NN +NN,NN @@'.");
         }
         $hunk->setOldOffset($matches[1]);
         $hunk->setNewOffset($matches[3]);
         // Cover for the cases where length wasn't present (implying one line).
         $old_len = idx($matches, 2);
         if (!strlen($old_len)) {
             $old_len = 1;
         }
         $new_len = idx($matches, 4);
         if (!strlen($new_len)) {
             $new_len = 1;
         }
         $hunk->setOldLength($old_len);
         $hunk->setNewLength($new_len);
         $add = 0;
         $del = 0;
         // Whether the cursor must be moved to the next nonempty line before
         // checking for a following hunk header.
         $advance = false;
         // $old_len and $new_len count down the old-side and new-side lines
         // the header says are still to come.
         while (($line = $this->nextLine()) !== null) {
             if (strlen($line)) {
                 $char = $line[0];
             } else {
                 // An empty line is given a marker which is handled below.
                 $char = '~';
             }
             switch ($char) {
                 case '\\':
                     if (!preg_match('@\\ No newline at end of file@', $line)) {
                         $this->didFailParse("Expected '\\ No newline at end of file'.");
                     }
                     $real[] = $line;
                     if ($new_len) {
                         // New-side lines are still expected, so the marker is
                         // attributed to the old side and the hunk continues.
                         $hunk->setIsMissingOldNewline(true);
                         break;
                     }
                     $hunk->setIsMissingNewNewline(true);
                     $advance = true;
                     break 2;
                 case '+':
                     if (!$new_len) {
                         break 2;
                     }
                     ++$add;
                     --$new_len;
                     $real[] = $line;
                     break;
                 case '-':
                     if (!$old_len) {
                         break 2;
                     }
                     ++$del;
                     --$old_len;
                     $real[] = $line;
                     break;
                 case ' ':
                     if (!$old_len && !$new_len) {
                         break 2;
                     }
                     --$old_len;
                     --$new_len;
                     $real[] = $line;
                     break;
                 case '~':
                     $advance = true;
                     break 2;
                 default:
                     break 2;
             }
         }
         if ($old_len != 0 || $new_len != 0) {
             $this->didFailParse("Found the wrong number of hunk lines.");
         }
         $corpus = implode("\n", $real);
         $is_binary = false;
         if ($this->detectBinaryFiles) {
             $is_binary = !phutil_is_utf8($corpus);
             if ($is_binary && $this->tryEncoding) {
                 $is_binary = ArcanistDiffUtils::isHeuristicBinaryFile($corpus);
                 if (!$is_binary) {
                     // NOTE: This feature is HIGHLY EXPERIMENTAL and will cause a lot
                     // of issues. Use it at your own risk.
                     $corpus = mb_convert_encoding($corpus, 'UTF-8', $this->tryEncoding);
                     if (!phutil_is_utf8($corpus)) {
                         // The conversion goes from the configured encoding to
                         // UTF-8, and the message reports it in that direction.
                         throw new Exception(sprintf("Failed to convert a hunk from '%s' to UTF-8. Check that the specified encoding is correct.", $this->tryEncoding));
                     }
                 }
             }
         }
         if ($is_binary) {
             // SVN happily treats binary files which aren't marked with the right
             // mime type as text files. Detect that junk here and mark the file
             // binary. We'll catch stuff with unicode too, but that's verboten
             // anyway. If there are too many false positives with this we might
             // need to make it threshold-triggered instead of triggering on any
             // unprintable byte.
             $change->setFileType(ArcanistDiffChangeType::FILE_BINARY);
         } else {
             $hunk->setCorpus($corpus);
             $hunk->setAddLines($add);
             $hunk->setDelLines($del);
             $change->addHunk($hunk);
         }
         if ($advance) {
             $line = $this->nextNonemptyLine();
         }
     } while (preg_match('/^@@ /', $line));
 }
 /**
  * Check UTF-8 validation and BMP-only detection against a table of inputs.
  *
  * Each table entry maps an input byte string to a tuple of: the expected
  * result for phutil_is_utf8() and phutil_is_utf8_slowly(), the expected
  * phutil_is_utf8_with_only_bmp_characters() result, and a label used in the
  * assertion messages.
  *
  * Inputs are spelled with "\x" byte escapes so that every sequence stays
  * byte-exact. Literal invalid bytes in a source file are liable to be
  * rewritten as U+FFFD (itself a valid character) by text tooling, which
  * merges distinct array keys and makes the "invalid" inputs valid.
  */
 public function testUTF8BMP()
 {
     $tests = array(
         '' => array(true, true, pht('empty string')),
         'a' => array(true, true, 'a'),
         // "a" followed by two U+0360 combining marks.
         "a\xCD\xA0\xCD\xA0" => array(true, true, pht('%s with combining', 'a')),
         // U+2603.
         "\xE2\x98\x83" => array(true, true, pht('snowman')),
         // U+FFFF, the last code point in the BMP.
         "\xEF\xBF\xBF" => array(true, true, 'U+FFFF'),
         // 0xC0 is not a continuation byte.
         "\xEF\xBF\xC0" => array(false, false, pht('Invalid, byte range.')),
         // Overlong 4-byte encoding.
         "\xF0\x81\x80\x80" => array(false, false, pht('Nonminimal 4-byte character.')),
         // U+10000, the first code point above the BMP.
         "\xF0\x90\x80\x80" => array(true, false, 'U+10000'),
         // U+1D11E.
         "\xF0\x9D\x84\x9E" => array(true, false, 'gclef'),
         "musical \xF0\x9D\x84\x9E g-clef" => array(true, false, pht('gclef text')),
         // A 4-byte sequence missing its final byte.
         "\xF0\x9D\x84" => array(false, false, pht('Invalid, truncated.')),
         // Overlong 3-byte encoding.
         "\xE0\x80\x80" => array(false, false, pht('Nonminimal 3-byte character.')),
         // Lead bytes with too few continuation bytes.
         "\xCD" => array(false, false, pht('Partial 2-byte character.')),
         "\xE0\xA0" => array(false, false, pht('Partial BMP 0xE0 character.')),
         "\xE2\x98" => array(false, false, pht('Partial BMP cahracter.')),
     );
     foreach ($tests as $input => $test) {
         list($expect_utf8, $expect_bmp, $test_name) = $test;
         // Depending on what's installed on the system, this may use an
         // extension.
         $this->assertEqual($expect_utf8, phutil_is_utf8($input), pht('is_utf(%s)', $test_name));
         // Also test this against the pure PHP implementation, explicitly.
         $this->assertEqual($expect_utf8, phutil_is_utf8_slowly($input), pht('is_utf_slowly(%s)', $test_name));
         $this->assertEqual($expect_bmp, phutil_is_utf8_with_only_bmp_characters($input), pht('is_utf_bmp(%s)', $test_name));
     }
 }