/**
 * Split a hunk into smaller hunks, each holding one cluster of nearby
 * changes plus up to three lines of leading and trailing context.
 *
 * Changes which are separated by only a short run of unchanged lines are
 * kept together in one hunk, so the hunks produced here never overlap.
 *
 * @param ArcanistDiffHunk $base_hunk Hunk whose corpus should be split.
 * @return array List of ArcanistDiffHunk objects, in corpus order. The list
 *   is empty if the corpus contains no '+' or '-' lines.
 */
private function breakHunkIntoSmallHunks(ArcanistDiffHunk $base_hunk) {
  $context = 3;

  $results = array();
  $lines = phutil_split_lines($base_hunk->getCorpus());
  $n = count($lines);

  // Classify every line by its leading marker once, up front. A line which
  // is completely blank (nothing but a line ending, or nothing at all) has
  // no marker of its own: Git emits unchanged blank lines that way when
  // `diff.suppressBlankEmpty` is set. Treat those as ' ' (unchanged) lines.
  // Reading `$line[0]` directly would yield the newline character (which the
  // loops below would mistake for a change) or an uninitialized offset.
  $markers = array();
  foreach ($lines as $line) {
    if (strlen(rtrim($line, "\r\n"))) {
      $markers[] = $line[0];
    } else {
      $markers[] = ' ';
    }
  }

  $old_offset = $base_hunk->getOldOffset();
  $new_offset = $base_hunk->getNewOffset();

  $ii = 0;
  while ($ii < $n) {
    // Skip lines until we find the next line with changes. Note: this skips
    // both ' ' (no changes) and '\' (no newline at end of file) lines. If we
    // don't skip the latter, we may incorrectly generate a terminal hunk
    // that has no actual change information when a file doesn't have a
    // terminal newline and is not changed near the end of the file. 'patch'
    // will fail to apply the diff if we generate a hunk that does not
    // actually contain changes.
    for ($jj = $ii; $jj < $n; ++$jj) {
      if ($markers[$jj] === '-' || $markers[$jj] === '+') {
        break;
      }
    }
    if ($jj >= $n) {
      // No more changes in the remainder of the corpus.
      break;
    }

    // Include up to $context lines of leading context.
    $hunk_start = max($jj - $context, 0);

    // NOTE: There are two tricky considerations here.
    //
    //   - We can not generate a patch with overlapping hunks, or 'git apply'
    //     rejects it after 1.7.3.4.
    //   - We can not generate a patch with too much trailing context, or
    //     'patch' rejects it.
    //
    // So we need to ensure that we generate disjoint hunks, but don't
    // generate any hunks with too much context.

    $old_lines = 0;
    $new_lines = 0;
    $hunk_adjust = 0;
    $last_change = $jj;
    $break_here = null;
    for (; $jj < $n; ++$jj) {
      $marker = $markers[$jj];

      if ($marker === ' ') {
        $distance = $jj - $last_change;

        if ($distance > $context && $break_here === null) {
          // We haven't seen a change in $context lines, so this is a
          // potential place to break the hunk. However, we need to keep
          // looking in case there is another change fewer than $context
          // lines away, in which case we have to merge the hunks.
          $break_here = $jj;
        }

        if ($distance > ($context + 1) * 2) {
          // We definitely aren't going to merge this with the next hunk, so
          // break out of the loop. We'll end the hunk at $break_here.
          break;
        }

        continue;
      }

      // Anything other than an unchanged line extends the current hunk.
      $break_here = null;
      $last_change = $jj;

      if ($marker === '\\') {
        // When we have a "\ No newline at end of file" line, it does not
        // contribute to either hunk length.
        ++$hunk_adjust;
      } else if ($marker === '-') {
        ++$old_lines;
      } else if ($marker === '+') {
        ++$new_lines;
      }
    }

    if ($break_here !== null) {
      $jj = $break_here;
    }

    $hunk_length = min($jj, $n) - $hunk_start;
    $count_length = $hunk_length - $hunk_adjust;

    // Added lines exist only on the new side and removed lines only on the
    // old side, so each side's length excludes the other side's lines.
    $hunk = new ArcanistDiffHunk();
    $hunk->setOldOffset($old_offset + $hunk_start - $ii);
    $hunk->setNewOffset($new_offset + $hunk_start - $ii);
    $hunk->setOldLength($count_length - $new_lines);
    $hunk->setNewLength($count_length - $old_lines);

    $corpus = array_slice($lines, $hunk_start, $hunk_length);
    $corpus = implode('', $corpus);
    $hunk->setCorpus($corpus);

    $results[] = $hunk;

    // Advance both offsets past the lines consumed by this iteration.
    $old_offset += $jj - $ii - $new_lines;
    $new_offset += $jj - $ii - $old_lines;
    $ii = $jj;
  }

  return $results;
}
/**
 * Parse a run of consecutive "@@ ... @@" hunks at the parser cursor and
 * attach them to a change.
 *
 * Any hunks already on the change are dropped first. If the content of a
 * hunk looks binary, the change is marked as a binary file instead of
 * receiving that hunk.
 *
 * @param ArcanistDiffChange $change Change which receives the parsed hunks.
 * @return mixed Result of parsePropertyHunk() when an svn 1.7 style
 *   property block is found in place of a hunk header; otherwise nothing.
 */
protected function parseChangeset(ArcanistDiffChange $change) {
  // If a diff includes two sets of changes to the same file, let the
  // second one win. In particular, this occurs when adding subdirectories
  // in Subversion that contain files: the file text will be present in
  // both the directory diff and the file diff. See T5555. Dropping the
  // hunks lets whichever one shows up later win instead of showing changes
  // twice.
  $change->dropHunks();

  do {
    $hunk = new ArcanistDiffHunk();
    $line = $this->getLineTrimmed();
    $real = array();

    // In the case where only one line is changed, the length is omitted.
    // The final group is for git, which appends a guess at the function
    // context to the diff.
    $matches = null;
    $ok = preg_match(
      '/^@@ -(\\d+)(?:,(\\d+))? \\+(\\d+)(?:,(\\d+))? @@(?: .*?)?$/U',
      $line,
      $matches);

    if (!$ok) {
      // It's possible we hit the style of an svn1.7 property change.
      // This is a 4-line Index block, followed by an empty line, followed
      // by a "Property changes on:" section similar to svn1.6.
      if ($line == '') {
        $line = $this->nextNonemptyLine();
        $ok = preg_match('/^Property changes on:/', $line);
        if (!$ok) {
          $this->didFailParse(pht('Confused by empty line'));
        }
        // Step past the "Property changes on:" line before handing off.
        $line = $this->nextLine();
        return $this->parsePropertyHunk($change);
      }
      $this->didFailParse(
        pht("Expected hunk header '%s'.", '@@ -NN,NN +NN,NN @@'));
    }

    $hunk->setOldOffset($matches[1]);
    $hunk->setNewOffset($matches[3]);

    // Cover for the cases where length wasn't present (implying one line).
    // When the trailing optional group does not participate in the match
    // (for example "@@ -1,2 +1 @@"), it is absent from $matches, so the
    // lookup can produce null. Check for that explicitly: passing null to
    // strlen() is deprecated as of PHP 8.1.
    $old_len = idx($matches, 2);
    if ($old_len === null || !strlen($old_len)) {
      $old_len = 1;
    }

    $new_len = idx($matches, 4);
    if ($new_len === null || !strlen($new_len)) {
      $new_len = 1;
    }

    $hunk->setOldLength($old_len);
    $hunk->setNewLength($new_len);

    // From here on, $old_len and $new_len count down the lines which are
    // still expected on each side of the hunk.
    $add = 0;
    $del = 0;

    $hit_next_hunk = false;
    while (($line = $this->nextLine()) !== null) {
      if (strlen(rtrim($line, "\r\n"))) {
        $char = $line[0];
      } else {
        // Normally, we do not encounter empty lines in diffs, because
        // unchanged lines have an initial space. However, in Git, with
        // the option `diff.suppressBlankEmpty` set, unchanged blank lines
        // emit as completely empty. If we encounter a completely empty
        // line, treat it as a ' ' (i.e., unchanged empty line) line.
        $char = ' ';
      }

      switch ($char) {
        case '\\':
          if (!preg_match('@\\ No newline at end of file@', $line)) {
            $this->didFailParse(
              pht("Expected '\\ No newline at end of file'."));
          }
          $real[] = $line;
          if ($new_len) {
            // New-side lines are still outstanding, so this marker is
            // attributed to the old side of the hunk.
            $hunk->setIsMissingOldNewline(true);
            break;
          }
          // Nothing left to read on the new side: the marker belongs to
          // the new side and ends the hunk.
          $hunk->setIsMissingNewNewline(true);
          break 2;
        case '+':
          ++$add;
          --$new_len;
          $real[] = $line;
          break;
        case '-':
          if (!$old_len) {
            // In this case, we've hit "---" from a new file. So don't
            // advance the line cursor.
            $hit_next_hunk = true;
            break 2;
          }
          ++$del;
          --$old_len;
          $real[] = $line;
          break;
        case ' ':
          if (!$old_len && !$new_len) {
            // The hunk is already complete; this line is not part of it.
            break 2;
          }
          --$old_len;
          --$new_len;
          $real[] = $line;
          break;
        default:
          // We hit something, likely another hunk.
          $hit_next_hunk = true;
          break 2;
      }
    }

    if ($old_len || $new_len) {
      $this->didFailParse(pht('Found the wrong number of hunk lines.'));
    }

    $corpus = implode('', $real);

    $is_binary = false;
    if ($this->detectBinaryFiles) {
      $is_binary = !phutil_is_utf8($corpus);
      $try_encoding = $this->tryEncoding;

      if ($is_binary && $try_encoding) {
        // The text is not valid UTF-8, but a fallback encoding was
        // provided. Unless the content looks binary by heuristic, try to
        // convert it.
        $is_binary = ArcanistDiffUtils::isHeuristicBinaryFile($corpus);
        if (!$is_binary) {
          $corpus = phutil_utf8_convert($corpus, 'UTF-8', $try_encoding);
          if (!phutil_is_utf8($corpus)) {
            throw new Exception(
              pht(
                "Failed to convert a hunk from '%s' to UTF-8. " .
                "Check that the specified encoding is correct.",
                $try_encoding));
          }
        }
      }
    }

    if ($is_binary) {
      // SVN happily treats binary files which aren't marked with the right
      // mime type as text files. Detect that junk here and mark the file
      // binary. We'll catch stuff with unicode too, but that's verboten
      // anyway. If there are too many false positives with this we might
      // need to make it threshold-triggered instead of triggering on any
      // unprintable byte.
      $change->setFileType(ArcanistDiffChangeType::FILE_BINARY);
    } else {
      $hunk->setCorpus($corpus);
      $hunk->setAddLines($add);
      $hunk->setDelLines($del);
      $change->addHunk($hunk);
    }

    if (!$hit_next_hunk) {
      $line = $this->nextNonemptyLine();
    }

    // NOTE(review): $line is presumably null once the input is exhausted.
    // Guard against that, because passing null to preg_match() is
    // deprecated as of PHP 8.1.
  } while ($line !== null && preg_match('/^@@ /', $line));
}
/**
 * Parse a run of consecutive "@@ ... @@" hunks at the parser cursor and
 * attach them to a change.
 *
 * If the content of a hunk looks binary, the change is marked as a binary
 * file instead of receiving that hunk.
 *
 * @param ArcanistDiffChange $change Change which receives the parsed hunks.
 * @return mixed Result of parsePropertyHunk() when an svn 1.7 style
 *   property block is found in place of a hunk header; otherwise nothing.
 */
protected function parseChangeset(ArcanistDiffChange $change) {
  do {
    $hunk = new ArcanistDiffHunk();
    $line = $this->getLine();
    $real = array();

    // In the case where only one line is changed, the length is omitted.
    // The final group is for git, which appends a guess at the function
    // context to the diff.
    $matches = null;
    $ok = preg_match(
      '/^@@ -(\\d+)(?:,(\\d+))? \\+(\\d+)(?:,(\\d+))? @@(?: .*?)?$/U',
      $line,
      $matches);

    if (!$ok) {
      // It's possible we hit the style of an svn1.7 property change.
      // This is a 4-line Index block, followed by an empty line, followed
      // by a "Property changes on:" section similar to svn1.6.
      if ($line == '') {
        $line = $this->nextNonemptyLine();
        $ok = preg_match('/^Property changes on:/', $line);
        if (!$ok) {
          $this->didFailParse("Confused by empty line");
        }
        // Step past the "Property changes on:" line before handing off.
        $line = $this->nextLine();
        return $this->parsePropertyHunk($change);
      }
      $this->didFailParse("Expected hunk header '@@ -NN,NN +NN,NN @@'.");
    }

    $hunk->setOldOffset($matches[1]);
    $hunk->setNewOffset($matches[3]);

    // Cover for the cases where length wasn't present (implying one line).
    // When the trailing optional group does not participate in the match
    // (for example "@@ -1,2 +1 @@"), it is absent from $matches, so the
    // lookup can produce null. Check for that explicitly: passing null to
    // strlen() is deprecated as of PHP 8.1.
    $old_len = idx($matches, 2);
    if ($old_len === null || !strlen($old_len)) {
      $old_len = 1;
    }

    $new_len = idx($matches, 4);
    if ($new_len === null || !strlen($new_len)) {
      $new_len = 1;
    }

    $hunk->setOldLength($old_len);
    $hunk->setNewLength($new_len);

    // From here on, $old_len and $new_len count down the lines which are
    // still expected on each side of the hunk.
    $add = 0;
    $del = 0;

    // Set when the line which ended the hunk has been consumed, so the
    // cursor must move on before looking for the next hunk header.
    $advance = false;
    while (($line = $this->nextLine()) !== null) {
      // An empty line gets the sentinel marker '~', which ends the hunk.
      $char = strlen($line) ? $line[0] : '~';

      switch ($char) {
        case '\\':
          if (!preg_match('@\\ No newline at end of file@', $line)) {
            $this->didFailParse("Expected '\\ No newline at end of file'.");
          }
          $real[] = $line;
          if ($new_len) {
            // New-side lines are still outstanding, so this marker is
            // attributed to the old side of the hunk.
            $hunk->setIsMissingOldNewline(true);
            break;
          }
          // Nothing left to read on the new side: the marker belongs to
          // the new side and ends the hunk.
          $hunk->setIsMissingNewNewline(true);
          $advance = true;
          break 2;
        case '+':
          if (!$new_len) {
            // The hunk is already complete; this line is not part of it.
            break 2;
          }
          ++$add;
          --$new_len;
          $real[] = $line;
          break;
        case '-':
          if (!$old_len) {
            // The hunk is already complete; this is presumably the "---"
            // header of the next file, so leave the cursor on it.
            break 2;
          }
          ++$del;
          --$old_len;
          $real[] = $line;
          break;
        case ' ':
          if (!$old_len && !$new_len) {
            break 2;
          }
          --$old_len;
          --$new_len;
          $real[] = $line;
          break;
        case '~':
          $advance = true;
          break 2;
        default:
          // Anything else is not hunk content; leave the cursor on it.
          break 2;
      }
    }

    if ($old_len != 0 || $new_len != 0) {
      $this->didFailParse("Found the wrong number of hunk lines.");
    }

    $corpus = implode("\n", $real);

    $is_binary = false;
    if ($this->detectBinaryFiles) {
      $is_binary = !phutil_is_utf8($corpus);

      if ($is_binary && $this->tryEncoding) {
        // The text is not valid UTF-8, but a fallback encoding was
        // provided. Unless the content looks binary by heuristic, try to
        // convert it.
        $is_binary = ArcanistDiffUtils::isHeuristicBinaryFile($corpus);
        if (!$is_binary) {
          // NOTE: This feature is HIGHLY EXPERIMENTAL and will cause a lot
          // of issues. Use it at your own risk.
          $corpus = mb_convert_encoding(
            $corpus,
            'UTF-8',
            $this->tryEncoding);
          if (!phutil_is_utf8($corpus)) {
            // The conversion runs from the configured encoding to UTF-8,
            // so report it in that direction.
            throw new Exception(
              "Failed to convert a hunk from '".$this->tryEncoding.
              "' to UTF-8.");
          }
        }
      }
    }

    if ($is_binary) {
      // SVN happily treats binary files which aren't marked with the right
      // mime type as text files. Detect that junk here and mark the file
      // binary. We'll catch stuff with unicode too, but that's verboten
      // anyway. If there are too many false positives with this we might
      // need to make it threshold-triggered instead of triggering on any
      // unprintable byte.
      $change->setFileType(ArcanistDiffChangeType::FILE_BINARY);
    } else {
      $hunk->setCorpus($corpus);
      $hunk->setAddLines($add);
      $hunk->setDelLines($del);
      $change->addHunk($hunk);
    }

    if ($advance) {
      $line = $this->nextNonemptyLine();
    }

    // $line is null when the loop above ran out of input. Guard against
    // that, because passing null to preg_match() is deprecated as of
    // PHP 8.1.
  } while ($line !== null && preg_match('/^@@ /', $line));
}