Merge branch 'jz/patch-id'

A new "--include-whitespace" option is added to "git patch-id", and
existing bugs in the internal patch-id logic that did not match
what "git patch-id" produces have been corrected.

* jz/patch-id:
  builtin: patch-id: remove unused diff-tree prefix
  builtin: patch-id: add --verbatim as a command mode
  patch-id: fix patch-id for mode changes
  builtin: patch-id: fix patch-id with binary diffs
  patch-id: use stable patch-id for rebases
  patch-id: fix stable patch id for binary / header-only
This commit is contained in:
Taylor Blau 2022-10-30 21:04:41 -04:00
commit 160314e625
9 changed files with 284 additions and 96 deletions

View File

@ -8,18 +8,18 @@ git-patch-id - Compute unique ID for a patch
SYNOPSIS
--------
[verse]
'git patch-id' [--stable | --unstable]
'git patch-id' [--stable | --unstable | --verbatim]
DESCRIPTION
-----------
Read a patch from the standard input and compute the patch ID for it.
A "patch ID" is nothing but a sum of SHA-1 of the file diffs associated with a
patch, with whitespace and line numbers ignored. As such, it's "reasonably
stable", but at the same time also reasonably unique, i.e., two patches that
have the same "patch ID" are almost guaranteed to be the same thing.
patch, with line numbers ignored. As such, it's "reasonably stable", but at
the same time also reasonably unique, i.e., two patches that have the same
"patch ID" are almost guaranteed to be the same thing.
IOW, you can use this thing to look for likely duplicate commits.
The main usecase for this command is to look for likely duplicate commits.
When dealing with 'git diff-tree' output, it takes advantage of
the fact that the patch is prefixed with the object name of the
@ -30,6 +30,12 @@ This can be used to make a mapping from patch ID to commit ID.
OPTIONS
-------
--verbatim::
Calculate the patch-id of the input as it is given, do not strip
any whitespace.
This is the default if patchid.verbatim is true.
--stable::
Use a "stable" sum of hashes as the patch ID. With this option:
- Reordering file diffs that make up a patch does not affect the ID.
@ -45,14 +51,16 @@ OPTIONS
of "-O<orderfile>", thereby making existing databases storing such
"unstable" or historical patch-ids unusable.
- All whitespace within the patch is ignored and does not affect the id.
This is the default if patchid.stable is set to true.
--unstable::
Use an "unstable" hash as the patch ID. With this option,
the result produced is compatible with the patch-id value produced
by git 1.9 and older. Users with pre-existing databases storing
patch-ids produced by git 1.9 and older (who do not deal with reordered
patches) may want to use this option.
by git 1.9 and older and whitespace is ignored. Users with pre-existing
databases storing patch-ids produced by git 1.9 and older (who do not deal
with reordered patches) may want to use this option.
This is the default.

View File

@ -1763,7 +1763,7 @@ static void prepare_bases(struct base_tree_info *bases,
struct object_id *patch_id;
if (*commit_base_at(&commit_base, commit))
continue;
if (commit_patch_id(commit, &diffopt, &oid, 0, 1))
if (commit_patch_id(commit, &diffopt, &oid, 0))
die(_("cannot get patch id"));
ALLOC_GROW(bases->patch_id, bases->nr_patch_id + 1, bases->alloc_patch_id);
patch_id = bases->patch_id + bases->nr_patch_id;

View File

@ -2,6 +2,7 @@
#include "builtin.h"
#include "config.h"
#include "diff.h"
#include "parse-options.h"
static void flush_current_id(int patchlen, struct object_id *id, struct object_id *result)
{
@ -57,10 +58,12 @@ static int scan_hunk_header(const char *p, int *p_before, int *p_after)
}
static int get_one_patchid(struct object_id *next_oid, struct object_id *result,
struct strbuf *line_buf, int stable)
struct strbuf *line_buf, int stable, int verbatim)
{
int patchlen = 0, found_next = 0;
int before = -1, after = -1;
int diff_is_binary = 0;
char pre_oid_str[GIT_MAX_HEXSZ + 1], post_oid_str[GIT_MAX_HEXSZ + 1];
git_hash_ctx ctx;
the_hash_algo->init_fn(&ctx);
@ -71,11 +74,14 @@ static int get_one_patchid(struct object_id *next_oid, struct object_id *result,
const char *p = line;
int len;
if (!skip_prefix(line, "diff-tree ", &p) &&
!skip_prefix(line, "commit ", &p) &&
/* Possibly skip over the prefix added by "log" or "format-patch" */
if (!skip_prefix(line, "commit ", &p) &&
!skip_prefix(line, "From ", &p) &&
starts_with(line, "\\ ") && 12 < strlen(line))
starts_with(line, "\\ ") && 12 < strlen(line)) {
if (verbatim)
the_hash_algo->update_fn(&ctx, line, strlen(line));
continue;
}
if (!get_oid_hex(p, next_oid)) {
found_next = 1;
@ -88,14 +94,44 @@ static int get_one_patchid(struct object_id *next_oid, struct object_id *result,
/* Parsing diff header? */
if (before == -1) {
if (starts_with(line, "index "))
if (starts_with(line, "GIT binary patch") ||
starts_with(line, "Binary files")) {
diff_is_binary = 1;
before = 0;
the_hash_algo->update_fn(&ctx, pre_oid_str,
strlen(pre_oid_str));
the_hash_algo->update_fn(&ctx, post_oid_str,
strlen(post_oid_str));
if (stable)
flush_one_hunk(result, &ctx);
continue;
else if (starts_with(line, "--- "))
} else if (skip_prefix(line, "index ", &p)) {
char *oid1_end = strstr(line, "..");
char *oid2_end = NULL;
if (oid1_end)
oid2_end = strstr(oid1_end, " ");
if (!oid2_end)
oid2_end = line + strlen(line) - 1;
if (oid1_end != NULL && oid2_end != NULL) {
*oid1_end = *oid2_end = '\0';
strlcpy(pre_oid_str, p, GIT_MAX_HEXSZ + 1);
strlcpy(post_oid_str, oid1_end + 2, GIT_MAX_HEXSZ + 1);
}
continue;
} else if (starts_with(line, "--- "))
before = after = 1;
else if (!isalpha(line[0]))
break;
}
if (diff_is_binary) {
if (starts_with(line, "diff ")) {
diff_is_binary = 0;
before = -1;
}
continue;
}
/* Looking for a valid hunk header? */
if (before == 0 && after == 0) {
if (starts_with(line, "@@ -")) {
@ -120,8 +156,8 @@ static int get_one_patchid(struct object_id *next_oid, struct object_id *result,
if (line[0] == '+' || line[0] == ' ')
after--;
/* Compute the sha without whitespace */
len = remove_space(line);
/* Add line to hash algo (possibly removing whitespace) */
len = verbatim ? strlen(line) : remove_space(line);
patchlen += len;
the_hash_algo->update_fn(&ctx, line, len);
}
@ -134,7 +170,7 @@ static int get_one_patchid(struct object_id *next_oid, struct object_id *result,
return patchlen;
}
static void generate_id_list(int stable)
static void generate_id_list(int stable, int verbatim)
{
struct object_id oid, n, result;
int patchlen;
@ -142,21 +178,32 @@ static void generate_id_list(int stable)
oidclr(&oid);
while (!feof(stdin)) {
patchlen = get_one_patchid(&n, &result, &line_buf, stable);
patchlen = get_one_patchid(&n, &result, &line_buf, stable, verbatim);
flush_current_id(patchlen, &oid, &result);
oidcpy(&oid, &n);
}
strbuf_release(&line_buf);
}
static const char patch_id_usage[] = "git patch-id [--stable | --unstable]";
static const char *const patch_id_usage[] = {
N_("git patch-id [--stable | --unstable | --verbatim]"), NULL
};
struct patch_id_opts {
int stable;
int verbatim;
};
static int git_patch_id_config(const char *var, const char *value, void *cb)
{
int *stable = cb;
struct patch_id_opts *opts = cb;
if (!strcmp(var, "patchid.stable")) {
*stable = git_config_bool(var, value);
opts->stable = git_config_bool(var, value);
return 0;
}
if (!strcmp(var, "patchid.verbatim")) {
opts->verbatim = git_config_bool(var, value);
return 0;
}
@ -165,21 +212,29 @@ static int git_patch_id_config(const char *var, const char *value, void *cb)
int cmd_patch_id(int argc, const char **argv, const char *prefix)
{
int stable = -1;
/* if nothing is set, default to unstable */
struct patch_id_opts config = {0, 0};
int opts = 0;
struct option builtin_patch_id_options[] = {
OPT_CMDMODE(0, "unstable", &opts,
N_("use the unstable patch-id algorithm"), 1),
OPT_CMDMODE(0, "stable", &opts,
N_("use the stable patch-id algorithm"), 2),
OPT_CMDMODE(0, "verbatim", &opts,
N_("don't strip whitespace from the patch"), 3),
OPT_END()
};
git_config(git_patch_id_config, &stable);
git_config(git_patch_id_config, &config);
/* If nothing is set, default to unstable. */
if (stable < 0)
stable = 0;
/* verbatim implies stable */
if (config.verbatim)
config.stable = 1;
if (argc == 2 && !strcmp(argv[1], "--stable"))
stable = 1;
else if (argc == 2 && !strcmp(argv[1], "--unstable"))
stable = 0;
else if (argc != 1)
usage(patch_id_usage);
argc = parse_options(argc, argv, prefix, builtin_patch_id_options,
patch_id_usage, 0);
generate_id_list(stable);
generate_id_list(opts ? opts > 1 : config.stable,
opts ? opts == 3 : config.verbatim);
return 0;
}

75
diff.c
View File

@ -6229,7 +6229,7 @@ static void patch_id_add_mode(git_hash_ctx *ctx, unsigned mode)
}
/* returns 0 upon success, and writes result into oid */
static int diff_get_patch_id(struct diff_options *options, struct object_id *oid, int diff_header_only, int stable)
static int diff_get_patch_id(struct diff_options *options, struct object_id *oid, int diff_header_only)
{
struct diff_queue_struct *q = &diff_queued_diff;
int i;
@ -6276,61 +6276,62 @@ static int diff_get_patch_id(struct diff_options *options, struct object_id *oid
if (p->one->mode == 0) {
patch_id_add_string(&ctx, "newfilemode");
patch_id_add_mode(&ctx, p->two->mode);
patch_id_add_string(&ctx, "---/dev/null");
patch_id_add_string(&ctx, "+++b/");
the_hash_algo->update_fn(&ctx, p->two->path, len2);
} else if (p->two->mode == 0) {
patch_id_add_string(&ctx, "deletedfilemode");
patch_id_add_mode(&ctx, p->one->mode);
patch_id_add_string(&ctx, "---a/");
the_hash_algo->update_fn(&ctx, p->one->path, len1);
patch_id_add_string(&ctx, "+++/dev/null");
} else {
patch_id_add_string(&ctx, "---a/");
the_hash_algo->update_fn(&ctx, p->one->path, len1);
patch_id_add_string(&ctx, "+++b/");
the_hash_algo->update_fn(&ctx, p->two->path, len2);
} else if (p->one->mode != p->two->mode) {
patch_id_add_string(&ctx, "oldmode");
patch_id_add_mode(&ctx, p->one->mode);
patch_id_add_string(&ctx, "newmode");
patch_id_add_mode(&ctx, p->two->mode);
}
if (diff_header_only)
continue;
if (fill_mmfile(options->repo, &mf1, p->one) < 0 ||
fill_mmfile(options->repo, &mf2, p->two) < 0)
return error("unable to read files to diff");
if (diff_filespec_is_binary(options->repo, p->one) ||
if (diff_header_only) {
/* don't do anything since we're only populating header info */
} else if (diff_filespec_is_binary(options->repo, p->one) ||
diff_filespec_is_binary(options->repo, p->two)) {
the_hash_algo->update_fn(&ctx, oid_to_hex(&p->one->oid),
the_hash_algo->hexsz);
the_hash_algo->update_fn(&ctx, oid_to_hex(&p->two->oid),
the_hash_algo->hexsz);
continue;
} else {
if (p->one->mode == 0) {
patch_id_add_string(&ctx, "---/dev/null");
patch_id_add_string(&ctx, "+++b/");
the_hash_algo->update_fn(&ctx, p->two->path, len2);
} else if (p->two->mode == 0) {
patch_id_add_string(&ctx, "---a/");
the_hash_algo->update_fn(&ctx, p->one->path, len1);
patch_id_add_string(&ctx, "+++/dev/null");
} else {
patch_id_add_string(&ctx, "---a/");
the_hash_algo->update_fn(&ctx, p->one->path, len1);
patch_id_add_string(&ctx, "+++b/");
the_hash_algo->update_fn(&ctx, p->two->path, len2);
}
if (fill_mmfile(options->repo, &mf1, p->one) < 0 ||
fill_mmfile(options->repo, &mf2, p->two) < 0)
return error("unable to read files to diff");
xpp.flags = 0;
xecfg.ctxlen = 3;
xecfg.flags = XDL_EMIT_NO_HUNK_HDR;
if (xdi_diff_outf(&mf1, &mf2, NULL,
patch_id_consume, &data, &xpp, &xecfg))
return error("unable to generate patch-id diff for %s",
p->one->path);
}
xpp.flags = 0;
xecfg.ctxlen = 3;
xecfg.flags = XDL_EMIT_NO_HUNK_HDR;
if (xdi_diff_outf(&mf1, &mf2, NULL,
patch_id_consume, &data, &xpp, &xecfg))
return error("unable to generate patch-id diff for %s",
p->one->path);
if (stable)
flush_one_hunk(oid, &ctx);
flush_one_hunk(oid, &ctx);
}
if (!stable)
the_hash_algo->final_oid_fn(oid, &ctx);
return 0;
}
int diff_flush_patch_id(struct diff_options *options, struct object_id *oid, int diff_header_only, int stable)
int diff_flush_patch_id(struct diff_options *options, struct object_id *oid, int diff_header_only)
{
struct diff_queue_struct *q = &diff_queued_diff;
int i;
int result = diff_get_patch_id(options, oid, diff_header_only, stable);
int result = diff_get_patch_id(options, oid, diff_header_only);
for (i = 0; i < q->nr; i++)
diff_free_filepair(q->queue[i]);

2
diff.h
View File

@ -634,7 +634,7 @@ int run_diff_files(struct rev_info *revs, unsigned int option);
int run_diff_index(struct rev_info *revs, unsigned int option);
int do_diff_cache(const struct object_id *, struct diff_options *);
int diff_flush_patch_id(struct diff_options *, struct object_id *, int, int);
int diff_flush_patch_id(struct diff_options *, struct object_id *, int);
void flush_one_hunk(struct object_id *result, git_hash_ctx *ctx);
int diff_result_code(struct diff_options *, int);

View File

@ -11,7 +11,7 @@ static int patch_id_defined(struct commit *commit)
}
int commit_patch_id(struct commit *commit, struct diff_options *options,
struct object_id *oid, int diff_header_only, int stable)
struct object_id *oid, int diff_header_only)
{
if (!patch_id_defined(commit))
return -1;
@ -22,7 +22,7 @@ int commit_patch_id(struct commit *commit, struct diff_options *options,
else
diff_root_tree_oid(&commit->object.oid, "", options);
diffcore_std(options);
return diff_flush_patch_id(options, oid, diff_header_only, stable);
return diff_flush_patch_id(options, oid, diff_header_only);
}
/*
@ -48,11 +48,11 @@ static int patch_id_neq(const void *cmpfn_data,
b = container_of(entry_or_key, struct patch_id, ent);
if (is_null_oid(&a->patch_id) &&
commit_patch_id(a->commit, opt, &a->patch_id, 0, 0))
commit_patch_id(a->commit, opt, &a->patch_id, 0))
return error("Could not get patch ID for %s",
oid_to_hex(&a->commit->object.oid));
if (is_null_oid(&b->patch_id) &&
commit_patch_id(b->commit, opt, &b->patch_id, 0, 0))
commit_patch_id(b->commit, opt, &b->patch_id, 0))
return error("Could not get patch ID for %s",
oid_to_hex(&b->commit->object.oid));
return !oideq(&a->patch_id, &b->patch_id);
@ -82,7 +82,7 @@ static int init_patch_id_entry(struct patch_id *patch,
struct object_id header_only_patch_id;
patch->commit = commit;
if (commit_patch_id(commit, &ids->diffopts, &header_only_patch_id, 1, 0))
if (commit_patch_id(commit, &ids->diffopts, &header_only_patch_id, 1))
return -1;
hashmap_entry_init(&patch->ent, oidhash(&header_only_patch_id));

View File

@ -20,7 +20,7 @@ struct patch_ids {
};
int commit_patch_id(struct commit *commit, struct diff_options *options,
struct object_id *oid, int, int);
struct object_id *oid, int);
int init_patch_ids(struct repository *, struct patch_ids *);
int free_patch_ids(struct patch_ids *);

View File

@ -43,15 +43,26 @@ test_expect_success 'setup: 500 lines' '
git add newfile &&
git commit -q -m "add small file" &&
git cherry-pick main >/dev/null 2>&1
'
git cherry-pick main >/dev/null 2>&1 &&
test_expect_success 'setup attributes' '
echo "file binary" >.gitattributes
git branch -f squashed main &&
git checkout -q -f squashed &&
git reset -q --soft HEAD~2 &&
git commit -q -m squashed &&
git branch -f mode main &&
git checkout -q -f mode &&
test_chmod +x file &&
git commit -q -a --amend &&
git branch -f modeother other &&
git checkout -q -f modeother &&
test_chmod +x file &&
git commit -q -a --amend
'
test_expect_success 'detect upstream patch' '
git checkout -q main &&
git checkout -q main^{} &&
scramble file &&
git add file &&
git commit -q -m "change big file again" &&
@ -61,14 +72,46 @@ test_expect_success 'detect upstream patch' '
test_must_be_empty revs
'
test_expect_success 'detect upstream patch binary' '
echo "file binary" >.gitattributes &&
git checkout -q other^{} &&
git rebase main &&
git rev-list main...HEAD~ >revs &&
test_must_be_empty revs &&
test_when_finished "rm .gitattributes"
'
test_expect_success 'detect upstream patch modechange' '
git checkout -q modeother^{} &&
git rebase mode &&
git rev-list mode...HEAD~ >revs &&
test_must_be_empty revs
'
test_expect_success 'do not drop patch' '
git branch -f squashed main &&
git checkout -q -f squashed &&
git reset -q --soft HEAD~2 &&
git commit -q -m squashed &&
git checkout -q other^{} &&
test_must_fail git rebase squashed &&
git rebase --quit
test_when_finished "git rebase --abort"
'
test_expect_success 'do not drop patch binary' '
echo "file binary" >.gitattributes &&
git checkout -q other^{} &&
test_must_fail git rebase squashed &&
test_when_finished "git rebase --abort" &&
test_when_finished "rm .gitattributes"
'
test_expect_success 'do not drop patch modechange' '
git checkout -q modeother^{} &&
git rebase other &&
cat >expected <<-\EOF &&
diff --git a/file b/file
old mode 100644
new mode 100755
EOF
git diff HEAD~ >modediff &&
test_cmp expected modediff
'
test_done

View File

@ -8,13 +8,13 @@ export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
. ./test-lib.sh
test_expect_success 'setup' '
as="a a a a a a a a" && # eight a
test_write_lines $as >foo &&
test_write_lines $as >bar &&
str="ab cd ef gh ij kl mn op" &&
test_write_lines $str >foo &&
test_write_lines $str >bar &&
git add foo bar &&
git commit -a -m initial &&
test_write_lines $as b >foo &&
test_write_lines $as b >bar &&
test_write_lines $str b >foo &&
test_write_lines $str b >bar &&
git commit -a -m first &&
git checkout -b same main &&
git commit --amend -m same-msg &&
@ -22,8 +22,23 @@ test_expect_success 'setup' '
echo c >foo &&
echo c >bar &&
git commit --amend -a -m notsame-msg &&
git checkout -b with_space main~ &&
cat >foo <<-\EOF &&
a b
c d
e f
g h
i j
k l
m n
op
EOF
cp foo bar &&
git add foo bar &&
git commit --amend -m "with spaces" &&
test_write_lines bar foo >bar-then-foo &&
test_write_lines foo bar >foo-then-bar
'
test_expect_success 'patch-id output is well-formed' '
@ -42,7 +57,7 @@ calc_patch_id () {
}
get_top_diff () {
git log -p -1 "$@" -O bar-then-foo --
git log -p -1 "$@" -O bar-then-foo --full-index --
}
get_patch_id () {
@ -61,6 +76,33 @@ test_expect_success 'patch-id detects inequality' '
get_patch_id notsame &&
! test_cmp patch-id_main patch-id_notsame
'
test_expect_success 'patch-id detects equality binary' '
cat >.gitattributes <<-\EOF &&
foo binary
bar binary
EOF
get_patch_id main &&
get_patch_id same &&
git log -p -1 --binary main >top-diff.output &&
calc_patch_id <top-diff.output main_binpatch &&
git log -p -1 --binary same >top-diff.output &&
calc_patch_id <top-diff.output same_binpatch &&
test_cmp patch-id_main patch-id_main_binpatch &&
test_cmp patch-id_same patch-id_same_binpatch &&
test_cmp patch-id_main patch-id_same &&
test_when_finished "rm .gitattributes"
'
test_expect_success 'patch-id detects inequality binary' '
cat >.gitattributes <<-\EOF &&
foo binary
bar binary
EOF
get_patch_id main &&
get_patch_id notsame &&
! test_cmp patch-id_main patch-id_notsame &&
test_when_finished "rm .gitattributes"
'
test_expect_success 'patch-id supports git-format-patch output' '
get_patch_id main &&
@ -101,9 +143,21 @@ test_patch_id_file_order () {
git format-patch -1 --stdout -O foo-then-bar >format-patch.output &&
calc_patch_id <format-patch.output "ordered-$name" "$@" &&
cmp_patch_id $relevant "$name" "ordered-$name"
}
test_patch_id_whitespace () {
relevant="$1"
shift
name="ws-${1}-$relevant"
shift
get_top_diff "main~" >top-diff.output &&
calc_patch_id <top-diff.output "$name" "$@" &&
get_top_diff "with_space" >top-diff.output &&
calc_patch_id <top-diff.output "ws-$name" "$@" &&
cmp_patch_id $relevant "$name" "ws-$name"
}
# combined test for options: add more tests here to make them
# run with all options
test_patch_id () {
@ -119,6 +173,14 @@ test_expect_success 'file order is relevant with --unstable' '
test_patch_id_file_order relevant --unstable --unstable
'
test_expect_success 'whitespace is relevant with --verbatim' '
test_patch_id_whitespace relevant --verbatim --verbatim
'
test_expect_success 'whitespace is irrelevant without --verbatim' '
test_patch_id_whitespace irrelevant --stable --stable
'
#Now test various option combinations.
test_expect_success 'default is unstable' '
test_patch_id relevant default
@ -134,6 +196,17 @@ test_expect_success 'patchid.stable = false is unstable' '
test_patch_id relevant patchid.stable=false
'
test_expect_success 'patchid.verbatim = true is correct and stable' '
test_config patchid.verbatim true &&
test_patch_id_whitespace relevant patchid.verbatim=true &&
test_patch_id irrelevant patchid.verbatim=true
'
test_expect_success 'patchid.verbatim = false is unstable' '
test_config patchid.verbatim false &&
test_patch_id relevant patchid.verbatim=false
'
test_expect_success '--unstable overrides patchid.stable = true' '
test_config patchid.stable true &&
test_patch_id relevant patchid.stable=true--unstable --unstable
@ -144,6 +217,11 @@ test_expect_success '--stable overrides patchid.stable = false' '
test_patch_id irrelevant patchid.stable=false--stable --stable
'
test_expect_success '--verbatim overrides patchid.stable = false' '
test_config patchid.stable false &&
test_patch_id_whitespace relevant stable=false--verbatim --verbatim
'
test_expect_success 'patch-id supports git-format-patch MIME output' '
get_patch_id main &&
git checkout same &&
@ -198,7 +276,10 @@ test_expect_success 'patch-id handles no-nl-at-eof markers' '
EOF
calc_patch_id nonl <nonl &&
calc_patch_id withnl <withnl &&
test_cmp patch-id_nonl patch-id_withnl
test_cmp patch-id_nonl patch-id_withnl &&
calc_patch_id nonl-inc-ws --verbatim <nonl &&
calc_patch_id withnl-inc-ws --verbatim <withnl &&
! test_cmp patch-id_nonl-inc-ws patch-id_withnl-inc-ws
'
test_expect_success 'patch-id handles diffs with one line of before/after' '