Merge branch 'jk/fast-export-anonym-alt'

"git fast-export --anonymize" learned to take customized mapping to
allow its users to make its output more usable for debugging.

* jk/fast-export-anonym-alt:
  fast-export: use local array to store anonymized oid
  fast-export: anonymize "master" refname
  fast-export: allow seeding the anonymized mapping
  fast-export: add a "data" callback parameter to anonymize_str()
  fast-export: move global "idents" anonymize hashmap into function
  fast-export: use a flex array to store anonymized entries
  fast-export: stop storing lengths in anonymized hashmaps
  fast-export: tighten anonymize_mem() interface to handle only strings
  fast-export: store anonymized oids as hex strings
  fast-export: use xmemdupz() for anonymizing oids
  t9351: derive anonymized tree checks from original repo
This commit is contained in:
Junio C Hamano 2020-07-06 22:09:17 -07:00
commit 0a23331aa6
3 changed files with 174 additions and 71 deletions

View File

@ -119,6 +119,11 @@ by keeping the marks the same across runs.
the shape of the history and stored tree. See the section on
`ANONYMIZING` below.
--anonymize-map=<from>[:<to>]::
Convert token `<from>` to `<to>` in the anonymized output. If
`<to>` is omitted, map `<from>` to itself (i.e., do not
anonymize it). See the section on `ANONYMIZING` below.
--reference-excluded-parents::
By default, running a command such as `git fast-export
master~5..master` will not include the commit master{tilde}5
@ -238,6 +243,30 @@ collapse "User 0", "User 1", etc into "User X"). This produces a much
smaller output, and it is usually easy to quickly confirm that there is
no private data in the stream.
Reproducing some bugs may require referencing particular commits or
paths, which becomes challenging after refnames and paths have been
anonymized. You can ask for a particular token to be left as-is or
mapped to a new value. For example, if you have a bug which reproduces
with `git rev-list sensitive -- secret.c`, you can run:
---------------------------------------------------
$ git fast-export --anonymize --all \
--anonymize-map=sensitive:foo \
--anonymize-map=secret.c:bar.c \
>stream
---------------------------------------------------
After importing the stream, you can then run `git rev-list foo -- bar.c`
in the anonymized repository.
Note that paths and refnames are split into tokens at slash boundaries.
The command above would anonymize `subdir/secret.c` as something like
`path123/bar.c`; you could then search for `bar.c` in the anonymized
repository to determine the final pathname.
To make referencing the final pathname simpler, you can map each path
component; so if you also anonymize `subdir` to `publicdir`, then the
final pathname would be `publicdir/bar.c`.
LIMITATIONS
-----------

View File

@ -45,6 +45,7 @@ static struct string_list extra_refs = STRING_LIST_INIT_NODUP;
static struct string_list tag_refs = STRING_LIST_INIT_NODUP;
static struct refspec refspecs = REFSPEC_INIT_FETCH;
static int anonymize;
static struct hashmap anonymized_seeds;
static struct revision_sources revision_sources;
static int parse_opt_signed_tag_mode(const struct option *opt,
@ -119,25 +120,34 @@ static int has_unshown_parent(struct commit *commit)
}
struct anonymized_entry {
struct hashmap_entry hash;
const char *anon;
const char orig[FLEX_ARRAY];
};
struct anonymized_entry_key {
struct hashmap_entry hash;
const char *orig;
size_t orig_len;
const char *anon;
size_t anon_len;
};
static int anonymized_entry_cmp(const void *unused_cmp_data,
const struct hashmap_entry *eptr,
const struct hashmap_entry *entry_or_key,
const void *unused_keydata)
const void *keydata)
{
const struct anonymized_entry *a, *b;
a = container_of(eptr, const struct anonymized_entry, hash);
b = container_of(entry_or_key, const struct anonymized_entry, hash);
if (keydata) {
const struct anonymized_entry_key *key = keydata;
int equal = !strncmp(a->orig, key->orig, key->orig_len) &&
!a->orig[key->orig_len];
return !equal;
}
return a->orig_len != b->orig_len ||
memcmp(a->orig, b->orig, a->orig_len);
b = container_of(entry_or_key, const struct anonymized_entry, hash);
return strcmp(a->orig, b->orig);
}
/*
@ -145,31 +155,39 @@ static int anonymized_entry_cmp(const void *unused_cmp_data,
* the same anonymized string with another. The actual generation
* is farmed out to the generate function.
*/
static const void *anonymize_mem(struct hashmap *map,
void *(*generate)(const void *, size_t *),
const void *orig, size_t *len)
static const char *anonymize_str(struct hashmap *map,
char *(*generate)(void *),
const char *orig, size_t len,
void *data)
{
struct anonymized_entry key, *ret;
struct anonymized_entry_key key;
struct anonymized_entry *ret;
if (!map->cmpfn)
hashmap_init(map, anonymized_entry_cmp, NULL, 0);
hashmap_entry_init(&key.hash, memhash(orig, *len));
hashmap_entry_init(&key.hash, memhash(orig, len));
key.orig = orig;
key.orig_len = *len;
ret = hashmap_get_entry(map, &key, hash, NULL);
key.orig_len = len;
/* First check if it's a token the user configured manually... */
if (anonymized_seeds.cmpfn)
ret = hashmap_get_entry(&anonymized_seeds, &key, hash, &key);
else
ret = NULL;
/* ...otherwise check if we've already seen it in this context... */
if (!ret)
ret = hashmap_get_entry(map, &key, hash, &key);
/* ...and finally generate a new mapping if necessary */
if (!ret) {
ret = xmalloc(sizeof(*ret));
FLEX_ALLOC_MEM(ret, orig, orig, len);
hashmap_entry_init(&ret->hash, key.hash.hash);
ret->orig = xstrdup(orig);
ret->orig_len = *len;
ret->anon = generate(orig, len);
ret->anon_len = *len;
ret->anon = generate(data);
hashmap_put(map, &ret->hash);
}
*len = ret->anon_len;
return ret->anon;
}
@ -181,13 +199,13 @@ static const void *anonymize_mem(struct hashmap *map,
*/
static void anonymize_path(struct strbuf *out, const char *path,
struct hashmap *map,
void *(*generate)(const void *, size_t *))
char *(*generate)(void *))
{
while (*path) {
const char *end_of_component = strchrnul(path, '/');
size_t len = end_of_component - path;
const char *c = anonymize_mem(map, generate, path, &len);
strbuf_add(out, c, len);
const char *c = anonymize_str(map, generate, path, len, NULL);
strbuf_addstr(out, c);
path = end_of_component;
if (*path)
strbuf_addch(out, *path++);
@ -361,12 +379,12 @@ static void print_path_1(const char *path)
printf("%s", path);
}
static void *anonymize_path_component(const void *path, size_t *len)
static char *anonymize_path_component(void *data)
{
static int counter;
struct strbuf out = STRBUF_INIT;
strbuf_addf(&out, "path%d", counter++);
return strbuf_detach(&out, len);
return strbuf_detach(&out, NULL);
}
static void print_path(const char *path)
@ -383,20 +401,23 @@ static void print_path(const char *path)
}
}
static void *generate_fake_oid(const void *old, size_t *len)
static char *generate_fake_oid(void *data)
{
static uint32_t counter = 1; /* avoid null oid */
const unsigned hashsz = the_hash_algo->rawsz;
unsigned char *out = xcalloc(hashsz, 1);
unsigned char out[GIT_MAX_RAWSZ];
char *hex = xmallocz(GIT_MAX_HEXSZ);
hashclr(out);
put_be32(out + hashsz - 4, counter++);
return out;
return hash_to_hex_algop_r(hex, out, the_hash_algo);
}
static const struct object_id *anonymize_oid(const struct object_id *oid)
static const char *anonymize_oid(const char *oid_hex)
{
static struct hashmap objs;
size_t len = the_hash_algo->rawsz;
return anonymize_mem(&objs, generate_fake_oid, oid, &len);
size_t len = strlen(oid_hex);
return anonymize_str(&objs, generate_fake_oid, oid_hex, len, NULL);
}
static void show_filemodify(struct diff_queue_struct *q,
@ -455,9 +476,9 @@ static void show_filemodify(struct diff_queue_struct *q,
*/
if (no_data || S_ISGITLINK(spec->mode))
printf("M %06o %s ", spec->mode,
oid_to_hex(anonymize ?
anonymize_oid(&spec->oid) :
&spec->oid));
anonymize ?
anonymize_oid(oid_to_hex(&spec->oid)) :
oid_to_hex(&spec->oid));
else {
struct object *object = lookup_object(the_repository,
&spec->oid);
@ -493,12 +514,12 @@ static const char *find_encoding(const char *begin, const char *end)
return bol;
}
static void *anonymize_ref_component(const void *old, size_t *len)
static char *anonymize_ref_component(void *data)
{
static int counter;
struct strbuf out = STRBUF_INIT;
strbuf_addf(&out, "ref%d", counter++);
return strbuf_detach(&out, len);
return strbuf_detach(&out, NULL);
}
static const char *anonymize_refname(const char *refname)
@ -517,13 +538,6 @@ static const char *anonymize_refname(const char *refname)
static struct strbuf anon = STRBUF_INIT;
int i;
/*
* We also leave "master" as a special case, since it does not reveal
* anything interesting.
*/
if (!strcmp(refname, "refs/heads/master"))
return refname;
strbuf_reset(&anon);
for (i = 0; i < ARRAY_SIZE(prefixes); i++) {
if (skip_prefix(refname, prefixes[i], &refname)) {
@ -546,14 +560,13 @@ static char *anonymize_commit_message(const char *old)
return xstrfmt("subject %d\n\nbody\n", counter++);
}
static struct hashmap idents;
static void *anonymize_ident(const void *old, size_t *len)
static char *anonymize_ident(void *data)
{
static int counter;
struct strbuf out = STRBUF_INIT;
strbuf_addf(&out, "User %d <user%d@example.com>", counter, counter);
counter++;
return strbuf_detach(&out, len);
return strbuf_detach(&out, NULL);
}
/*
@ -563,6 +576,7 @@ static void *anonymize_ident(const void *old, size_t *len)
*/
static void anonymize_ident_line(const char **beg, const char **end)
{
static struct hashmap idents;
static struct strbuf buffers[] = { STRBUF_INIT, STRBUF_INIT };
static unsigned which_buffer;
@ -588,9 +602,9 @@ static void anonymize_ident_line(const char **beg, const char **end)
size_t len;
len = split.mail_end - split.name_begin;
ident = anonymize_mem(&idents, anonymize_ident,
split.name_begin, &len);
strbuf_add(out, ident, len);
ident = anonymize_str(&idents, anonymize_ident,
split.name_begin, len, NULL);
strbuf_addstr(out, ident);
strbuf_addch(out, ' ');
strbuf_add(out, split.date_begin, split.tz_end - split.date_begin);
} else {
@ -712,9 +726,10 @@ static void handle_commit(struct commit *commit, struct rev_info *rev,
if (mark)
printf(":%d\n", mark);
else
printf("%s\n", oid_to_hex(anonymize ?
anonymize_oid(&obj->oid) :
&obj->oid));
printf("%s\n",
anonymize ?
anonymize_oid(oid_to_hex(&obj->oid)) :
oid_to_hex(&obj->oid));
i++;
}
@ -729,12 +744,12 @@ static void handle_commit(struct commit *commit, struct rev_info *rev,
show_progress();
}
static void *anonymize_tag(const void *old, size_t *len)
static char *anonymize_tag(void *data)
{
static int counter;
struct strbuf out = STRBUF_INIT;
strbuf_addf(&out, "tag message %d", counter++);
return strbuf_detach(&out, len);
return strbuf_detach(&out, NULL);
}
static void handle_tail(struct object_array *commits, struct rev_info *revs,
@ -804,8 +819,8 @@ static void handle_tag(const char *name, struct tag *tag)
name = anonymize_refname(name);
if (message) {
static struct hashmap tags;
message = anonymize_mem(&tags, anonymize_tag,
message, &message_size);
message = anonymize_str(&tags, anonymize_tag,
message, message_size, NULL);
}
}
@ -1136,6 +1151,37 @@ static void handle_deletes(void)
}
}
/*
 * Generator callback used when seeding a mapping from --anonymize-map:
 * rather than inventing a replacement token, return a copy (made with
 * xstrdup()) of the string handed in through "data".
 */
static char *anonymize_seed(void *data)
{
	const char *replacement = data;

	return xstrdup(replacement);
}
/*
 * Option callback for --anonymize-map=<from>[:<to>].
 *
 * "arg" is split at its first ':' into the token to look for and the
 * text to emit in its place; when there is no ':', the token maps to
 * itself. The pair is recorded in the hashmap that opt->value points to.
 *
 * Returns 0 on success, or the result of error() when either the token
 * or its replacement is empty.
 */
static int parse_opt_anonymize_map(const struct option *opt,
				   const char *arg, int unset)
{
	struct hashmap *seeds = opt->value;
	const char *colon;
	const char *to;
	size_t from_len;

	BUG_ON_OPT_NEG(unset);

	colon = strchr(arg, ':');
	from_len = colon ? (size_t)(colon - arg) : strlen(arg);
	to = colon ? colon + 1 : arg;

	if (!from_len || !*to)
		return error(_("--anonymize-map token cannot be empty"));

	/* anonymize_seed() copies "to", so pointing into arg is fine here. */
	anonymize_str(seeds, anonymize_seed, arg, from_len, (void *)to);
	return 0;
}
int cmd_fast_export(int argc, const char **argv, const char *prefix)
{
struct rev_info revs;
@ -1177,6 +1223,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
OPT_STRING_LIST(0, "refspec", &refspecs_list, N_("refspec"),
N_("Apply refspec to exported refs")),
OPT_BOOL(0, "anonymize", &anonymize, N_("anonymize output")),
OPT_CALLBACK_F(0, "anonymize-map", &anonymized_seeds, N_("from:to"),
N_("convert <from> to <to> in anonymized output"),
PARSE_OPT_NONEG, parse_opt_anonymize_map),
OPT_BOOL(0, "reference-excluded-parents",
&reference_excluded_commits, N_("Reference parents which are not in fast-export stream by object id")),
OPT_BOOL(0, "show-original-ids", &show_original_ids,
@ -1204,6 +1253,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
if (argc > 1)
usage_with_options (fast_export_usage, options);
if (anonymized_seeds.cmpfn && !anonymize)
die(_("--anonymize-map without --anonymize does not make sense"));
if (refspecs_list.nr) {
int i;

View File

@ -6,15 +6,24 @@ test_description='basic tests for fast-export --anonymize'
test_expect_success 'setup simple repo' '
test_commit base &&
test_commit foo &&
test_commit retain-me &&
git checkout -b other HEAD^ &&
mkdir subdir &&
test_commit subdir/bar &&
test_commit subdir/xyzzy &&
fake_commit=$(echo $ZERO_OID | sed s/0/a/) &&
git update-index --add --cacheinfo 160000,$fake_commit,link1 &&
git update-index --add --cacheinfo 160000,$fake_commit,link2 &&
git commit -m "add gitlink" &&
git tag -m "annotated tag" mytag
'
test_expect_success 'export anonymized stream' '
git fast-export --anonymize --all >stream
git fast-export --anonymize --all \
--anonymize-map=retain-me \
--anonymize-map=xyzzy:custom-name \
--anonymize-map=other \
>stream
'
# this also covers commit messages
@ -26,12 +35,23 @@ test_expect_success 'stream omits path names' '
! grep xyzzy stream
'
test_expect_success 'stream allows master as refname' '
grep master stream
test_expect_success 'stream contains user-specified names' '
grep retain-me stream &&
grep custom-name stream
'
test_expect_success 'stream omits gitlink oids' '
# avoid relying on the whole oid to remain hash-agnostic; this is
# plenty to be unique within our test case
! grep a000000000000000000 stream
'
test_expect_success 'stream retains other as refname' '
grep other stream
'
test_expect_success 'stream omits other refnames' '
! grep other stream &&
! grep master stream &&
! grep mytag stream
'
@ -57,7 +77,8 @@ test_expect_success 'import stream to new repository' '
test_expect_success 'result has two branches' '
git for-each-ref --format="%(refname)" refs/heads >branches &&
test_line_count = 2 branches &&
other_branch=$(grep -v refs/heads/master branches)
other_branch=refs/heads/other &&
main_branch=$(grep -v $other_branch branches)
'
test_expect_success 'repo has original shape and timestamps' '
@ -65,34 +86,35 @@ test_expect_success 'repo has original shape and timestamps' '
git log --format="%m %ct" --left-right --boundary "$@"
} &&
(cd .. && shape master...other) >expect &&
shape master...$other_branch >actual &&
shape $main_branch...$other_branch >actual &&
test_cmp expect actual
'
test_expect_success 'root tree has original shape' '
# the output entries are not necessarily in the same
# order, but we know at least that we will have one tree
# and one blob, so just check the sorted order
cat >expect <<-\EOF &&
blob
tree
EOF
# order, but we should at least have the same set of
# object types.
git -C .. ls-tree HEAD >orig-root &&
cut -d" " -f2 <orig-root | sort >expect &&
git ls-tree $other_branch >root &&
cut -d" " -f2 <root | sort >actual &&
test_cmp expect actual
'
test_expect_success 'paths in subdir ended up in one tree' '
cat >expect <<-\EOF &&
blob
blob
EOF
git -C .. ls-tree other:subdir >orig-subdir &&
cut -d" " -f2 <orig-subdir | sort >expect &&
tree=$(grep tree root | cut -f2) &&
git ls-tree $other_branch:$tree >tree &&
cut -d" " -f2 <tree >actual &&
test_cmp expect actual
'
test_expect_success 'identical gitlinks got identical oid' '
awk "/commit/ { print \$3 }" <root | sort -u >commits &&
test_line_count = 1 commits
'
test_expect_success 'tag points to branch tip' '
git rev-parse $other_branch >expect &&
git for-each-ref --format="%(*objectname)" | grep . >actual &&