From 98037f2bf2fb82845179f3311d8f64f7e27d77c4 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 23 Jun 2020 17:47:00 +0000 Subject: [PATCH 01/10] commit-graph: place bloom_settings in context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Place an instance of struct bloom_settings into the struct write_commit_graph_context. This allows simplifying the function prototype of write_graph_chunk_bloom_data(). This will allow us to combine the function prototypes and use function pointers to simplify write_commit_graph_file(). By using a pointer, we can later replace the settings to match those that exist in the current commit-graph, in case a future Git version allows customization of these parameters. Reported-by: SZEDER Gábor Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- commit-graph.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 887837e882..d0fedcd9b1 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -882,6 +882,7 @@ struct write_commit_graph_context { const struct split_commit_graph_opts *split_opts; size_t total_bloom_filter_data_size; + const struct bloom_filter_settings *bloom_settings; }; static void write_graph_chunk_fanout(struct hashfile *f, @@ -1103,8 +1104,7 @@ static void write_graph_chunk_bloom_indexes(struct hashfile *f, } static void write_graph_chunk_bloom_data(struct hashfile *f, - struct write_commit_graph_context *ctx, - const struct bloom_filter_settings *settings) + struct write_commit_graph_context *ctx) { struct commit **list = ctx->commits.list; struct commit **last = ctx->commits.list + ctx->commits.nr; @@ -1116,9 +1116,9 @@ static void write_graph_chunk_bloom_data(struct hashfile *f, _("Writing changed paths Bloom filters data"), ctx->commits.nr); - hashwrite_be32(f, settings->hash_version); - hashwrite_be32(f, settings->num_hashes); - hashwrite_be32(f, settings->bits_per_entry); + hashwrite_be32(f, ctx->bloom_settings->hash_version); + hashwrite_be32(f, ctx->bloom_settings->num_hashes); + hashwrite_be32(f, ctx->bloom_settings->bits_per_entry); while (list < last) { struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0); @@ -1541,6 +1541,8 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) struct object_id file_hash; const struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS; + ctx->bloom_settings = &bloom_settings; + if (ctx->split) { struct strbuf tmp_file = STRBUF_INIT; @@ -1642,7 +1644,7 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) write_graph_chunk_extra_edges(f, ctx); if (ctx->changed_paths) { write_graph_chunk_bloom_indexes(f, ctx); - write_graph_chunk_bloom_data(f, ctx, &bloom_settings); + write_graph_chunk_bloom_data(f, ctx); } if (ctx->num_commit_graphs_after > 1 && write_graph_chunk_base(f, ctx)) { From 7b671f8c2b6b7de511fd2f6587e4540c25764b61 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 23 Jun 2020 17:47:01 +0000 Subject: [PATCH 02/10] commit-graph: change test to die on parse, not load 43d3561 (commit-graph write: don't die if the existing graph is corrupt, 2019-03-25) introduced the GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD environment variable. This was created to verify that commit-graph was not loaded when writing a new non-incremental commit-graph. An upcoming change wants to load a commit-graph in some valuable cases, but we want to maintain that we don't trust the commit-graph data when writing our new file. Instead of dying on load, instead die if we ever try to parse a commit from the commit-graph. This functionally verifies the same intended behavior, but allows a more advanced feature in the next change. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- commit-graph.c | 12 ++++++++---- commit-graph.h | 2 +- t/t5318-commit-graph.sh | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index d0fedcd9b1..6a28d4a5a6 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -564,10 +564,6 @@ static int prepare_commit_graph(struct repository *r) return !!r->objects->commit_graph; r->objects->commit_graph_attempted = 1; - if (git_env_bool(GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD, 0)) - die("dying as requested by the '%s' variable on commit-graph load!", - GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD); - prepare_repo_settings(r); if (!git_env_bool(GIT_TEST_COMMIT_GRAPH, 0) && @@ -790,6 +786,14 @@ static int parse_commit_in_graph_one(struct repository *r, int parse_commit_in_graph(struct repository *r, struct commit *item) { + static int checked_env = 0; + + if (!checked_env && + git_env_bool(GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE, 0)) + die("dying as requested by the '%s' variable on commit-graph parse!", + GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE); + checked_env = 1; + if (!prepare_commit_graph(r)) return 0; return parse_commit_in_graph_one(r, r->objects->commit_graph, item); diff --git a/commit-graph.h b/commit-graph.h index 881c9b46e5..f0fb13e3f2 100644 --- a/commit-graph.h +++ b/commit-graph.h @@ -5,7 +5,7 @@ #include "object-store.h" #define GIT_TEST_COMMIT_GRAPH "GIT_TEST_COMMIT_GRAPH" -#define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD" +#define GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE "GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE" #define GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS "GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS" /* diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index 1073f9e3cf..5ec01abdaa 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -436,7 +436,7 @@ corrupt_graph_verify() { cp $objdir/info/commit-graph commit-graph-pre-write-test fi && git status --short && - GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD=true git commit-graph write && + GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE=true git commit-graph write && git commit-graph verify } From 949197420e3da13c06c6a15fd4b4ed3120753c42 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Wed, 1 Jul 2020 13:27:23 +0000 Subject: [PATCH 03/10] bloom: fix logic in get_bloom_filter() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The get_bloom_filter() method is a bit complicated in some parts where it does not need to be. In particular, it needs to return a NULL filter only when compute_if_not_present is zero AND the filter data cannot be loaded from a commit-graph file. This currently happens by accident because the commit-graph does not load changed-path Bloom filters from an existing commit-graph when writing a new one. This will change in a later patch. Also clean up some style issues while we are here. One side-effect of returning a NULL filter is that the filters that are reported as "too large" will now be reported as NULL insead of length zero. This case was not properly covered before, so add a test. Further, remote the counting of the zero-length filters from revision.c and the trace2 logs. Helped-by: René Scharfe Helped-by: SZEDER Gábor Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- bloom.c | 14 ++++++-------- commit-graph.c | 8 ++++++-- revision.c | 7 ------- t/t4216-log-bloom.sh | 24 ++++++++++++++++++++++-- 4 files changed, 34 insertions(+), 19 deletions(-) diff --git a/bloom.c b/bloom.c index c38d1cff0c..2af5389795 100644 --- a/bloom.c +++ b/bloom.c @@ -186,7 +186,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r, struct diff_options diffopt; int max_changes = 512; - if (bloom_filters.slab_size == 0) + if (!bloom_filters.slab_size) return NULL; filter = bloom_filter_slab_at(&bloom_filters, c); @@ -194,16 +194,14 @@ struct bloom_filter *get_bloom_filter(struct repository *r, if (!filter->data) { load_commit_graph_info(r, c); if (c->graph_pos != COMMIT_NOT_FROM_GRAPH && - r->objects->commit_graph->chunk_bloom_indexes) { - if (load_bloom_filter_from_graph(r->objects->commit_graph, filter, c)) - return filter; - else - return NULL; - } + r->objects->commit_graph->chunk_bloom_indexes) + load_bloom_filter_from_graph(r->objects->commit_graph, filter, c); } - if (filter->data || !compute_if_not_present) + if (filter->data) return filter; + if (!compute_if_not_present) + return NULL; repo_diff_setup(r, &diffopt); diffopt.flags.recursive = 1; diff --git a/commit-graph.c b/commit-graph.c index 6a28d4a5a6..50ce039a53 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -1098,7 +1098,8 @@ static void write_graph_chunk_bloom_indexes(struct hashfile *f, while (list < last) { struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0); - cur_pos += filter->len; + size_t len = filter ? filter->len : 0; + cur_pos += len; display_progress(progress, ++i); hashwrite_be32(f, cur_pos); list++; @@ -1126,8 +1127,11 @@ static void write_graph_chunk_bloom_data(struct hashfile *f, while (list < last) { struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0); + size_t len = filter ? filter->len : 0; display_progress(progress, ++i); - hashwrite(f, filter->data, filter->len * sizeof(unsigned char)); + + if (len) + hashwrite(f, filter->data, len * sizeof(unsigned char)); list++; } diff --git a/revision.c b/revision.c index c644c66091..7339750af1 100644 --- a/revision.c +++ b/revision.c @@ -633,7 +633,6 @@ static unsigned int count_bloom_filter_maybe; static unsigned int count_bloom_filter_definitely_not; static unsigned int count_bloom_filter_false_positive; static unsigned int count_bloom_filter_not_present; -static unsigned int count_bloom_filter_length_zero; static void trace2_bloom_filter_statistics_atexit(void) { @@ -641,7 +640,6 @@ static void trace2_bloom_filter_statistics_atexit(void) jw_object_begin(&jw, 0); jw_object_intmax(&jw, "filter_not_present", count_bloom_filter_not_present); - jw_object_intmax(&jw, "zero_length_filter", count_bloom_filter_length_zero); jw_object_intmax(&jw, "maybe", count_bloom_filter_maybe); jw_object_intmax(&jw, "definitely_not", count_bloom_filter_definitely_not); jw_object_intmax(&jw, "false_positive", count_bloom_filter_false_positive); @@ -735,11 +733,6 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs, return -1; } - if (!filter->len) { - count_bloom_filter_length_zero++; - return -1; - } - result = bloom_filter_contains(filter, revs->bloom_key, revs->bloom_filter_settings); diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh index c7011f33e2..2761208e74 100755 --- a/t/t4216-log-bloom.sh +++ b/t/t4216-log-bloom.sh @@ -60,7 +60,7 @@ setup () { test_bloom_filters_used () { log_args=$1 - bloom_trace_prefix="statistics:{\"filter_not_present\":0,\"zero_length_filter\":0,\"maybe\"" + bloom_trace_prefix="statistics:{\"filter_not_present\":0,\"maybe\"" setup "$log_args" && grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" && test_cmp log_wo_bloom log_w_bloom && @@ -142,7 +142,7 @@ test_expect_success 'setup - add commit-graph to the chain with Bloom filters' ' test_bloom_filters_used_when_some_filters_are_missing () { log_args=$1 - bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":8,\"definitely_not\":6" + bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"maybe\":8,\"definitely_not\":6" setup "$log_args" && grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" && test_cmp log_wo_bloom log_w_bloom @@ -152,4 +152,24 @@ test_expect_success 'Use Bloom filters if they exist in the latest but not all c test_bloom_filters_used_when_some_filters_are_missing "-- A/B" ' +test_expect_success 'correctly report changes over limit' ' + git init 513changes && + ( + cd 513changes && + for i in $(test_seq 1 513) + do + echo $i >file$i.txt || return 1 + done && + git add . && + git commit -m "files" && + git commit-graph write --reachable --changed-paths && + for i in $(test_seq 1 513) + do + git -c core.commitGraph=false log -- file$i.txt >expect && + git log -- file$i.txt >actual && + test_cmp expect actual || return 1 + done + ) +' + test_done \ No newline at end of file From 0087a87ba8fc69b27dde0183ec24ade367a4aa5b Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Wed, 1 Jul 2020 13:27:24 +0000 Subject: [PATCH 04/10] commit-graph: persist existence of changed-paths The changed-path Bloom filters were released in v2.27.0, but have a significant drawback. A user can opt-in to writing the changed-path filters using the "--changed-paths" option to "git commit-graph write" but the next write will drop the filters unless that option is specified. This becomes even more important when considering the interaction with gc.writeCommitGraph (on by default) or fetch.writeCommitGraph (part of features.experimental). These config options trigger commit-graph writes that the user did not signal, and hence there is no --changed-paths option available. Allow a user that opts-in to the changed-path filters to persist the property of "my commit-graph has changed-path filters" automatically. A user can drop filters using the --no-changed-paths option. In the process, we need to be extremely careful to match the Bloom filter settings as specified by the commit-graph. This will allow future versions of Git to customize these settings, and the version with this change will persist those settings as commit-graphs are rewritten on top. Use the trace2 API to signal the settings used during the write, and check that output in a test after manually adjusting the correct bytes in the commit-graph file. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- Documentation/git-commit-graph.txt | 5 +++- builtin/commit-graph.c | 5 +++- commit-graph.c | 45 ++++++++++++++++++++++++++++-- commit-graph.h | 1 + t/t4216-log-bloom.sh | 17 ++++++++++- 5 files changed, 67 insertions(+), 6 deletions(-) diff --git a/Documentation/git-commit-graph.txt b/Documentation/git-commit-graph.txt index f4b13c005b..369b222b08 100644 --- a/Documentation/git-commit-graph.txt +++ b/Documentation/git-commit-graph.txt @@ -60,7 +60,10 @@ existing commit-graph file. With the `--changed-paths` option, compute and write information about the paths changed between a commit and it's first parent. This operation can take a while on large repositories. It provides significant performance gains -for getting history of a directory or a file with `git log -- `. +for getting history of a directory or a file with `git log -- `. If +this option is given, future commit-graph writes will automatically assume +that this option was intended. Use `--no-changed-paths` to stop storing this +data. + With the `--split` option, write the commit-graph as a chain of multiple commit-graph files stored in `/info/commit-graphs`. The new commits diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c index 59009837dc..ff7b177c33 100644 --- a/builtin/commit-graph.c +++ b/builtin/commit-graph.c @@ -151,6 +151,7 @@ static int graph_write(int argc, const char **argv) }; opts.progress = isatty(2); + opts.enable_changed_paths = -1; split_opts.size_multiple = 2; split_opts.max_commits = 0; split_opts.expire_time = 0; @@ -171,7 +172,9 @@ static int graph_write(int argc, const char **argv) flags |= COMMIT_GRAPH_WRITE_SPLIT; if (opts.progress) flags |= COMMIT_GRAPH_WRITE_PROGRESS; - if (opts.enable_changed_paths || + if (!opts.enable_changed_paths) + flags |= COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS; + if (opts.enable_changed_paths == 1 || git_env_bool(GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS, 0)) flags |= COMMIT_GRAPH_WRITE_BLOOM_FILTERS; diff --git a/commit-graph.c b/commit-graph.c index 50ce039a53..6762704324 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -16,6 +16,8 @@ #include "progress.h" #include "bloom.h" #include "commit-slab.h" +#include "json-writer.h" +#include "trace2.h" void git_test_write_commit_graph_or_die(void) { @@ -1108,6 +1110,21 @@ static void write_graph_chunk_bloom_indexes(struct hashfile *f, stop_progress(&progress); } +static void trace2_bloom_filter_settings(struct write_commit_graph_context *ctx) +{ + struct json_writer jw = JSON_WRITER_INIT; + + jw_object_begin(&jw, 0); + jw_object_intmax(&jw, "hash_version", ctx->bloom_settings->hash_version); + jw_object_intmax(&jw, "num_hashes", ctx->bloom_settings->num_hashes); + jw_object_intmax(&jw, "bits_per_entry", ctx->bloom_settings->bits_per_entry); + jw_end(&jw); + + trace2_data_json("bloom", ctx->r, "settings", &jw); + + jw_release(&jw); +} + static void write_graph_chunk_bloom_data(struct hashfile *f, struct write_commit_graph_context *ctx) { @@ -1116,6 +1133,8 @@ static void write_graph_chunk_bloom_data(struct hashfile *f, struct progress *progress = NULL; int i = 0; + trace2_bloom_filter_settings(ctx); + if (ctx->report_progress) progress = start_delayed_progress( _("Writing changed paths Bloom filters data"), @@ -1547,9 +1566,15 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) int num_chunks = 3; uint64_t chunk_offset; struct object_id file_hash; - const struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS; + struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS; - ctx->bloom_settings = &bloom_settings; + if (!ctx->bloom_settings) { + bloom_settings.bits_per_entry = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_BITS_PER_ENTRY", + bloom_settings.bits_per_entry); + bloom_settings.num_hashes = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_NUM_HASHES", + bloom_settings.num_hashes); + ctx->bloom_settings = &bloom_settings; + } if (ctx->split) { struct strbuf tmp_file = STRBUF_INIT; @@ -1974,9 +1999,23 @@ int write_commit_graph(struct object_directory *odb, ctx->split = flags & COMMIT_GRAPH_WRITE_SPLIT ? 1 : 0; ctx->check_oids = flags & COMMIT_GRAPH_WRITE_CHECK_OIDS ? 1 : 0; ctx->split_opts = split_opts; - ctx->changed_paths = flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS ? 1 : 0; ctx->total_bloom_filter_data_size = 0; + if (flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS) + ctx->changed_paths = 1; + if (!(flags & COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS)) { + struct commit_graph *g; + prepare_commit_graph_one(ctx->r, ctx->odb); + + g = ctx->r->objects->commit_graph; + + /* We have changed-paths already. Keep them in the next graph */ + if (g && g->chunk_bloom_data) { + ctx->changed_paths = 1; + ctx->bloom_settings = g->bloom_filter_settings; + } + } + if (ctx->split) { struct commit_graph *g; prepare_commit_graph(ctx->r); diff --git a/commit-graph.h b/commit-graph.h index f0fb13e3f2..45b1e5bca3 100644 --- a/commit-graph.h +++ b/commit-graph.h @@ -96,6 +96,7 @@ enum commit_graph_write_flags { /* Make sure that each OID in the input is a valid commit OID. */ COMMIT_GRAPH_WRITE_CHECK_OIDS = (1 << 3), COMMIT_GRAPH_WRITE_BLOOM_FILTERS = (1 << 4), + COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS = (1 << 5), }; struct split_commit_graph_opts { diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh index 2761208e74..52ad998f9e 100755 --- a/t/t4216-log-bloom.sh +++ b/t/t4216-log-bloom.sh @@ -126,7 +126,7 @@ test_expect_success 'setup - add commit-graph to the chain without Bloom filters test_commit c14 A/anotherFile2 && test_commit c15 A/B/anotherFile2 && test_commit c16 A/B/C/anotherFile2 && - GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0 git commit-graph write --reachable --split && + git commit-graph write --reachable --split --no-changed-paths && test_line_count = 2 .git/objects/info/commit-graphs/commit-graph-chain ' @@ -152,6 +152,21 @@ test_expect_success 'Use Bloom filters if they exist in the latest but not all c test_bloom_filters_used_when_some_filters_are_missing "-- A/B" ' +test_expect_success 'persist filter settings' ' + test_when_finished rm -rf .git/objects/info/commit-graph* && + rm -rf .git/objects/info/commit-graph* && + GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \ + GIT_TRACE2_EVENT_NESTING=5 \ + GIT_TEST_BLOOM_SETTINGS_NUM_HASHES=9 \ + GIT_TEST_BLOOM_SETTINGS_BITS_PER_ENTRY=15 \ + git commit-graph write --reachable --changed-paths && + grep "{\"hash_version\":1,\"num_hashes\":9,\"bits_per_entry\":15}" trace2.txt && + GIT_TRACE2_EVENT="$(pwd)/trace2-auto.txt" \ + GIT_TRACE2_EVENT_NESTING=5 \ + git commit-graph write --reachable --changed-paths && + grep "{\"hash_version\":1,\"num_hashes\":9,\"bits_per_entry\":15}" trace2-auto.txt +' + test_expect_success 'correctly report changes over limit' ' git init 513changes && ( From 9bab081dfab8ebb4fcdf0d7e8a692626aeb29358 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= Date: Wed, 1 Jul 2020 13:27:25 +0000 Subject: [PATCH 05/10] commit-graph: unify the signatures of all write_graph_chunk_*() functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the write_graph_chunk_*() helper functions to have the same signature: - Return an int error code from all these functions. write_graph_chunk_base() already has an int error code, now the others will have one, too, but since they don't indicate any error, they will always return 0. - Drop the hash size parameter of write_graph_chunk_oids() and write_graph_chunk_data(); its value can be read directly from 'the_hash_algo' inside these functions as well. This opens up the possibility for further cleanups and foolproofing in the following two patches. Helped-by: René Scharfe Signed-off-by: SZEDER Gábor Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- commit-graph.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 6762704324..1a6d26f864 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -891,8 +891,8 @@ struct write_commit_graph_context { const struct bloom_filter_settings *bloom_settings; }; -static void write_graph_chunk_fanout(struct hashfile *f, - struct write_commit_graph_context *ctx) +static int write_graph_chunk_fanout(struct hashfile *f, + struct write_commit_graph_context *ctx) { int i, count = 0; struct commit **list = ctx->commits.list; @@ -913,17 +913,21 @@ static void write_graph_chunk_fanout(struct hashfile *f, hashwrite_be32(f, count); } + + return 0; } -static void write_graph_chunk_oids(struct hashfile *f, int hash_len, - struct write_commit_graph_context *ctx) +static int write_graph_chunk_oids(struct hashfile *f, + struct write_commit_graph_context *ctx) { struct commit **list = ctx->commits.list; int count; for (count = 0; count < ctx->commits.nr; count++, list++) { display_progress(ctx->progress, ++ctx->progress_cnt); - hashwrite(f, (*list)->object.oid.hash, (int)hash_len); + hashwrite(f, (*list)->object.oid.hash, the_hash_algo->rawsz); } + + return 0; } static const unsigned char *commit_to_sha1(size_t index, void *table) @@ -932,8 +936,8 @@ static const unsigned char *commit_to_sha1(size_t index, void *table) return commits[index]->object.oid.hash; } -static void write_graph_chunk_data(struct hashfile *f, int hash_len, - struct write_commit_graph_context *ctx) +static int write_graph_chunk_data(struct hashfile *f, + struct write_commit_graph_context *ctx) { struct commit **list = ctx->commits.list; struct commit **last = ctx->commits.list + ctx->commits.nr; @@ -950,7 +954,7 @@ static void write_graph_chunk_data(struct hashfile *f, int hash_len, die(_("unable to parse commit %s"), oid_to_hex(&(*list)->object.oid)); tree = get_commit_tree_oid(*list); - hashwrite(f, tree->hash, hash_len); + hashwrite(f, tree->hash, the_hash_algo->rawsz); parent = (*list)->parents; @@ -1030,10 +1034,12 @@ static void write_graph_chunk_data(struct hashfile *f, int hash_len, list++; } + + return 0; } -static void write_graph_chunk_extra_edges(struct hashfile *f, - struct write_commit_graph_context *ctx) +static int write_graph_chunk_extra_edges(struct hashfile *f, + struct write_commit_graph_context *ctx) { struct commit **list = ctx->commits.list; struct commit **last = ctx->commits.list + ctx->commits.nr; @@ -1082,10 +1088,12 @@ static void write_graph_chunk_extra_edges(struct hashfile *f, list++; } + + return 0; } -static void write_graph_chunk_bloom_indexes(struct hashfile *f, - struct write_commit_graph_context *ctx) +static int write_graph_chunk_bloom_indexes(struct hashfile *f, + struct write_commit_graph_context *ctx) { struct commit **list = ctx->commits.list; struct commit **last = ctx->commits.list + ctx->commits.nr; @@ -1108,6 +1116,7 @@ static void write_graph_chunk_bloom_indexes(struct hashfile *f, } stop_progress(&progress); + return 0; } static void trace2_bloom_filter_settings(struct write_commit_graph_context *ctx) @@ -1125,8 +1134,8 @@ static void trace2_bloom_filter_settings(struct write_commit_graph_context *ctx) jw_release(&jw); } -static void write_graph_chunk_bloom_data(struct hashfile *f, - struct write_commit_graph_context *ctx) +static int write_graph_chunk_bloom_data(struct hashfile *f, + struct write_commit_graph_context *ctx) { struct commit **list = ctx->commits.list; struct commit **last = ctx->commits.list + ctx->commits.nr; @@ -1155,6 +1164,7 @@ static void write_graph_chunk_bloom_data(struct hashfile *f, } stop_progress(&progress); + return 0; } static int oid_compare(const void *_a, const void *_b) @@ -1671,8 +1681,8 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) num_chunks * ctx->commits.nr); } write_graph_chunk_fanout(f, ctx); - write_graph_chunk_oids(f, hashsz, ctx); - write_graph_chunk_data(f, hashsz, ctx); + write_graph_chunk_oids(f, ctx); + write_graph_chunk_data(f, ctx); if (ctx->num_extra_edges) write_graph_chunk_extra_edges(f, ctx); if (ctx->changed_paths) { From 17e6275fc9f5763fb89a1219aceaf14adf340a2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= Date: Wed, 1 Jul 2020 13:27:26 +0000 Subject: [PATCH 06/10] commit-graph: simplify chunk writes into loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In write_commit_graph_file() we now have one block of code filling the array of 'struct chunk_info' with the IDs and sizes of chunks to be written, and an other block of code calling the functions responsible for writing individual chunks. In case of optional chunks like Extra Edge List an Base Graphs List there is also a condition checking whether that chunk is necessary/desired, and that same condition is repeated in both blocks of code. Other, newer chunks have similar optional conditions. Eliminate these repeated conditions by storing the function pointers responsible for writing individual chunks in the 'struct chunk_info' array as well, and calling them in a loop to write the commit-graph file. This will open up the possibility for a bit of foolproofing in the following patch. Helped-by: René Scharfe Signed-off-by: SZEDER Gábor Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- commit-graph.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 1a6d26f864..2b26a9dad3 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -1559,9 +1559,13 @@ static int write_graph_chunk_base(struct hashfile *f, return 0; } +typedef int (*chunk_write_fn)(struct hashfile *f, + struct write_commit_graph_context *ctx); + struct chunk_info { uint32_t id; uint64_t size; + chunk_write_fn write_fn; }; static int write_commit_graph_file(struct write_commit_graph_context *ctx) @@ -1624,27 +1628,34 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) chunks[0].id = GRAPH_CHUNKID_OIDFANOUT; chunks[0].size = GRAPH_FANOUT_SIZE; + chunks[0].write_fn = write_graph_chunk_fanout; chunks[1].id = GRAPH_CHUNKID_OIDLOOKUP; chunks[1].size = hashsz * ctx->commits.nr; + chunks[1].write_fn = write_graph_chunk_oids; chunks[2].id = GRAPH_CHUNKID_DATA; chunks[2].size = (hashsz + 16) * ctx->commits.nr; + chunks[2].write_fn = write_graph_chunk_data; if (ctx->num_extra_edges) { chunks[num_chunks].id = GRAPH_CHUNKID_EXTRAEDGES; chunks[num_chunks].size = 4 * ctx->num_extra_edges; + chunks[num_chunks].write_fn = write_graph_chunk_extra_edges; num_chunks++; } if (ctx->changed_paths) { chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMINDEXES; chunks[num_chunks].size = sizeof(uint32_t) * ctx->commits.nr; + chunks[num_chunks].write_fn = write_graph_chunk_bloom_indexes; num_chunks++; chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMDATA; chunks[num_chunks].size = sizeof(uint32_t) * 3 + ctx->total_bloom_filter_data_size; + chunks[num_chunks].write_fn = write_graph_chunk_bloom_data; num_chunks++; } if (ctx->num_commit_graphs_after > 1) { chunks[num_chunks].id = GRAPH_CHUNKID_BASE; chunks[num_chunks].size = hashsz * (ctx->num_commit_graphs_after - 1); + chunks[num_chunks].write_fn = write_graph_chunk_base; num_chunks++; } @@ -1680,19 +1691,12 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) progress_title.buf, num_chunks * ctx->commits.nr); } - write_graph_chunk_fanout(f, ctx); - write_graph_chunk_oids(f, ctx); - write_graph_chunk_data(f, ctx); - if (ctx->num_extra_edges) - write_graph_chunk_extra_edges(f, ctx); - if (ctx->changed_paths) { - write_graph_chunk_bloom_indexes(f, ctx); - write_graph_chunk_bloom_data(f, ctx); - } - if (ctx->num_commit_graphs_after > 1 && - write_graph_chunk_base(f, ctx)) { - return -1; + + for (i = 0; i < num_chunks; i++) { + if (chunks[i].write_fn(f, ctx)) + return -1; } + stop_progress(&ctx->progress); strbuf_release(&progress_title); From 2dd4fed92715fddcefc0223b79e51af2798e921e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= Date: Wed, 1 Jul 2020 13:27:27 +0000 Subject: [PATCH 07/10] commit-graph: check chunk sizes after writing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In my experience while experimenting with new commit-graph chunks, early versions of the corresponding new write_commit_graph_my_chunk() functions are, sadly but not surprisingly, often buggy, and write more or less data than they are supposed to, especially if the chunk size is not directly proportional to the number of commits. This then causes all kinds of issues when reading such a bogus commit-graph file, raising the question of whether the writing or the reading part happens to be buggy this time. Let's catch such issues early, already when writing the commit-graph file, and check that each write_graph_chunk_*() function wrote the amount of data that it was expected to, and what has been encoded in the Chunk Lookup table. Now that all commit-graph chunks are written in a loop we can do this check in a single place for all chunks, and any chunks added in the future will get checked as well. Helped-by: René Scharfe Signed-off-by: SZEDER Gábor Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- commit-graph.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/commit-graph.c b/commit-graph.c index 2b26a9dad3..6752916c1a 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -1693,8 +1693,15 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) } for (i = 0; i < num_chunks; i++) { + uint64_t start_offset = f->total + f->offset; + if (chunks[i].write_fn(f, ctx)) return -1; + + if (f->total + f->offset != start_offset + chunks[i].size) + BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead", + chunks[i].size, chunks[i].id, + f->total + f->offset - start_offset); } stop_progress(&ctx->progress); From dc8e95ba7c7a3b2f2d7cc82a1f66d19e10874218 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Wed, 1 Jul 2020 13:27:28 +0000 Subject: [PATCH 08/10] revision.c: fix whitespace Here, four spaces were used instead of tab characters. Reported-by: Taylor Blau Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- revision.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/revision.c b/revision.c index 7339750af1..ddf09ab0aa 100644 --- a/revision.c +++ b/revision.c @@ -695,11 +695,11 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs) /* remove single trailing slash from path, if needed */ if (pi->match[last_index] == '/') { - path_alloc = xstrdup(pi->match); - path_alloc[last_index] = '\0'; - path = path_alloc; + path_alloc = xstrdup(pi->match); + path_alloc[last_index] = '\0'; + path = path_alloc; } else - path = pi->match; + path = pi->match; len = strlen(path); From f3c2a36810d5322747ad573f7e2b8404d4f67ead Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Wed, 1 Jul 2020 13:27:29 +0000 Subject: [PATCH 09/10] revision: empty pathspecs should not use Bloom filters The prepare_to_use_bloom_filter() method was not intended to be called on an empty pathspec. However, 'git log -- .' and 'git log' are subtly different: the latter reports all commits while the former will simplify commits that do not change the root tree. This means that the path used to construct the bloom_key might be empty, and that value is not added to the Bloom filter during construction. That means that the results are likely incorrect! To resolve the issue, be careful about the length of the path and stop filling Bloom filters. To be completely sure we do not use them, drop the pointer to the bloom_filter_settings from the commit-graph. That allows our test to look at the trace2 logs to verify no Bloom filter statistics are reported. Signed-off-by: Taylor Blau Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- revision.c | 4 ++++ t/t4216-log-bloom.sh | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/revision.c b/revision.c index ddf09ab0aa..667ca36e1c 100644 --- a/revision.c +++ b/revision.c @@ -702,6 +702,10 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs) path = pi->match; len = strlen(path); + if (!len) { + revs->bloom_filter_settings = NULL; + return; + } revs->bloom_key = xmalloc(sizeof(struct bloom_key)); fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings); diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh index 52ad998f9e..29338f36bf 100755 --- a/t/t4216-log-bloom.sh +++ b/t/t4216-log-bloom.sh @@ -112,6 +112,10 @@ test_expect_success 'git log -- multiple path specs does not use Bloom filters' test_bloom_filters_not_used "-- file4 A/file1" ' +test_expect_success 'git log -- "." pathspec at root does not use Bloom filters' ' + test_bloom_filters_not_used "-- ." +' + test_expect_success 'git log with wildcard that resolves to a single path uses Bloom filters' ' test_bloom_filters_used "-- *4" && test_bloom_filters_used "-- *renamed" From c525ce95b46b34f344c360dbef036cec3ea08e53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= Date: Wed, 1 Jul 2020 13:27:30 +0000 Subject: [PATCH 10/10] commit-graph: check all leading directories in changed path Bloom filters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The file 'dir/subdir/file' can only be modified if its leading directories 'dir' and 'dir/subdir' are modified as well. So when checking modified path Bloom filters looking for commits modifying a path with multiple path components, then check not only the full path in the Bloom filters, but all its leading directories as well. Take care to check these paths in "deepest first" order, because it's the full path that is least likely to be modified, and the Bloom filter queries can short circuit sooner. This can significantly reduce the average false positive rate, by about an order of magnitude or three(!), and can further speed up pathspec-limited revision walks. The table below compares the average false positive rate and runtime of git rev-list HEAD -- "$path" before and after this change for 5000+ randomly* selected paths from each repository: Average false Average Average positive rate runtime runtime before after before after difference ------------------------------------------------------------------ git 3.220% 0.7853% 0.0558s 0.0387s -30.6% linux 2.453% 0.0296% 0.1046s 0.0766s -26.8% tensorflow 2.536% 0.6977% 0.0594s 0.0420s -29.2% *Path selection was done with the following pipeline: git ls-tree -r --name-only HEAD | sort -R | head -n 5000 The improvements in runtime are much smaller than the improvements in average false positive rate, as we are clearly reaching diminishing returns here. However, all these timings depend on that accessing tree objects is reasonably fast (warm caches). If we had a partial clone and the tree objects had to be fetched from a promisor remote, e.g.: $ git clone --filter=tree:0 --bare file://.../webkit.git webkit.notrees.git $ git -C webkit.git -c core.modifiedPathBloomFilters=1 \ commit-graph write --reachable $ cp webkit.git/objects/info/commit-graph webkit.notrees.git/objects/info/ $ git -C webkit.notrees.git -c core.modifiedPathBloomFilters=1 \ rev-list HEAD -- "$path" then checking all leading path component can reduce the runtime from over an hour to a few seconds (and this is with the clone and the promisor on the same machine). This adjusts the tracing values in t4216-log-bloom.sh, which provides a concrete way to notice the improvement. Helped-by: Taylor Blau Helped-by: René Scharfe Signed-off-by: SZEDER Gábor Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- revision.c | 46 +++++++++++++++++++++++++++++++++++--------- revision.h | 6 ++++-- t/t4216-log-bloom.sh | 2 +- 3 files changed, 42 insertions(+), 12 deletions(-) diff --git a/revision.c b/revision.c index 667ca36e1c..b9118001f9 100644 --- a/revision.c +++ b/revision.c @@ -668,9 +668,10 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs) { struct pathspec_item *pi; char *path_alloc = NULL; - const char *path; + const char *path, *p; int last_index; - int len; + size_t len; + int path_component_nr = 1; if (!revs->commits) return; @@ -707,8 +708,33 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs) return; } - revs->bloom_key = xmalloc(sizeof(struct bloom_key)); - fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings); + p = path; + while (*p) { + /* + * At this point, the path is normalized to use Unix-style + * path separators. This is required due to how the + * changed-path Bloom filters store the paths. + */ + if (*p == '/') + path_component_nr++; + p++; + } + + revs->bloom_keys_nr = path_component_nr; + ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr); + + fill_bloom_key(path, len, &revs->bloom_keys[0], + revs->bloom_filter_settings); + path_component_nr = 1; + + p = path + len - 1; + while (p > path) { + if (*p == '/') + fill_bloom_key(path, p - path, + &revs->bloom_keys[path_component_nr++], + revs->bloom_filter_settings); + p--; + } if (trace2_is_enabled() && !bloom_filter_atexit_registered) { atexit(trace2_bloom_filter_statistics_atexit); @@ -722,7 +748,7 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs, struct commit *commit) { struct bloom_filter *filter; - int result; + int result = 1, j; if (!revs->repo->objects->commit_graph) return -1; @@ -737,9 +763,11 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs, return -1; } - result = bloom_filter_contains(filter, - revs->bloom_key, - revs->bloom_filter_settings); + for (j = 0; result && j < revs->bloom_keys_nr; j++) { + result = bloom_filter_contains(filter, + &revs->bloom_keys[j], + revs->bloom_filter_settings); + } if (result) count_bloom_filter_maybe++; @@ -779,7 +807,7 @@ static int rev_compare_tree(struct rev_info *revs, return REV_TREE_SAME; } - if (revs->bloom_key && !nth_parent) { + if (revs->bloom_keys_nr && !nth_parent) { bloom_ret = check_maybe_different_in_bloom_filter(revs, commit); if (bloom_ret == 0) diff --git a/revision.h b/revision.h index 7c026fe41f..abbfb4ab59 100644 --- a/revision.h +++ b/revision.h @@ -295,8 +295,10 @@ struct rev_info { struct topo_walk_info *topo_walk_info; /* Commit graph bloom filter fields */ - /* The bloom filter key for the pathspec */ - struct bloom_key *bloom_key; + /* The bloom filter key(s) for the pathspec */ + struct bloom_key *bloom_keys; + int bloom_keys_nr; + /* * The bloom filter settings used to generate the key. * This is loaded from the commit-graph being used. diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh index 29338f36bf..4892364e74 100755 --- a/t/t4216-log-bloom.sh +++ b/t/t4216-log-bloom.sh @@ -146,7 +146,7 @@ test_expect_success 'setup - add commit-graph to the chain with Bloom filters' ' test_bloom_filters_used_when_some_filters_are_missing () { log_args=$1 - bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"maybe\":8,\"definitely_not\":6" + bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"maybe\":6,\"definitely_not\":8" setup "$log_args" && grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" && test_cmp log_wo_bloom log_w_bloom