From 35a9f1e99c5d31635bb78a4f2d498e72c04fc471 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:23 +0000
Subject: [PATCH 01/10] tree-walk.c: don't match submodule entries for
 'submod/anything'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Submodules should be handled the same as regular directories with
respect to the presence of a trailing slash, i.e. commands like:

  git diff rev1 rev2 -- $path
  git rev-list HEAD -- $path

should produce the same output whether $path is 'submod' or 'submod/'.
This has been fixed in commit 74b4f7f277 (tree-walk.c: ignore trailing
slash on submodule in tree_entry_interesting(), 2014-01-23).

Unfortunately, that commit had the unintended side effect to handle
'submod/anything' the same as 'submod' and 'submod/' as well, e.g.:

  $ git log --oneline --name-only -- sha1collisiondetection/whatever
  4125f78222 sha1dc: update from upstream
  sha1collisiondetection
  07a20f569b Makefile: fix unaligned loads in sha1dc with UBSan
  sha1collisiondetection
  23e37f8e9d sha1dc: update from upstream
  sha1collisiondetection
  86cfd61e6b sha1dc: optionally use sha1collisiondetection as a submodule
  sha1collisiondetection

Fix this by rejecting submodules as partial pathnames when their
trailing slash is followed by anything.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 t/t4010-diff-pathspec.sh | 4 +++-
 tree-walk.c              | 9 ++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/t/t4010-diff-pathspec.sh b/t/t4010-diff-pathspec.sh
index e5ca359edf..65cc703c65 100755
--- a/t/t4010-diff-pathspec.sh
+++ b/t/t4010-diff-pathspec.sh
@@ -125,7 +125,9 @@ test_expect_success 'setup submodules' '
 test_expect_success 'diff-tree ignores trailing slash on submodule path' '
 	git diff --name-only HEAD^ HEAD submod >expect &&
 	git diff --name-only HEAD^ HEAD submod/ >actual &&
-	test_cmp expect actual
+	test_cmp expect actual &&
+	git diff --name-only HEAD^ HEAD -- submod/whatever >actual &&
+	test_must_be_empty actual
 '
 
 test_expect_success 'diff multiple wildcard pathspecs' '
diff --git a/tree-walk.c b/tree-walk.c
index bb0ad34c54..0160294712 100644
--- a/tree-walk.c
+++ b/tree-walk.c
@@ -851,7 +851,14 @@ static int match_entry(const struct pathspec_item *item,
 	if (matchlen > pathlen) {
 		if (match[pathlen] != '/')
 			return 0;
-		if (!S_ISDIR(entry->mode) && !S_ISGITLINK(entry->mode))
+		/*
+		 * Reject non-directories as partial pathnames, except
+		 * when match is a submodule with a trailing slash and
+		 * nothing else (to handle 'submod/' and 'submod'
+		 * uniformly).
+		 */
+		if (!S_ISDIR(entry->mode) &&
+		    (!S_ISGITLINK(entry->mode) || matchlen > pathlen + 1))
 			return 0;
 	}
 

From cb9daf16db0e9935179fd995785e94c996267268 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:24 +0000
Subject: [PATCH 02/10] commit-graph: fix parsing the Chunk Lookup table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The commit-graph file format specifies that the chunks may be in any
order.  However, if the OID Lookup chunk happens to be the last one in
the file, then any command attempting to access the commit-graph data
will fail with:

  fatal: invalid commit position. commit-graph is likely corrupt

In this case the error is wrong, the commit-graph file does conform to
the specification, but the parsing of the Chunk Lookup table is a bit
buggy, and leaves the field holding the number of commits in the
commit-graph zero-initialized.

The number of commits in the commit-graph is determined while parsing
the Chunk Lookup table, by dividing the size of the OID Lookup chunk
with the hash size.  However, the Chunk Lookup table doesn't actually
store the size of the chunks, but it stores their starting offset.
Consequently, the size of a chunk can only be calculated by
subtracting the starting offsets of that chunk from the offset of the
subsequent chunk, or in case of the last chunk from the offset
recorded in the terminating label.  This is currenly implemented in a
bit complicated way: as we iterate over the entries of the Chunk
Lookup table, we check the ID of each chunk and store its starting
offset, then we check the ID of the last seen chunk and calculate its
size using its previously saved offset if necessary (at the moment
it's only necessary for the OID Lookup chunk).  Alas, while parsing
the Chunk Lookup table we only interate through the "real" chunks, but
never look at the terminating label, thus don't even check whether
it's necessary to calulate the size of the last chunk.  Consequently,
if the OID Lookup chunk is the last one, then we don't calculate its
size and turn don't run the piece of code determining the number of
commits in the commit graph, leaving the field holding that number
unchanged (i.e. zero-initialized), eventually triggering the sanity
check in load_oid_from_graph().

Fix this by iterating through all entries in the Chunk Lookup table,
including the terminating label.

Note that this is the minimal fix, suitable for the maintenance track.
A better fix would be to simplify how the chunk sizes are calculated,
but that is a more invasive change, less suitable for 'maint', so that
will be done in later patches.

This additional flexibility of scanning more chunks breaks a test for
"git commit-graph verify" so alter that test to mutate the commit-graph
to have an even lower chunk count.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c          | 2 +-
 t/t5318-commit-graph.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 196e817a84..7807d94562 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -277,7 +277,7 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 	last_chunk_id = 0;
 	last_chunk_offset = 8;
 	chunk_lookup = data + 8;
-	for (i = 0; i < graph->num_chunks; i++) {
+	for (i = 0; i <= graph->num_chunks; i++) {
 		uint32_t chunk_id;
 		uint64_t chunk_offset;
 		int chunk_repeated = 0;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 18304a65e4..79e7fbcd40 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -488,7 +488,7 @@ test_expect_success 'detect bad hash version' '
 '
 
 test_expect_success 'detect low chunk count' '
-	corrupt_graph_and_verify $GRAPH_BYTE_CHUNK_COUNT "\02" \
+	corrupt_graph_and_verify $GRAPH_BYTE_CHUNK_COUNT "\01" \
 		"missing the .* chunk"
 '
 

From 6141cdfdcbe9e3edf25467582c5f32658ae79f40 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:25 +0000
Subject: [PATCH 03/10] commit-graph-format.txt: all multi-byte numbers are in
 network byte order
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The commit-graph format specifies that "All 4-byte numbers are in
network order", but the commit-graph contains 8-byte integers as well
(file offsets in the Chunk Lookup table), and their byte order is
unspecified.

Clarify that all multi-byte integers are in network byte order.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 Documentation/technical/commit-graph-format.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/technical/commit-graph-format.txt b/Documentation/technical/commit-graph-format.txt
index 1beef17182..440541045d 100644
--- a/Documentation/technical/commit-graph-format.txt
+++ b/Documentation/technical/commit-graph-format.txt
@@ -32,7 +32,7 @@ the body into "chunks" and provide a binary lookup table at the beginning
 of the body. The header includes certain values, such as number of chunks
 and hash type.
 
-All 4-byte numbers are in network order.
+All multi-byte numbers are in network byte order.
 
 HEADER:
 

From 1df15f8dee5c637c9013d11d5b8e72e189a04f06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:26 +0000
Subject: [PATCH 04/10] commit-slab: add a function to deep free entries on the
 slab
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

clear_##slabname() frees only the memory allocated for a commit slab
itself, but entries in the commit slab might own additional memory
outside the slab that should be freed as well.  We already have (at
least) one such commit slab, and this patch series is about to add one
more.

To free all additional memory owned by entries on the commit slab the
user of such a slab could iterate over all commits it knows about,
peek whether there is a valid entry associated with each commit, and
free the additional memory, if any.  Or it could rely on intimate
knowledge about the internals of the commit slab implementation, and
could itself iterate directly through all entries in the slab, and
free the additional memory.  Or it could just leak the additional
memory...

Introduce deep_clear_##slabname() to allow releasing memory owned by
commit slab entries by invoking the 'void free_fn(elemtype *ptr)'
function specified as parameter for each entry in the slab.

Use it in get_shallow_commits() in 'shallow.c' to replace an
open-coded iteration over a commit slab's entries.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-slab-decl.h |  1 +
 commit-slab-impl.h | 13 +++++++++++++
 commit-slab.h      | 10 ++++++++++
 shallow.c          | 14 +++++---------
 4 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/commit-slab-decl.h b/commit-slab-decl.h
index adc7b46c83..286164b7e2 100644
--- a/commit-slab-decl.h
+++ b/commit-slab-decl.h
@@ -32,6 +32,7 @@ struct slabname {							\
 void init_ ##slabname## _with_stride(struct slabname *s, unsigned stride); \
 void init_ ##slabname(struct slabname *s);				\
 void clear_ ##slabname(struct slabname *s);				\
+void deep_clear_ ##slabname(struct slabname *s, void (*free_fn)(elemtype *ptr)); \
 elemtype *slabname## _at_peek(struct slabname *s, const struct commit *c, int add_if_missing); \
 elemtype *slabname## _at(struct slabname *s, const struct commit *c);	\
 elemtype *slabname## _peek(struct slabname *s, const struct commit *c)
diff --git a/commit-slab-impl.h b/commit-slab-impl.h
index 5c0eb91a5d..557738df27 100644
--- a/commit-slab-impl.h
+++ b/commit-slab-impl.h
@@ -38,6 +38,19 @@ scope void clear_ ##slabname(struct slabname *s)			\
 	FREE_AND_NULL(s->slab);						\
 }									\
 									\
+scope void deep_clear_ ##slabname(struct slabname *s, void (*free_fn)(elemtype *)) \
+{									\
+	unsigned int i;							\
+	for (i = 0; i < s->slab_count; i++) {				\
+		unsigned int j;						\
+		if (!s->slab[i])					\
+			continue;					\
+		for (j = 0; j < s->slab_size; j++)			\
+			free_fn(&s->slab[i][j * s->stride]);		\
+	}								\
+	clear_ ##slabname(s);						\
+}									\
+									\
 scope elemtype *slabname## _at_peek(struct slabname *s,			\
 						  const struct commit *c, \
 						  int add_if_missing)   \
diff --git a/commit-slab.h b/commit-slab.h
index 05b3f2804e..8e72a30536 100644
--- a/commit-slab.h
+++ b/commit-slab.h
@@ -47,6 +47,16 @@
  *
  *   Call this function before the slab falls out of scope to avoid
  *   leaking memory.
+ *
+ * - void deep_clear_indegree(struct indegree *, void (*free_fn)(int*))
+ *
+ *   Empties the slab, similar to clear_indegree(), but in addition it
+ *   calls the given 'free_fn' for each slab entry to release any
+ *   additional memory that might be owned by the entry (but not the
+ *   entry itself!).
+ *   Note that 'free_fn' might be called even for entries for which no
+ *   indegree_at() call has been made; in this case 'free_fn' is invoked
+ *   with a pointer to a zero-initialized location.
  */
 
 #define define_commit_slab(slabname, elemtype) \
diff --git a/shallow.c b/shallow.c
index 7fd04afed1..c4ac8a7327 100644
--- a/shallow.c
+++ b/shallow.c
@@ -84,6 +84,10 @@ int is_repository_shallow(struct repository *r)
  * supports a "valid" flag.
  */
 define_commit_slab(commit_depth, int *);
+static void free_depth_in_slab(int **ptr)
+{
+	FREE_AND_NULL(*ptr);
+}
 struct commit_list *get_shallow_commits(struct object_array *heads, int depth,
 		int shallow_flag, int not_shallow_flag)
 {
@@ -150,15 +154,7 @@ struct commit_list *get_shallow_commits(struct object_array *heads, int depth,
 			}
 		}
 	}
-	for (i = 0; i < depths.slab_count; i++) {
-		int j;
-
-		if (!depths.slab[i])
-			continue;
-		for (j = 0; j < depths.slab_size; j++)
-			free(depths.slab[i][j]);
-	}
-	clear_commit_depth(&depths);
+	deep_clear_commit_depth(&depths, free_depth_in_slab);
 
 	return result;
 }

From 0ee3cb888dc4da900b2e701ef8106205d8963ffb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:27 +0000
Subject: [PATCH 05/10] diff.h: drop diff_tree_oid() & friends' return value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ll_diff_tree_oid() has only ever returned 0 [1], so it's return value
is basically useless.  It's only caller diff_tree_oid() has only ever
returned the return value of ll_diff_tree_oid() as-is [2], so its
return value is just as useless.  Most of diff_tree_oid()'s callers
simply ignore its return value, except:

  - diff_root_tree_oid() is a thin wrapper around diff_tree_oid() and
    returns with its return value, but all of diff_root_tree_oid()'s
    callers ignore its return value.

  - rev_compare_tree() and rev_same_tree_as_empty() do look at the
    return value in a condition, but, since the return value is always
    0, the former's < 0 condition is never fulfilled, while the
    latter's >= 0 condition is always fulfilled.

So let's drop the return value of ll_diff_tree_oid(), diff_tree_oid()
and diff_root_tree_oid(), and drop those conditions from
rev_compare_tree() and rev_same_tree_as_empty() as well.

[1] ll_diff_tree_oid() and its ancestors have been returning only 0
    ever since it was introduced as diff_tree() in 9174026cfe (Add
    "diff-tree" program to show which files have changed between two
    trees., 2005-04-09).
[2] diff_tree_oid() traces back to diff-tree.c:main() in 9174026cfe as
    well.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 diff.h      | 10 +++++-----
 revision.c  |  9 +++------
 tree-diff.c | 30 ++++++++++++++----------------
 3 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/diff.h b/diff.h
index 9443dc1b00..e0c0af6286 100644
--- a/diff.h
+++ b/diff.h
@@ -431,11 +431,11 @@ struct combine_diff_path *diff_tree_paths(
 	struct combine_diff_path *p, const struct object_id *oid,
 	const struct object_id **parents_oid, int nparent,
 	struct strbuf *base, struct diff_options *opt);
-int diff_tree_oid(const struct object_id *old_oid,
-		  const struct object_id *new_oid,
-		  const char *base, struct diff_options *opt);
-int diff_root_tree_oid(const struct object_id *new_oid, const char *base,
-		       struct diff_options *opt);
+void diff_tree_oid(const struct object_id *old_oid,
+		   const struct object_id *new_oid,
+		   const char *base, struct diff_options *opt);
+void diff_root_tree_oid(const struct object_id *new_oid, const char *base,
+			struct diff_options *opt);
 
 struct combine_diff_path {
 	struct combine_diff_path *next;
diff --git a/revision.c b/revision.c
index cbf4b61aa6..c644c66091 100644
--- a/revision.c
+++ b/revision.c
@@ -791,9 +791,7 @@ static int rev_compare_tree(struct rev_info *revs,
 
 	tree_difference = REV_TREE_SAME;
 	revs->pruning.flags.has_changes = 0;
-	if (diff_tree_oid(&t1->object.oid, &t2->object.oid, "",
-			   &revs->pruning) < 0)
-		return REV_TREE_DIFFERENT;
+	diff_tree_oid(&t1->object.oid, &t2->object.oid, "", &revs->pruning);
 
 	if (!nth_parent)
 		if (bloom_ret == 1 && tree_difference == REV_TREE_SAME)
@@ -804,7 +802,6 @@ static int rev_compare_tree(struct rev_info *revs,
 
 static int rev_same_tree_as_empty(struct rev_info *revs, struct commit *commit)
 {
-	int retval;
 	struct tree *t1 = get_commit_tree(commit);
 
 	if (!t1)
@@ -812,9 +809,9 @@ static int rev_same_tree_as_empty(struct rev_info *revs, struct commit *commit)
 
 	tree_difference = REV_TREE_SAME;
 	revs->pruning.flags.has_changes = 0;
-	retval = diff_tree_oid(NULL, &t1->object.oid, "", &revs->pruning);
+	diff_tree_oid(NULL, &t1->object.oid, "", &revs->pruning);
 
-	return retval >= 0 && (tree_difference == REV_TREE_SAME);
+	return tree_difference == REV_TREE_SAME;
 }
 
 struct treesame_state {
diff --git a/tree-diff.c b/tree-diff.c
index f3d303c6e5..6ebad1a46f 100644
--- a/tree-diff.c
+++ b/tree-diff.c
@@ -29,9 +29,9 @@ static struct combine_diff_path *ll_diff_tree_paths(
 	struct combine_diff_path *p, const struct object_id *oid,
 	const struct object_id **parents_oid, int nparent,
 	struct strbuf *base, struct diff_options *opt);
-static int ll_diff_tree_oid(const struct object_id *old_oid,
-			    const struct object_id *new_oid,
-			    struct strbuf *base, struct diff_options *opt);
+static void ll_diff_tree_oid(const struct object_id *old_oid,
+			     const struct object_id *new_oid,
+			     struct strbuf *base, struct diff_options *opt);
 
 /*
  * Compare two tree entries, taking into account only path/S_ISDIR(mode),
@@ -679,9 +679,9 @@ static void try_to_follow_renames(const struct object_id *old_oid,
 	q->nr = 1;
 }
 
-static int ll_diff_tree_oid(const struct object_id *old_oid,
-			    const struct object_id *new_oid,
-			    struct strbuf *base, struct diff_options *opt)
+static void ll_diff_tree_oid(const struct object_id *old_oid,
+			     const struct object_id *new_oid,
+			     struct strbuf *base, struct diff_options *opt)
 {
 	struct combine_diff_path phead, *p;
 	pathchange_fn_t pathchange_old = opt->pathchange;
@@ -697,29 +697,27 @@ static int ll_diff_tree_oid(const struct object_id *old_oid,
 	}
 
 	opt->pathchange = pathchange_old;
-	return 0;
 }
 
-int diff_tree_oid(const struct object_id *old_oid,
-		  const struct object_id *new_oid,
-		  const char *base_str, struct diff_options *opt)
+void diff_tree_oid(const struct object_id *old_oid,
+		   const struct object_id *new_oid,
+		   const char *base_str, struct diff_options *opt)
 {
 	struct strbuf base;
-	int retval;
 
 	strbuf_init(&base, PATH_MAX);
 	strbuf_addstr(&base, base_str);
 
-	retval = ll_diff_tree_oid(old_oid, new_oid, &base, opt);
+	ll_diff_tree_oid(old_oid, new_oid, &base, opt);
 	if (!*base_str && opt->flags.follow_renames && diff_might_be_rename())
 		try_to_follow_renames(old_oid, new_oid, &base, opt);
 
 	strbuf_release(&base);
-
-	return retval;
 }
 
-int diff_root_tree_oid(const struct object_id *new_oid, const char *base, struct diff_options *opt)
+void diff_root_tree_oid(const struct object_id *new_oid,
+			const char *base,
+			struct diff_options *opt)
 {
-	return diff_tree_oid(NULL, new_oid, base, opt);
+	diff_tree_oid(NULL, new_oid, base, opt);
 }

From fa7965309e709e6dcd59c38fc6743d17d82d538b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:28 +0000
Subject: [PATCH 06/10] commit-graph: clean up #includes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Our CodingGuidelines says that it's sufficient to include one of
'git-compat-util.h' and 'cache.h', but both 'commit-graph.c' and
'commit-graph.h' include both.  Let's include only 'git-compat-util.h'
to loose a bunch of unnecessary dependencies; but include 'hash.h',
because 'commit-graph.h' does require the definition of 'struct
object_id'.

'commit-graph.h' explicitly includes 'repository.h' and
'string-list.h', but only needs the declaration of a few structs from
them.  Drop these includes and forward-declare the necessary structs
instead.

'commit-graph.c' includes 'dir.h', but doesn't actually use anything
from there, so let's drop that #include as well.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 4 +---
 commit-graph.h | 6 +++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 7807d94562..6ed649388d 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1,7 +1,5 @@
-#include "cache.h"
-#include "config.h"
-#include "dir.h"
 #include "git-compat-util.h"
+#include "config.h"
 #include "lockfile.h"
 #include "pack.h"
 #include "packfile.h"
diff --git a/commit-graph.h b/commit-graph.h
index 39484482cc..881c9b46e5 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -2,9 +2,6 @@
 #define COMMIT_GRAPH_H
 
 #include "git-compat-util.h"
-#include "repository.h"
-#include "string-list.h"
-#include "cache.h"
 #include "object-store.h"
 
 #define GIT_TEST_COMMIT_GRAPH "GIT_TEST_COMMIT_GRAPH"
@@ -22,6 +19,9 @@ void git_test_write_commit_graph_or_die(void);
 
 struct commit;
 struct bloom_filter_settings;
+struct repository;
+struct raw_object_store;
+struct string_list;
 
 char *get_commit_graph_filename(struct object_directory *odb);
 int open_commit_graph(const char *graph_file, int *fd, struct stat *st);

From 2ad4f1a7c43cac4ba52cff56f9090d3322c67224 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:29 +0000
Subject: [PATCH 07/10] commit-graph: simplify parse_commit_graph() #1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While we iterate over all entries of the Chunk Lookup table we make
sure that we don't attempt to read past the end of the mmap-ed
commit-graph file, and check in each iteration that the chunk ID and
offset we are about to read is still within the mmap-ed memory region.
However, these checks in each iteration are not really necessary,
because the number of chunks in the commit-graph file is already known
before this loop from the just parsed commit-graph header.

So let's check that the commit-graph file is large enough for all
entries in the Chunk Lookup table before we start iterating over those
entries, and drop those per-iteration checks.  While at it, take into
account the size of everything that is necessary to have a valid
commit-graph file, i.e. the size of the header, the size of the
mandatory OID Fanout chunk, and the size of the signature in the
trailer as well.

Note that this necessitates the change of the error message as well,
and, consequently, have to update the 'detect incorrect chunk count'
test in 't5318-commit-graph.sh' as well.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c          | 16 +++++++++-------
 t/t5318-commit-graph.sh |  3 ++-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 6ed649388d..9927762f18 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -272,6 +272,15 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 	graph->data = graph_map;
 	graph->data_len = graph_size;
 
+	if (graph_size < GRAPH_HEADER_SIZE +
+			 (graph->num_chunks + 1) * GRAPH_CHUNKLOOKUP_WIDTH +
+			 GRAPH_FANOUT_SIZE + the_hash_algo->rawsz) {
+		error(_("commit-graph file is too small to hold %u chunks"),
+		      graph->num_chunks);
+		free(graph);
+		return NULL;
+	}
+
 	last_chunk_id = 0;
 	last_chunk_offset = 8;
 	chunk_lookup = data + 8;
@@ -280,13 +289,6 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 		uint64_t chunk_offset;
 		int chunk_repeated = 0;
 
-		if (data + graph_size - chunk_lookup <
-		    GRAPH_CHUNKLOOKUP_WIDTH) {
-			error(_("commit-graph chunk lookup table entry missing; file may be incomplete"));
-			free(graph);
-			return NULL;
-		}
-
 		chunk_id = get_be32(chunk_lookup + 0);
 		chunk_offset = get_be64(chunk_lookup + 4);
 
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 79e7fbcd40..1073f9e3cf 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -574,7 +574,8 @@ test_expect_success 'detect invalid checksum hash' '
 
 test_expect_success 'detect incorrect chunk count' '
 	corrupt_graph_and_verify $GRAPH_BYTE_CHUNK_COUNT "\377" \
-		"chunk lookup table entry missing" $GRAPH_CHUNK_LOOKUP_OFFSET
+		"commit-graph file is too small to hold [0-9]* chunks" \
+		$GRAPH_CHUNK_LOOKUP_OFFSET
 '
 
 test_expect_success 'git fsck (checks commit-graph)' '

From 5cfa438a76f7f67f34f6f758b62943261ded1fb5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:30 +0000
Subject: [PATCH 08/10] commit-graph: simplify parse_commit_graph() #2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Chunk Lookup table stores the chunks' starting offset in the
commit-graph file, not their sizes.  Consequently, the size of a chunk
can only be calculated by subtracting its offset from the offset of
the subsequent chunk (or that of the terminating label).  This is
currenly implemented in a bit complicated way: as we iterate over the
entries of the Chunk Lookup table, we check the id of each chunk and
store its starting offset, then we check the id of the last seen chunk
and calculate its size using its previously saved offset.  At the
moment there is only one chunk for which we calculate its size, but
this patch series will add more, and the repeated chunk id checks are
not that pretty.

Instead let's read ahead the offset of the next chunk on each
iteration, so we can calculate the size of each chunk right away,
right where we store its starting offset.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 9927762f18..84206f0f51 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -230,8 +230,7 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 	const unsigned char *data, *chunk_lookup;
 	uint32_t i;
 	struct commit_graph *graph;
-	uint64_t last_chunk_offset;
-	uint32_t last_chunk_id;
+	uint64_t next_chunk_offset;
 	uint32_t graph_signature;
 	unsigned char graph_version, hash_version;
 
@@ -281,18 +280,17 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 		return NULL;
 	}
 
-	last_chunk_id = 0;
-	last_chunk_offset = 8;
 	chunk_lookup = data + 8;
-	for (i = 0; i <= graph->num_chunks; i++) {
+	next_chunk_offset = get_be64(chunk_lookup + 4);
+	for (i = 0; i < graph->num_chunks; i++) {
 		uint32_t chunk_id;
-		uint64_t chunk_offset;
+		uint64_t chunk_offset = next_chunk_offset;
 		int chunk_repeated = 0;
 
 		chunk_id = get_be32(chunk_lookup + 0);
-		chunk_offset = get_be64(chunk_lookup + 4);
 
 		chunk_lookup += GRAPH_CHUNKLOOKUP_WIDTH;
+		next_chunk_offset = get_be64(chunk_lookup + 4);
 
 		if (chunk_offset > graph_size - the_hash_algo->rawsz) {
 			error(_("commit-graph improper chunk offset %08x%08x"), (uint32_t)(chunk_offset >> 32),
@@ -312,8 +310,11 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 		case GRAPH_CHUNKID_OIDLOOKUP:
 			if (graph->chunk_oid_lookup)
 				chunk_repeated = 1;
-			else
+			else {
 				graph->chunk_oid_lookup = data + chunk_offset;
+				graph->num_commits = (next_chunk_offset - chunk_offset)
+						     / graph->hash_len;
+			}
 			break;
 
 		case GRAPH_CHUNKID_DATA:
@@ -368,15 +369,6 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 			free(graph);
 			return NULL;
 		}
-
-		if (last_chunk_id == GRAPH_CHUNKID_OIDLOOKUP)
-		{
-			graph->num_commits = (chunk_offset - last_chunk_offset)
-					     / graph->hash_len;
-		}
-
-		last_chunk_id = chunk_id;
-		last_chunk_offset = chunk_offset;
 	}
 
 	if (graph->chunk_bloom_indexes && graph->chunk_bloom_data) {

From bb4d60e5d536009e0c97bfa7f464416c29c04c81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:31 +0000
Subject: [PATCH 09/10] commit-graph: simplify write_commit_graph_file() #1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In write_commit_graph_file() one block of code fills the array of
chunk IDs, another block of code fills the array of chunk offsets,
then the chunk IDs and offsets are written to the Chunk Lookup table,
and finally a third block of code writes the actual chunks.  In case
of optional chunks like Extra Edge List and Base Graphs List there is
also a condition checking whether that chunk is necessary/desired, and
that same condition is repeated in all those three blocks of code.
This patch series is about to add more optional chunks, so there would
be even more repeated conditions.

Those chunk offsets are relative to the beginning of the file, so they
inherently depend on the size of the Chunk Lookup table, which in turn
depends on the number of chunks that are to be written to the
commit-graph file.  IOW at the time we set the first chunk's ID we
can't yet know its offset, because we don't yet know how many chunks
there are.

Simplify this by initially filling an array of chunk sizes, not
offsets, and calculate the offsets based on the chunk sizes only
later, while we are writing the Chunk Lookup table.  This way we can
fill the arrays of chunk IDs and sizes in one go, eliminating one set
of repeated conditions.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 46 +++++++++++++++++-----------------------------
 1 file changed, 17 insertions(+), 29 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 84206f0f51..79cddabcd1 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1529,10 +1529,11 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	struct hashfile *f;
 	struct lock_file lk = LOCK_INIT;
 	uint32_t chunk_ids[MAX_NUM_CHUNKS + 1];
-	uint64_t chunk_offsets[MAX_NUM_CHUNKS + 1];
+	uint64_t chunk_sizes[MAX_NUM_CHUNKS + 1];
 	const unsigned hashsz = the_hash_algo->rawsz;
 	struct strbuf progress_title = STRBUF_INIT;
 	int num_chunks = 3;
+	uint64_t chunk_offset;
 	struct object_id file_hash;
 	const struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
 
@@ -1573,50 +1574,34 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	}
 
 	chunk_ids[0] = GRAPH_CHUNKID_OIDFANOUT;
+	chunk_sizes[0] = GRAPH_FANOUT_SIZE;
 	chunk_ids[1] = GRAPH_CHUNKID_OIDLOOKUP;
+	chunk_sizes[1] = hashsz * ctx->commits.nr;
 	chunk_ids[2] = GRAPH_CHUNKID_DATA;
+	chunk_sizes[2] = (hashsz + 16) * ctx->commits.nr;
+
 	if (ctx->num_extra_edges) {
 		chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES;
+		chunk_sizes[num_chunks] = 4 * ctx->num_extra_edges;
 		num_chunks++;
 	}
 	if (ctx->changed_paths) {
 		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES;
+		chunk_sizes[num_chunks] = sizeof(uint32_t) * ctx->commits.nr;
 		num_chunks++;
 		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA;
+		chunk_sizes[num_chunks] = sizeof(uint32_t) * 3
+					  + ctx->total_bloom_filter_data_size;
 		num_chunks++;
 	}
 	if (ctx->num_commit_graphs_after > 1) {
 		chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE;
+		chunk_sizes[num_chunks] = hashsz * (ctx->num_commit_graphs_after - 1);
 		num_chunks++;
 	}
 
 	chunk_ids[num_chunks] = 0;
-
-	chunk_offsets[0] = 8 + (num_chunks + 1) * GRAPH_CHUNKLOOKUP_WIDTH;
-	chunk_offsets[1] = chunk_offsets[0] + GRAPH_FANOUT_SIZE;
-	chunk_offsets[2] = chunk_offsets[1] + hashsz * ctx->commits.nr;
-	chunk_offsets[3] = chunk_offsets[2] + (hashsz + 16) * ctx->commits.nr;
-
-	num_chunks = 3;
-	if (ctx->num_extra_edges) {
-		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
-						4 * ctx->num_extra_edges;
-		num_chunks++;
-	}
-	if (ctx->changed_paths) {
-		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
-						sizeof(uint32_t) * ctx->commits.nr;
-		num_chunks++;
-
-		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
-						sizeof(uint32_t) * 3 + ctx->total_bloom_filter_data_size;
-		num_chunks++;
-	}
-	if (ctx->num_commit_graphs_after > 1) {
-		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
-						hashsz * (ctx->num_commit_graphs_after - 1);
-		num_chunks++;
-	}
+	chunk_sizes[num_chunks] = 0;
 
 	hashwrite_be32(f, GRAPH_SIGNATURE);
 
@@ -1625,13 +1610,16 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	hashwrite_u8(f, num_chunks);
 	hashwrite_u8(f, ctx->num_commit_graphs_after - 1);
 
+	chunk_offset = 8 + (num_chunks + 1) * GRAPH_CHUNKLOOKUP_WIDTH;
 	for (i = 0; i <= num_chunks; i++) {
 		uint32_t chunk_write[3];
 
 		chunk_write[0] = htonl(chunk_ids[i]);
-		chunk_write[1] = htonl(chunk_offsets[i] >> 32);
-		chunk_write[2] = htonl(chunk_offsets[i] & 0xffffffff);
+		chunk_write[1] = htonl(chunk_offset >> 32);
+		chunk_write[2] = htonl(chunk_offset & 0xffffffff);
 		hashwrite(f, chunk_write, 12);
+
+		chunk_offset += chunk_sizes[i];
 	}
 
 	if (ctx->report_progress) {

From 7fbfe07ab4d4e58c0971dac73001b89f180a0af3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:32 +0000
Subject: [PATCH 10/10] commit-graph: simplify write_commit_graph_file() #2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Unify the 'chunk_ids' and 'chunk_sizes' arrays into an array of
'struct chunk_info'.  This will allow more cleanups in the following
patches.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 79cddabcd1..887837e882 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1522,14 +1522,18 @@ static int write_graph_chunk_base(struct hashfile *f,
 	return 0;
 }
 
+struct chunk_info {
+	uint32_t id;
+	uint64_t size;
+};
+
 static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 {
 	uint32_t i;
 	int fd;
 	struct hashfile *f;
 	struct lock_file lk = LOCK_INIT;
-	uint32_t chunk_ids[MAX_NUM_CHUNKS + 1];
-	uint64_t chunk_sizes[MAX_NUM_CHUNKS + 1];
+	struct chunk_info chunks[MAX_NUM_CHUNKS + 1];
 	const unsigned hashsz = the_hash_algo->rawsz;
 	struct strbuf progress_title = STRBUF_INIT;
 	int num_chunks = 3;
@@ -1573,35 +1577,34 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 		f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf);
 	}
 
-	chunk_ids[0] = GRAPH_CHUNKID_OIDFANOUT;
-	chunk_sizes[0] = GRAPH_FANOUT_SIZE;
-	chunk_ids[1] = GRAPH_CHUNKID_OIDLOOKUP;
-	chunk_sizes[1] = hashsz * ctx->commits.nr;
-	chunk_ids[2] = GRAPH_CHUNKID_DATA;
-	chunk_sizes[2] = (hashsz + 16) * ctx->commits.nr;
-
+	chunks[0].id = GRAPH_CHUNKID_OIDFANOUT;
+	chunks[0].size = GRAPH_FANOUT_SIZE;
+	chunks[1].id = GRAPH_CHUNKID_OIDLOOKUP;
+	chunks[1].size = hashsz * ctx->commits.nr;
+	chunks[2].id = GRAPH_CHUNKID_DATA;
+	chunks[2].size = (hashsz + 16) * ctx->commits.nr;
 	if (ctx->num_extra_edges) {
-		chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES;
-		chunk_sizes[num_chunks] = 4 * ctx->num_extra_edges;
+		chunks[num_chunks].id = GRAPH_CHUNKID_EXTRAEDGES;
+		chunks[num_chunks].size = 4 * ctx->num_extra_edges;
 		num_chunks++;
 	}
 	if (ctx->changed_paths) {
-		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES;
-		chunk_sizes[num_chunks] = sizeof(uint32_t) * ctx->commits.nr;
+		chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMINDEXES;
+		chunks[num_chunks].size = sizeof(uint32_t) * ctx->commits.nr;
 		num_chunks++;
-		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA;
-		chunk_sizes[num_chunks] = sizeof(uint32_t) * 3
+		chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMDATA;
+		chunks[num_chunks].size = sizeof(uint32_t) * 3
 					  + ctx->total_bloom_filter_data_size;
 		num_chunks++;
 	}
 	if (ctx->num_commit_graphs_after > 1) {
-		chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE;
-		chunk_sizes[num_chunks] = hashsz * (ctx->num_commit_graphs_after - 1);
+		chunks[num_chunks].id = GRAPH_CHUNKID_BASE;
+		chunks[num_chunks].size = hashsz * (ctx->num_commit_graphs_after - 1);
 		num_chunks++;
 	}
 
-	chunk_ids[num_chunks] = 0;
-	chunk_sizes[num_chunks] = 0;
+	chunks[num_chunks].id = 0;
+	chunks[num_chunks].size = 0;
 
 	hashwrite_be32(f, GRAPH_SIGNATURE);
 
@@ -1614,12 +1617,12 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	for (i = 0; i <= num_chunks; i++) {
 		uint32_t chunk_write[3];
 
-		chunk_write[0] = htonl(chunk_ids[i]);
+		chunk_write[0] = htonl(chunks[i].id);
 		chunk_write[1] = htonl(chunk_offset >> 32);
 		chunk_write[2] = htonl(chunk_offset & 0xffffffff);
 		hashwrite(f, chunk_write, 12);
 
-		chunk_offset += chunk_sizes[i];
+		chunk_offset += chunks[i].size;
 	}
 
 	if (ctx->report_progress) {