From a1bf5ca29f7ebb41e24318147518fc7c738303e5 Mon Sep 17 00:00:00 2001
From: Han Xin <chiyutianyi@gmail.com>
Date: Sat, 11 Jun 2022 10:44:16 +0800
Subject: [PATCH 1/6] unpack-objects: low memory footprint for get_data() in
 dry_run mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As the name implies, "get_data(size)" will allocate and return a given
amount of memory. Allocating memory for a large blob object may cause
the system to run out of memory. In preparation for replacing the
calls to "get_data()" that unpack large blob objects in later commits,
refactor "get_data()" to reduce its memory footprint in dry_run mode.

In dry_run mode "get_data()" is only used to check the integrity of
the data, and the returned buffer is not used at all, so we can
allocate a smaller buffer and reuse it as the zstream output. Make the
function return NULL in dry-run mode, as no callers use the returned
buffer.

The "find [...]objects/?? -type f | wc -l" test idiom being used here
is adapted from the same "find" use added to another test in
d9545c7f465 (fast-import: implement unpack limit, 2016-04-25).

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/unpack-objects.c        | 37 ++++++++++++++++++++-----------
 t/t5351-unpack-large-objects.sh | 41 +++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 11 deletions(-)
 create mode 100755 t/t5351-unpack-large-objects.sh

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 56d05e2725..32e8b47059 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -97,15 +97,27 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress zstream from the standard input into a newly
+ * allocated buffer of the specified size and return the buffer.
+ * The caller is responsible for freeing the returned buffer.
+ *
+ * But for dry_run mode, "get_data()" is only used to check the
+ * integrity of the data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer which is reused to hold temporary zstream output
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -125,8 +137,17 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize > size - stream.total_out ?
+						size - stream.total_out :
+						bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
 
@@ -326,10 +347,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -359,10 +378,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -398,10 +415,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
new file mode 100755
index 0000000000..8d84313221
--- /dev/null
+++ b/t/t5351-unpack-large-objects.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+#
+# Copyright (c) 2022 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	PACK=$(echo HEAD | git pack-objects --revs pack)
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
+	test_stdout_line_count = 0 find dest.git/objects -type f &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_done

From: Han Xin <chiyutianyi@gmail.com>
Date: Sat, 11 Jun 2022 10:44:17 +0800
Subject: [PATCH 2/6] object-file.c: refactor write_loose_object() to several
 steps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When writing a large blob using "write_loose_object()", we have to
pass a buffer with the whole content of the blob, and this behavior
will consume lots of memory and may cause OOM. We will introduce a
streaming version of this function ("stream_loose_object()") in a
later commit to resolve this issue.

Before introducing that streaming function, do some refactoring on
"write_loose_object()" to reuse code for both versions.

Rewrite "write_loose_object()" as follows:

1. Figure out a path for the (temp) object file. This step is only
   used in "write_loose_object()".

2. Move common steps for starting to write loose objects into a new
   function "start_loose_object_common()".

3. Compress data.

4. Move common steps for ending the zlib stream into a new function
   "end_loose_object_common()".

5. Close fd and finalize the object file.
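Sketched in code, the shape after this refactoring is roughly the
following (an illustration only: declarations, the deflate loop of
step 3, and most error handling are elided here; the real change is in
the diff below):

	static int write_loose_object(const struct object_id *oid, char *hdr,
				      int hdrlen, const void *buf,
				      unsigned long len, time_t mtime,
				      unsigned flags)
	{
		/* 1. Figure out a path for the (temp) object file. */
		loose_object_path(the_repository, &filename, oid);

		/* 2. Create tmpfile, set up the zlib stream, feed the header. */
		fd = start_loose_object_common(&tmp_file, filename.buf, flags,
					       &stream, compressed,
					       sizeof(compressed), &c,
					       hdr, hdrlen);
		if (fd < 0)
			return -1;

		/* 3. Compress the data (loop elided). */

		/* 4. End the zlib stream, getting the checksummed oid. */
		ret = end_loose_object_common(&c, &stream, &parano_oid);

		/* 5. Close fd and finalize the object file. */
		...
	}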
Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 object-file.c | 98 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 74 insertions(+), 24 deletions(-)

diff --git a/object-file.c b/object-file.c
index 79eb8339b6..b5bce03274 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1951,6 +1951,74 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+/**
+ * Common steps for loose object writers to start writing loose
+ * objects:
+ *
+ * - Create tmpfile for the loose object.
+ * - Setup zlib stream for compression.
+ * - Start to feed header to zlib stream.
+ *
+ * Returns a "fd", which should later be provided to
+ * end_loose_object_common().
+ */
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     char *hdr, int hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename);
+	if (fd < 0) {
+		if (flags & HASH_SILENT)
+			return -1;
+		else if (errno == EACCES)
+			return error(_("insufficient permission for adding "
+				       "an object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(
+				_("unable to create temporary file"));
+	}
+
+	/* Setup zlib stream for compression */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = buf;
+	stream->avail_out = buflen;
+	the_hash_algo->init_fn(c);
+
+	/* Start to feed header to zlib stream */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+
+	return fd;
+}
+
+/**
+ * Common steps for loose object writers to end writing loose objects:
+ *
+ * - End the compression of zlib stream.
+ * - Get the calculated oid to "oid".
+ */
+static int end_loose_object_common(git_hash_ctx *c, git_zstream *stream,
+				   struct object_id *oid)
+{
+	int ret;
+
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		return ret;
+	the_hash_algo->final_oid_fn(oid, c);
+
+	return Z_OK;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1968,28 +2036,11 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0)
+		return -1;
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -2007,11 +2058,10 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
 		    ret);
-	ret = git_deflate_end_gently(&stream);
+	ret = end_loose_object_common(&c, &stream, &parano_oid);
 	if (ret != Z_OK)
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));

From 21e7d8814063d8c5ada74464e30bf0c5188f1f80 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?=
 <avarab@gmail.com>
Date: Sat, 11 Jun 2022 10:44:18 +0800
Subject: [PATCH 3/6] object-file.c: factor out deflate part of
 write_loose_object()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Split out the part of write_loose_object() that deals with calling
git_deflate() into a utility function; a subsequent commit will
introduce another function that will make use of it.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 object-file.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)
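For orientation, the two callers end up driving this helper
differently; a condensed sketch of both loops (based on this diff and
on the stream_loose_object() added in a later patch of this series;
details elided):

	/*
	 * write_loose_object(): the whole buffer is in memory, so we
	 * can always ask for Z_FINISH (flush == 1):
	 */
	do {
		unsigned char *in0 = stream.next_in;

		ret = write_loose_object_common(&c, &stream, 1, in0, fd,
						compressed, sizeof(compressed));
	} while (ret == Z_OK);

	/*
	 * stream_loose_object() (next commit): flush only once the
	 * input stream is exhausted, and tolerate Z_BUF_ERROR while
	 * waiting for more input:
	 */
	do {
		unsigned char *in0 = stream.next_in;

		if (!stream.avail_in && !in_stream->is_finished) {
			/* refill next_in/avail_in; set flush = 1 at EOF */
		}
		ret = write_loose_object_common(&c, &stream, flush, in0, fd,
						compressed, sizeof(compressed));
	} while (ret == Z_OK || ret == Z_BUF_ERROR);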
diff --git a/object-file.c b/object-file.c
index b5bce03274..18dbf2a4e4 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2000,6 +2000,28 @@ static int start_loose_object_common(struct strbuf *tmp_file,
 	return fd;
 }
 
+/**
+ * Common steps for the inner git_deflate() loop for writing loose
+ * objects. Returns what git_deflate() returns.
+ */
+static int write_loose_object_common(git_hash_ctx *c,
+				     git_zstream *stream, const int flush,
+				     unsigned char *in0, const int fd,
+				     unsigned char *compressed,
+				     const size_t compressed_len)
+{
+	int ret;
+
+	ret = git_deflate(stream, flush ? Z_FINISH : 0);
+	the_hash_algo->update_fn(c, in0, stream->next_in - in0);
+	if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
+		die(_("unable to write loose object file"));
+	stream->next_out = compressed;
+	stream->avail_out = compressed_len;
+
+	return ret;
+}
+
 /**
  * Common steps for loose object writers to end writing loose objects:
  *
@@ -2047,12 +2069,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
-		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
-		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
-			die(_("unable to write loose object file"));
-		stream.next_out = compressed;
-		stream.avail_out = sizeof(compressed);
+
+		ret = write_loose_object_common(&c, &stream, 1, in0, fd,
+						compressed, sizeof(compressed));
 	} while (ret == Z_OK);
 
 	if (ret != Z_STREAM_END)

From 2b6070ac4c408cce86daf21cbce60096f8b78216 Mon Sep 17 00:00:00 2001
From: Han Xin <chiyutianyi@gmail.com>
Date: Sat, 11 Jun 2022 10:44:19 +0800
Subject: [PATCH 4/6] object-file.c: add "stream_loose_object()" to handle
 large object
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If we want to unpack and write a loose object using
"write_loose_object()", we have to feed it a buffer of the same size
as the object, which will consume lots of memory and may cause OOM.
This can be improved by feeding data to "stream_loose_object()" in a
stream.

Add a new function "stream_loose_object()", which is a streaming
version of "write_loose_object()" but with a low memory footprint. We
will use this function to unpack large blob objects in a later commit.

Another difference from "write_loose_object()" is that we have no
chance to run "write_object_file_prepare()" to calculate the oid in
advance. In "write_loose_object()" we know the oid and we can write
the temporary file in the same directory as the final object, but for
an object with an undetermined oid we don't know the exact directory
for the object. Still, we need to save the temporary file we're
preparing somewhere. We'll do that in the top-level ".git/objects/"
directory (or whatever "GIT_OBJECT_DIRECTORY" is set to). Once we've
streamed it we'll know the OID, and will move it to its canonical
path.

"freshen_packed_object()" or "freshen_loose_object()" will be called
inside "stream_loose_object()" after obtaining the "oid": after the
temporary file is written we want to mark the object as recent, and at
that point we may find that the object already exists. In that case we
remove the temporary file and do not leave a new copy of the object.
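To illustrate the contract of the new interface (a hypothetical
caller, not part of this patch): an "input_stream" can be backed by
anything that can hand out chunks; "read" returns the next chunk and
its length, and sets "is_finished" once no more data will follow. A
minimal in-memory implementation might look like this:

	struct memory_stream_data {
		const unsigned char *buf;
		size_t len;
		size_t pos;
	};

	static const void *memory_stream_read(struct input_stream *in_stream,
					      unsigned long *readlen)
	{
		struct memory_stream_data *data = in_stream->data;
		size_t chunk = data->len - data->pos;

		/* hand out at most 4kB at a time */
		if (chunk > 4096)
			chunk = 4096;
		*readlen = chunk;
		data->pos += chunk;
		if (data->pos == data->len)
			in_stream->is_finished = 1;
		return data->buf + (data->pos - chunk);
	}

With that, streaming a "len"-byte buffer "buf" out as a loose blob is:

	struct memory_stream_data data = { .buf = buf, .len = len };
	struct input_stream in_stream = {
		.read = memory_stream_read,
		.data = &data,
	};
	struct object_id oid;

	if (stream_loose_object(&in_stream, len, &oid))
		die("failed to write object");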
Helped-by: René Scharfe <l.s.r@web.de>
Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 object-file.c  | 104 +++++++++++++++++++++++++++++++++++++++++++++++++
 object-store.h |   8 ++++
 2 files changed, 112 insertions(+)

diff --git a/object-file.c b/object-file.c
index 18dbf2a4e4..2ca2576ab1 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2119,6 +2119,110 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen;
+
+	if (batch_fsync_enabled(FSYNC_COMPONENT_LOOSE_OBJECT))
+		prepare_loose_object_bulk_checkin();
+
+	/* Since oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
+
+	/*
+	 * Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose objects:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+
+		if (!stream.avail_in && !in_stream->is_finished) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (in_stream->is_finished)
+				flush = 1;
+		}
+		ret = write_loose_object_common(&c, &stream, flush, in0, fd,
+						compressed, sizeof(compressed));
+		/*
+		 * Unlike write_loose_object(), we do not have the entire
+		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
+		 * then we'll replenish them in the next input_stream->read()
+		 * call when we loop.
+		 */
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (stream.total_in != len + hdrlen)
+		die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
+		    (uintmax_t)len + hdrlen);
+
+	/*
+	 * Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose object:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid.
+	 */
+	if (ret != Z_STREAM_END)
+		die(_("unable to stream deflate new object (%d)"), ret);
+	ret = end_loose_object_common(&c, &stream, oid);
+	if (ret != Z_OK)
+		die(_("deflateEnd on stream object failed (%d)"), ret);
+	close_loose_object(fd, tmp_file.buf);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			err = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    enum object_type type, struct object_id *oid,
 			    unsigned flags)
diff --git a/object-store.h b/object-store.h
index 539ea43904..5222ee5460 100644
--- a/object-store.h
+++ b/object-store.h
@@ -46,6 +46,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	int is_finished;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -269,6 +275,8 @@ static inline int write_object_file(const void *buf, unsigned long len,
 int write_object_file_literally(const void *buf, unsigned long len,
 				const char *type, struct object_id *oid,
 				unsigned flags);
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid);
 
 /*
  * Add an object file to the in-memory object store, without writing it

From 3c3ca0b0c193e4f2cdf281b5e20ddb75c555f8e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?=
 <avarab@gmail.com>
Date: Sat, 11 Jun 2022 10:44:20 +0800
Subject: [PATCH 5/6] core doc: modernize core.bigFileThreshold documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The core.bigFileThreshold documentation has been largely unchanged
since 5eef828bc03 (fast-import: Stream very large blobs directly to
pack, 2010-02-01).

But since then this setting has been expanded to affect a lot more
than that description indicated. Most notably in how "git diff" treats
big files, see 6bf3b813486 (diff --stat: mark any file larger than
core.bigfilethreshold binary, 2014-08-16). In addition to that,
numerous commands and APIs make use of a streaming mode for files
above this threshold.

So let's attempt to summarize 12 years of changes in behavior, which
can be seen with:

	git log --oneline -Gbig_file_thre 5eef828bc03.. -- '*.c'

To do that, turn this into a bullet-point list. The summary Han Xin
produced in [1] helped a lot, but is a bit too detailed for
documentation aimed at users. Let's instead summarize how
user-observable behavior differs, and generally describe how we tend
to stream these files in various commands.

1. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/
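The pattern behind most of the bullet points in the diff below is the
same (an illustrative sketch only; "stream_contents()" and
"work_in_core()" stand in for whatever a given command actually does
with the object):

	if (size > big_file_threshold)
		/* bounded memory, some fixed streaming overhead */
		ret = stream_contents(...);
	else
		/* read the whole object into one in-core buffer */
		ret = work_in_core(read_object_file(...));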
Helped-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 Documentation/config/core.txt | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index 41e330f306..87e4c04836 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -444,17 +444,32 @@ You probably do not need to adjust this value.
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
 core.bigFileThreshold::
-	Files larger than this size are stored deflated, without
-	attempting delta compression. Storing large files without
-	delta compression avoids excessive memory usage, at the
-	slight expense of increased disk usage. Additionally files
-	larger than this size are always treated as binary.
+	The size of files considered "big", which as discussed below
+	changes the behavior of numerous git commands, as well as how
+	such files are stored within the repository. The default is
+	512 MiB. Common unit suffixes of 'k', 'm', or 'g' are
+	supported.
 +
-Default is 512 MiB on all platforms. This should be reasonable
-for most projects as source code and other text files can still
-be delta compressed, but larger binary media files won't be.
+Files above the configured limit will be:
 +
-Common unit suffixes of 'k', 'm', or 'g' are supported.
+* Stored deflated in packfiles, without attempting delta compression.
++
+The default limit is primarily set with this use-case in mind. With it,
+most projects will have their source code and other text files delta
+compressed, but not larger binary media files.
++
+Storing large files without delta compression avoids excessive memory
+usage, at the slight expense of increased disk usage.
++
+* Will be treated as if they were labeled "binary" (see
+  linkgit:gitattributes[5]). e.g. linkgit:git-log[1] and
+  linkgit:git-diff[1] will not compute diffs for files above this limit.
++
+* Will generally be streamed when written, which avoids excessive
+memory usage, at the cost of some fixed overhead. Commands that make
+use of this include linkgit:git-archive[1],
+linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
+linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to

From aaf81223f48f710a3b9a64cc84fac93deed806b6 Mon Sep 17 00:00:00 2001
From: Han Xin <chiyutianyi@gmail.com>
Date: Sat, 11 Jun 2022 10:44:21 +0800
Subject: [PATCH 6/6] unpack-objects: use stream_loose_object() to unpack
 large objects
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make use of the stream_loose_object() function introduced in the
preceding commit to unpack large objects. Before this we'd need to
malloc() the size of the blob before unpacking it, which could cause
OOM with very large blobs.

We could use the new streaming interface to unpack all blobs, but
doing so would be much slower, as demonstrated e.g.
with this benchmark using git-hyperfine[0]:

	rm -rf /tmp/scalar.git &&
	git clone --bare https://github.com/Microsoft/scalar.git /tmp/scalar.git &&
	mv /tmp/scalar.git/objects/pack/*.pack /tmp/scalar.git/my.pack &&
	git hyperfine \
		-r 2 --warmup 1 \
		-L rev origin/master,HEAD -L v "10,512,1k,1m" \
		-s 'make' \
		-p 'git init --bare dest.git' \
		-c 'rm -rf dest.git' \
		'./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/scalar.git/my.pack 2>&1 | grep Maximum'

Using this test we'll always use >100MB of memory on origin/master
(around ~105MB), but max out at e.g. ~55MB if we set
core.bigFileThreshold=50m.

The relevant "Maximum resident set size" lines were manually added
below the relevant benchmark:

	'/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/scalar.git/my.pack 2>&1 | grep Maximum' in 'origin/master' ran
		Maximum resident set size (kbytes): 107080
	    1.02 ± 0.78 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack 2>&1 | grep Maximum' in 'origin/master'
		Maximum resident set size (kbytes): 106968
	    1.09 ± 0.79 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/scalar.git/my.pack 2>&1 | grep Maximum' in 'origin/master'
		Maximum resident set size (kbytes): 107032
	    1.42 ± 1.07 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/scalar.git/my.pack 2>&1 | grep Maximum' in 'HEAD'
		Maximum resident set size (kbytes): 107072
	    1.83 ± 1.02 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/scalar.git/my.pack 2>&1 | grep Maximum' in 'HEAD'
		Maximum resident set size (kbytes): 55704
	    2.16 ± 1.19 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack 2>&1 | grep Maximum' in 'HEAD'
		Maximum resident set size (kbytes): 4564

This shows that, if you have enough memory, the new streaming method
is slower the lower you set the streaming threshold; the benefit is
more bounded memory use.

An earlier version of this patch introduced a new
"core.bigFileStreamingThreshold" instead of re-using the existing
"core.bigFileThreshold" variable[1]. As noted in a detailed overview
of its users in [2], the existing variable already has several
different meanings. Still, we consider it good enough to simply re-use
it. While it's possible that someone might want to e.g. consider
objects "small" for the purposes of diffing but "big" for the purposes
of writing them, such use-cases are probably too obscure to worry
about. We can always split up "core.bigFileThreshold" in the future if
there's a need for that.

0. https://github.com/avar/git-hyperfine/
1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/
2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <derrickstolee@github.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <chiyutianyi@gmail.com>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 Documentation/config/core.txt   |  4 +-
 builtin/unpack-objects.c        | 69 ++++++++++++++++++++++++++++++++-
 t/t5351-unpack-large-objects.sh | 43 ++++++++++++++++++--
 3 files changed, 109 insertions(+), 7 deletions(-)
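As a reading aid for the builtin/unpack-objects.c change below:
unpack-objects keeps one zstream for the incoming *pack* data, and
feed_input_zstream() inflates pack bytes into a small fixed buffer,
handing 8kB chunks to stream_loose_object(), which deflates them back
out into a loose object. Peak memory therefore stays around the two
fixed buffers rather than the whole blob. Condensed from the diff
(error handling elided):

	static const void *feed_input_zstream(struct input_stream *in_stream,
					      unsigned long *readlen)
	{
		/* inflate the next pack bytes into data->buf ... */
		data->status = git_inflate(zstream, 0);
		in_stream->is_finished = data->status != Z_OK;
		*readlen = sizeof(data->buf) - zstream->avail_out;
		return data->buf;
	}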
diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index 87e4c04836..3ea3124f7f 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -468,8 +468,8 @@ usage, at the slight expense of increased disk usage.
 * Will generally be streamed when written, which avoids excessive
 memory usage, at the cost of some fixed overhead. Commands that make
 use of this include linkgit:git-archive[1],
-linkgit:git-fast-import[1], linkgit:git-index-pack[1] and
-linkgit:git-fsck[1].
+linkgit:git-fast-import[1], linkgit:git-index-pack[1],
+linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 32e8b47059..43789b8ef2 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -351,6 +351,68 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 		write_object(nr, type, buf, size);
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void stream_blob(unsigned long size, unsigned nr)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+	struct obj_info *info = &obj_list[nr];
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &info->oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob = lookup_blob(the_repository, &info->oid);
+
+		if (!blob)
+			die(_("invalid blob object from stream"));
+		blob->object.flags |= FLAG_WRITTEN;
+	}
+	info->obj = NULL;
+}
+
 static int resolve_against_held(unsigned nr, const struct object_id *base,
 				void *delta_data, unsigned long delta_size)
 {
@@ -483,9 +545,14 @@ static void unpack_one(unsigned nr)
 	}
 
 	switch (type) {
+	case OBJ_BLOB:
+		if (!dry_run && size > big_file_threshold) {
+			stream_blob(size, nr);
+			return;
+		}
+		/* fallthrough */
 	case OBJ_COMMIT:
 	case OBJ_TREE:
-	case OBJ_BLOB:
 	case OBJ_TAG:
 		unpack_non_delta_entry(type, size, nr);
 		return;
diff --git a/t/t5351-unpack-large-objects.sh b/t/t5351-unpack-large-objects.sh
index 8d84313221..8ce8aa3b14 100755
--- a/t/t5351-unpack-large-objects.sh
+++ b/t/t5351-unpack-large-objects.sh
@@ -9,7 +9,8 @@ test_description='git unpack-objects with large objects'
 
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
-	git init --bare dest.git
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold "$1"
 }
 
 test_expect_success "create large objects (1.5 MB) and PACK" '
@@ -17,7 +18,10 @@ test_expect_success "create large objects (1.5 MB) and PACK" '
 	test_commit --append foo big-blob &&
 	test-tool genrandom bar 1500000 >big-blob &&
 	test_commit --append bar big-blob &&
-	PACK=$(echo HEAD | git pack-objects --revs pack)
+	PACK=$(echo HEAD | git pack-objects --revs pack) &&
+	git verify-pack -v pack-$PACK.pack >out &&
+	sed -n -e "s/^\([0-9a-f][0-9a-f]*\).*\(commit\|tree\|blob\).*/\1/p" \
+		<out >obj-list
 '
 
 test_expect_success 'set memory limitation to 1MB' '
@@ -26,16 +30,47 @@ test_expect_success 'set memory limitation to 1MB' '
 '
 
 test_expect_success 'unpack-objects failed under memory limitation' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err
 '
 
test_expect_success 'set memory limitation to 1MB' ' ' test_expect_success 'unpack-objects failed under memory limitation' ' - prepare_dest && + prepare_dest 2m && test_must_fail git -C dest.git unpack-objects err && grep "fatal: attempting to allocate" err ' test_expect_success 'unpack-objects works with memory limitation in dry-run mode' ' - prepare_dest && + prepare_dest 2m && git -C dest.git unpack-objects -n current && + cmp obj-list current +' + +test_expect_success 'do not unpack existing large objects' ' + prepare_dest 1m && + git -C dest.git index-pack --stdin