Merge branch 'hx/unpack-streaming'

Allow large objects read from a packstream to be streamed straight
into a loose object file, without having to keep them in-core as a
whole.

* hx/unpack-streaming:
  unpack-objects: use stream_loose_object() to unpack large objects
  core doc: modernize core.bigFileThreshold documentation
  object-file.c: add "stream_loose_object()" to handle large object
  object-file.c: factor out deflate part of write_loose_object()
  object-file.c: refactor write_loose_object() to several steps
  unpack-objects: low memory footprint for get_data() in dry_run mode
Junio C Hamano 2022-07-14 15:03:59 -07:00
commit 73b9ef6ab1
5 changed files with 405 additions and 51 deletions

Documentation/config/core.txt

@@ -444,17 +444,32 @@ You probably do not need to adjust this value.
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
 core.bigFileThreshold::
-	Files larger than this size are stored deflated, without
-	attempting delta compression.  Storing large files without
-	delta compression avoids excessive memory usage, at the
-	slight expense of increased disk usage. Additionally files
-	larger than this size are always treated as binary.
+	The size of files considered "big", which as discussed below
+	changes the behavior of numerous git commands, as well as how
+	such files are stored within the repository. The default is
+	512 MiB. Common unit suffixes of 'k', 'm', or 'g' are
+	supported.
 +
-Default is 512 MiB on all platforms.  This should be reasonable
-for most projects as source code and other text files can still
-be delta compressed, but larger binary media files won't be.
+Files above the configured limit will be:
 +
-Common unit suffixes of 'k', 'm', or 'g' are supported.
+* Stored deflated in packfiles, without attempting delta compression.
++
+The default limit is primarily set with this use-case in mind. With it,
+most projects will have their source code and other text files delta
+compressed, but not larger binary media files.
++
+Storing large files without delta compression avoids excessive memory
+usage, at the slight expense of increased disk usage.
++
+* Will be treated as if they were labeled "binary" (see
+  linkgit:gitattributes[5]). e.g. linkgit:git-log[1] and
+  linkgit:git-diff[1] will not compute diffs for files above this limit.
++
+* Will generally be streamed when written, which avoids excessive
+  memory usage, at the cost of some fixed overhead. Commands that make
+  use of this include linkgit:git-archive[1],
+  linkgit:git-fast-import[1], linkgit:git-index-pack[1],
+  linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
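
A worked example of the knob: lowering the limit with "git config
core.bigFileThreshold 100m" makes every file above 100 MiB skip delta
compression, count as binary for diff purposes, and be streamed when
written by the commands listed above.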

builtin/unpack-objects.c

@@ -97,15 +97,27 @@ static void use(int bytes)
 		display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress zstream from the standard input into a newly
+ * allocated buffer of specified size and return the buffer.
+ * The caller is responsible to free the returned buffer.
+ *
+ * But for dry_run mode, "get_data()" is only used to check the
+ * integrity of data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer, which is reused to hold temporary zstream output,
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -125,8 +137,17 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize > size - stream.total_out ?
+					   size - stream.total_out :
+					   bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
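
The dry_run trick above caps peak allocation at 8 KiB no matter how
large the object is: next_out is re-pointed at the same small buffer on
every iteration, while total_out still tracks the logical size so the
end-of-stream check keeps working. A stand-alone sketch of the same
idea against raw zlib (the helper name inflate_size_only is
hypothetical; git's real code goes through its git_zstream wrapper):

    #include <string.h>
    #include <zlib.h>

    /*
     * Inflate `in` while holding at most 8 KiB of output at a time,
     * and report whether it decompresses to exactly `size` bytes.
     */
    static int inflate_size_only(const unsigned char *in, size_t inlen,
                                 unsigned long size)
    {
        unsigned char scratch[8192];
        z_stream zs;
        int status;

        memset(&zs, 0, sizeof(zs));
        if (inflateInit(&zs) != Z_OK)
            return -1;
        zs.next_in = (unsigned char *)in;
        zs.avail_in = inlen;
        do {
            /* reuse the scratch buffer; its contents are discarded */
            zs.next_out = scratch;
            zs.avail_out = sizeof(scratch);
            status = inflate(&zs, Z_NO_FLUSH);
        } while (status == Z_OK);
        inflateEnd(&zs);
        return (status == Z_STREAM_END && zs.total_out == size) ? 0 : -1;
    }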
@@ -326,10 +347,70 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void stream_blob(unsigned long size, unsigned nr)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+	struct obj_info *info = &obj_list[nr];
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &info->oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob = lookup_blob(the_repository, &info->oid);
+
+		if (!blob)
+			die(_("invalid blob object from stream"));
+		blob->object.flags |= FLAG_WRITTEN;
+	}
+	info->obj = NULL;
+}
+
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -359,10 +440,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -398,10 +477,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		die("offset value out of bound for delta base object");
 
 	delta_data = get_data(delta_size);
-	if (dry_run || !delta_data) {
-		free(delta_data);
+	if (!delta_data)
 		return;
-	}
 	lo = 0;
 	hi = nr;
 	while (lo < hi) {
@@ -468,9 +545,14 @@ static void unpack_one(unsigned nr)
 	}
 
 	switch (type) {
+	case OBJ_BLOB:
+		if (!dry_run && size > big_file_threshold) {
+			stream_blob(size, nr);
+			return;
+		}
+		/* fallthrough */
 	case OBJ_COMMIT:
 	case OBJ_TREE:
-	case OBJ_BLOB:
 	case OBJ_TAG:
 		unpack_non_delta_entry(type, size, nr);
 		return;
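
feed_input_zstream() above is one concrete producer for the new
input_stream interface; the contract is only "hand back a chunk and its
length, and set is_finished at end of input". A minimal sketch of
another producer, serving a single in-memory buffer (buffer_read,
struct buffer_data and write_buffer_as_loose_blob are illustrative
names, not part of this series):

    struct buffer_data {
        const unsigned char *buf;
        unsigned long len;
    };

    static const void *buffer_read(struct input_stream *in_stream,
                                   unsigned long *readlen)
    {
        struct buffer_data *data = in_stream->data;

        if (in_stream->is_finished) {
            *readlen = 0;
            return NULL;
        }
        /* Serve everything in one chunk, then signal end of input. */
        in_stream->is_finished = 1;
        *readlen = data->len;
        return data->buf;
    }

    static void write_buffer_as_loose_blob(const unsigned char *buf,
                                           unsigned long len)
    {
        struct buffer_data data = { .buf = buf, .len = len };
        struct input_stream in_stream = {
            .read = buffer_read,
            .data = &data,
        };
        struct object_id oid;

        if (stream_loose_object(&in_stream, len, &oid))
            die(_("failed to write object in stream"));
    }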

object-file.c

@@ -1951,6 +1951,96 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+/**
+ * Common steps for loose object writers to start writing loose
+ * objects:
+ *
+ * - Create tmpfile for the loose object.
+ * - Setup zlib stream for compression.
+ * - Start to feed header to zlib stream.
+ *
+ * Returns a "fd", which should later be provided to
+ * end_loose_object_common().
+ */
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     char *hdr, int hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename);
+	if (fd < 0) {
+		if (flags & HASH_SILENT)
+			return -1;
+		else if (errno == EACCES)
+			return error(_("insufficient permission for adding "
+				       "an object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(
+				_("unable to create temporary file"));
+	}
+
+	/* Setup zlib stream for compression */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = buf;
+	stream->avail_out = buflen;
+	the_hash_algo->init_fn(c);
+
+	/* Start to feed header to zlib stream */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+
+	return fd;
+}
+
+/**
+ * Common steps for the inner git_deflate() loop for writing loose
+ * objects. Returns what git_deflate() returns.
+ */
+static int write_loose_object_common(git_hash_ctx *c,
+				     git_zstream *stream, const int flush,
+				     unsigned char *in0, const int fd,
+				     unsigned char *compressed,
+				     const size_t compressed_len)
+{
+	int ret;
+
+	ret = git_deflate(stream, flush ? Z_FINISH : 0);
+	the_hash_algo->update_fn(c, in0, stream->next_in - in0);
+	if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
+		die(_("unable to write loose object file"));
+	stream->next_out = compressed;
+	stream->avail_out = compressed_len;
+
+	return ret;
+}
+
+/**
+ * Common steps for loose object writers to end writing loose objects:
+ *
+ * - End the compression of zlib stream.
+ * - Get the calculated oid to "oid".
+ */
+static int end_loose_object_common(git_hash_ctx *c, git_zstream *stream,
+				   struct object_id *oid)
+{
+	int ret;
+
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		return ret;
+	the_hash_algo->final_oid_fn(oid, c);
+
+	return Z_OK;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1968,50 +2058,29 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0)
+		return -1;
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
 	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
-		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
-		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
-			die(_("unable to write loose object file"));
-		stream.next_out = compressed;
-		stream.avail_out = sizeof(compressed);
+
+		ret = write_loose_object_common(&c, &stream, 1, in0, fd,
+						compressed, sizeof(compressed));
 	} while (ret == Z_OK);
 
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
 		    ret);
-	ret = git_deflate_end_gently(&stream);
+	ret = end_loose_object_common(&c, &stream, &parano_oid);
 	if (ret != Z_OK)
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
 
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
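
One design note worth spelling out: the refactoring keeps
write_loose_object()'s paranoia intact. The common helpers re-hash the
bytes as they are deflated, and the caller still compares the resulting
parano_oid against the oid it was handed, so a mistake in how the
deflate loop was split out would surface as "confused by unstable
object source data" rather than as a silently corrupt object.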
@@ -2050,6 +2119,110 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen;
+
+	if (batch_fsync_enabled(FSYNC_COMPONENT_LOOSE_OBJECT))
+		prepare_loose_object_bulk_checkin();
+
+	/* Since oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
+
+	/*
+	 * Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose objects:
+	 *
+	 * - Create tmpfile for the loose object.
+	 * - Setup zlib stream for compression.
+	 * - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+
+		if (!stream.avail_in && !in_stream->is_finished) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+
+			/* All data has been read. */
+			if (in_stream->is_finished)
+				flush = 1;
+		}
+		ret = write_loose_object_common(&c, &stream, flush, in0, fd,
+						compressed, sizeof(compressed));
+		/*
+		 * Unlike write_loose_object(), we do not have the entire
+		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
+		 * then we'll replenish them in the next input_stream->read()
+		 * call when we loop.
+		 */
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (stream.total_in != len + hdrlen)
+		die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
+		    (uintmax_t)len + hdrlen);
+
+	/*
+	 * Common steps for write_loose_object and stream_loose_object to
+	 * end writing a loose object:
+	 *
+	 * - End the compression of zlib stream.
+	 * - Get the calculated oid.
+	 */
+	if (ret != Z_STREAM_END)
+		die(_("unable to stream deflate new object (%d)"), ret);
+	ret = end_loose_object_common(&c, &stream, oid);
+	if (ret != Z_OK)
+		die(_("deflateEnd on stream object failed (%d)"), ret);
+	close_loose_object(fd, tmp_file.buf);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			err = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    enum object_type type, struct object_id *oid,
 			    unsigned flags)
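
The comment about Z_BUF_ERROR is the heart of the loop: unlike
write_loose_object(), which hands zlib the whole buffer up front, this
loop may call git_deflate() with its input temporarily exhausted, and
zlib then reports "no progress possible" rather than a hard error.
Treating Z_BUF_ERROR as "refill and continue" is what makes chunked
input work. A stand-alone sketch of the pattern in raw zlib terms
(deflate_chunked and next_chunk are hypothetical; git's version also
feeds the hash context and uses its own wrappers):

    #include <unistd.h>
    #include <zlib.h>

    /* next_chunk: point *p at the next chunk, return its size (0 at EOF). */
    static int deflate_chunked(z_stream *zs, int fd,
                               size_t (*next_chunk)(const unsigned char **p))
    {
        unsigned char out[4096];
        int flush = Z_NO_FLUSH, status;

        do {
            if (!zs->avail_in && flush == Z_NO_FLUSH) {
                const unsigned char *p;
                size_t n = next_chunk(&p);

                zs->next_in = (unsigned char *)p;
                zs->avail_in = n;
                if (!n)
                    flush = Z_FINISH; /* source drained */
            }
            zs->next_out = out;
            zs->avail_out = sizeof(out);
            status = deflate(zs, flush);
            if (write(fd, out, sizeof(out) - zs->avail_out) < 0)
                return -1;
            /* Z_BUF_ERROR just means "feed me more input": loop again. */
        } while (status == Z_OK || status == Z_BUF_ERROR);

        return status == Z_STREAM_END ? 0 : -1;
    }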

object-store.h

@@ -46,6 +46,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	int is_finished;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -269,6 +275,8 @@ static inline int write_object_file(const void *buf, unsigned long len,
 int write_object_file_literally(const void *buf, unsigned long len,
 				const char *type, struct object_id *oid,
 				unsigned flags);
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid);
 
 /*
  * Add an object file to the in-memory object store, without writing it

t/t5351-unpack-large-objects.sh (new file, executable, 76 lines)

#!/bin/sh
#
# Copyright (c) 2022 Han Xin
#

test_description='git unpack-objects with large objects'

. ./test-lib.sh

prepare_dest () {
	test_when_finished "rm -rf dest.git" &&
	git init --bare dest.git &&
	git -C dest.git config core.bigFileThreshold "$1"
}

test_expect_success "create large objects (1.5 MB) and PACK" '
	test-tool genrandom foo 1500000 >big-blob &&
	test_commit --append foo big-blob &&
	test-tool genrandom bar 1500000 >big-blob &&
	test_commit --append bar big-blob &&
	PACK=$(echo HEAD | git pack-objects --revs pack) &&
	git verify-pack -v pack-$PACK.pack >out &&
	sed -n -e "s/^\([0-9a-f][0-9a-f]*\).*\(commit\|tree\|blob\).*/\1/p" \
		<out >obj-list
'

test_expect_success 'set memory limitation to 1MB' '
	GIT_ALLOC_LIMIT=1m &&
	export GIT_ALLOC_LIMIT
'

test_expect_success 'unpack-objects failed under memory limitation' '
	prepare_dest 2m &&
	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
	grep "fatal: attempting to allocate" err
'

test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
	prepare_dest 2m &&
	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
	test_stdout_line_count = 0 find dest.git/objects -type f &&
	test_dir_is_empty dest.git/objects/pack
'

test_expect_success 'unpack big object in stream' '
	prepare_dest 1m &&
	git -C dest.git unpack-objects <pack-$PACK.pack &&
	test_dir_is_empty dest.git/objects/pack
'

BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch'

test_expect_success 'unpack big object in stream (core.fsyncmethod=batch)' '
	prepare_dest 1m &&
	GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
	git -C dest.git $BATCH_CONFIGURATION unpack-objects <pack-$PACK.pack &&
	grep fsync/hardware-flush trace2.txt &&
	test_dir_is_empty dest.git/objects/pack &&
	git -C dest.git cat-file --batch-check="%(objectname)" <obj-list >current &&
	cmp obj-list current
'

test_expect_success 'do not unpack existing large objects' '
	prepare_dest 1m &&
	git -C dest.git index-pack --stdin <pack-$PACK.pack &&
	git -C dest.git unpack-objects <pack-$PACK.pack &&
	# The destination came up with the exact same pack...
	DEST_PACK=$(echo dest.git/objects/pack/pack-*.pack) &&
	test_cmp pack-$PACK.pack $DEST_PACK &&
	# ...and wrote no loose objects
	test_stdout_line_count = 0 find dest.git/objects -type f ! -name "pack-*"
'

test_done
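
A closing note on how these tests bite: GIT_ALLOC_LIMIT caps single
allocations made through git's xmalloc() wrappers. The failing test
sets core.bigFileThreshold to 2m so the 1.5 MB blobs stay on the old
whole-buffer path, where the xmallocz() of the full blob trips the
1 MB limit; the streaming tests set it to 1m so the same blobs go
through stream_loose_object(), which only ever allocates the fixed
8 KiB input and 4 KiB output buffers seen in the diffs above.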