midx: implement midx_repack()

To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.

If the batch size is zero, select all packs in the multi-pack-index.

Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.

Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.

While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
Derrick Stolee 2019-06-10 16:35:27 -07:00 committed by Junio C Hamano
parent 2af890bb28
commit ce1e4a105b
2 changed files with 178 additions and 1 deletions

151
midx.c
View File

@ -9,6 +9,7 @@
#include "midx.h"
#include "progress.h"
#include "trace2.h"
#include "run-command.h"
#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
#define MIDX_VERSION 1
@ -1227,7 +1228,155 @@ int expire_midx_packs(struct repository *r, const char *object_dir)
return result;
}
int midx_repack(struct repository *r, const char *object_dir, size_t batch_size)
struct repack_info {
timestamp_t mtime;
uint32_t referenced_objects;
uint32_t pack_int_id;
};
static int compare_by_mtime(const void *a_, const void *b_)
{
const struct repack_info *a, *b;
a = (const struct repack_info *)a_;
b = (const struct repack_info *)b_;
if (a->mtime < b->mtime)
return -1;
if (a->mtime > b->mtime)
return 1;
return 0;
}
static int fill_included_packs_all(struct multi_pack_index *m,
unsigned char *include_pack)
{
uint32_t i;
for (i = 0; i < m->num_packs; i++)
include_pack[i] = 1;
return m->num_packs < 2;
}
static int fill_included_packs_batch(struct repository *r,
struct multi_pack_index *m,
unsigned char *include_pack,
size_t batch_size)
{
uint32_t i, packs_to_repack;
size_t total_size;
struct repack_info *pack_info = xcalloc(m->num_packs, sizeof(struct repack_info));
for (i = 0; i < m->num_packs; i++) {
pack_info[i].pack_int_id = i;
if (prepare_midx_pack(r, m, i))
continue;
pack_info[i].mtime = m->packs[i]->mtime;
}
for (i = 0; batch_size && i < m->num_objects; i++) {
uint32_t pack_int_id = nth_midxed_pack_int_id(m, i);
pack_info[pack_int_id].referenced_objects++;
}
QSORT(pack_info, m->num_packs, compare_by_mtime);
total_size = 0;
packs_to_repack = 0;
for (i = 0; total_size < batch_size && i < m->num_packs; i++) {
int pack_int_id = pack_info[i].pack_int_id;
struct packed_git *p = m->packs[pack_int_id];
size_t expected_size;
if (!p)
continue;
if (open_pack_index(p) || !p->num_objects)
continue;
expected_size = (size_t)(p->pack_size
* pack_info[i].referenced_objects);
expected_size /= p->num_objects;
if (expected_size >= batch_size)
continue;
packs_to_repack++;
total_size += expected_size;
include_pack[pack_int_id] = 1;
}
free(pack_info);
if (total_size < batch_size || packs_to_repack < 2)
return 1;
return 0;
}
int midx_repack(struct repository *r, const char *object_dir, size_t batch_size)
{
int result = 0;
uint32_t i;
unsigned char *include_pack;
struct child_process cmd = CHILD_PROCESS_INIT;
struct strbuf base_name = STRBUF_INIT;
struct multi_pack_index *m = load_multi_pack_index(object_dir, 1);
if (!m)
return 0;
include_pack = xcalloc(m->num_packs, sizeof(unsigned char));
if (batch_size) {
if (fill_included_packs_batch(r, m, include_pack, batch_size))
goto cleanup;
} else if (fill_included_packs_all(m, include_pack))
goto cleanup;
argv_array_push(&cmd.args, "pack-objects");
strbuf_addstr(&base_name, object_dir);
strbuf_addstr(&base_name, "/pack/pack");
argv_array_push(&cmd.args, base_name.buf);
strbuf_release(&base_name);
cmd.git_cmd = 1;
cmd.in = cmd.out = -1;
if (start_command(&cmd)) {
error(_("could not start pack-objects"));
result = 1;
goto cleanup;
}
for (i = 0; i < m->num_objects; i++) {
struct object_id oid;
uint32_t pack_int_id = nth_midxed_pack_int_id(m, i);
if (!include_pack[pack_int_id])
continue;
nth_midxed_object_oid(&oid, m, i);
xwrite(cmd.in, oid_to_hex(&oid), the_hash_algo->hexsz);
xwrite(cmd.in, "\n", 1);
}
close(cmd.in);
if (finish_command(&cmd)) {
error(_("could not finish pack-objects"));
result = 1;
goto cleanup;
}
result = write_midx_internal(object_dir, m, NULL);
m = NULL;
cleanup:
if (m)
close_midx(m);
free(include_pack);
return result;
}

View File

@ -450,4 +450,32 @@ test_expect_success 'repack with minimum size does not alter existing packs' '
)
'
test_expect_success 'repack creates a new pack' '
(
cd dup &&
ls .git/objects/pack/*idx >idx-list &&
test_line_count = 5 idx-list &&
THIRD_SMALLEST_SIZE=$(ls -l .git/objects/pack/*pack | awk "{print \$5;}" | sort -n | head -n 3 | tail -n 1) &&
BATCH_SIZE=$(($THIRD_SMALLEST_SIZE + 1)) &&
git multi-pack-index repack --batch-size=$BATCH_SIZE &&
ls .git/objects/pack/*idx >idx-list &&
test_line_count = 6 idx-list &&
test-tool read-midx .git/objects | grep idx >midx-list &&
test_line_count = 6 midx-list
)
'
test_expect_success 'expire removes repacked packs' '
(
cd dup &&
ls -al .git/objects/pack/*pack &&
ls -S .git/objects/pack/*pack | head -n 4 >expect &&
git multi-pack-index expire &&
ls -S .git/objects/pack/*pack >actual &&
test_cmp expect actual &&
test-tool read-midx .git/objects | grep idx >midx-list &&
test_line_count = 4 midx-list
)
'
test_done