diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h
index 290e5cf99..2685802b8 100644
--- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h
+++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h
@@ -216,7 +216,7 @@ ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
 }
 
 JEMALLOC_ALWAYS_INLINE int
-iget_defrag_hint(tsdn_t *tsdn, void* ptr, int *bin_util, int *run_util) {
+iget_defrag_hint(tsdn_t *tsdn, void* ptr) {
 	int defrag = 0;
 	rtree_ctx_t rtree_ctx_fallback;
 	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
@@ -232,11 +232,22 @@ iget_defrag_hint(tsdn_t *tsdn, void* ptr, int *bin_util, int *run_util) {
 		malloc_mutex_lock(tsdn, &bin->lock);
 		/* don't bother moving allocations from the slab currently used for new allocations */
 		if (slab != bin->slabcur) {
-			const bin_info_t *bin_info = &bin_infos[binind];
-			size_t availregs = bin_info->nregs * bin->stats.curslabs;
-			*bin_util = ((long long)bin->stats.curregs<<16) / availregs;
-			*run_util = ((long long)(bin_info->nregs - extent_nfree_get(slab))<<16) / bin_info->nregs;
-			defrag = 1;
+			int free_in_slab = extent_nfree_get(slab);
+			if (free_in_slab) {
+				const bin_info_t *bin_info = &bin_infos[binind];
+				int curslabs = bin->stats.curslabs;
+				size_t curregs = bin->stats.curregs;
+				if (bin->slabcur) {
+					/* remove slabcur from the overall utilization */
+					curregs -= bin_info->nregs - extent_nfree_get(bin->slabcur);
+					curslabs -= 1;
+				}
+				/* Compare the utilization ratio of the slab in question to the bin's average.
+				 * To avoid precision loss and division, we do that by extrapolating the usage
+				 * of the slab as if all slabs had the same usage. If this slab is less used
+				 * than the average, we prefer to move its data to hopefully more utilized slabs. */
+				defrag = (bin_info->nregs - free_in_slab) * curslabs <= curregs;
+			}
 		}
 		malloc_mutex_unlock(tsdn, &bin->lock);
 	}
diff --git a/deps/jemalloc/src/jemalloc.c b/deps/jemalloc/src/jemalloc.c
index 5b936cb48..585645a28 100644
--- a/deps/jemalloc/src/jemalloc.c
+++ b/deps/jemalloc/src/jemalloc.c
@@ -3326,12 +3326,10 @@ jemalloc_postfork_child(void) {
 /******************************************************************************/
 /* Helps the application decide if a pointer is worth re-allocating in order to reduce fragmentation.
- * returns 0 if the allocation is in the currently active run,
- * or when it is not causing any frag issue (large or huge bin)
- * returns the bin utilization and run utilization both in fixed point 16:16.
+ * returns 1 if the allocation should be moved, and 0 if it should be kept.
  * If the application decides to re-allocate it should use MALLOCX_TCACHE_NONE when doing so.
  */
 JEMALLOC_EXPORT int JEMALLOC_NOTHROW
-get_defrag_hint(void* ptr, int *bin_util, int *run_util) {
+get_defrag_hint(void* ptr) {
 	assert(ptr != NULL);
-	return iget_defrag_hint(TSDN_NULL, ptr, bin_util, run_util);
+	return iget_defrag_hint(TSDN_NULL, ptr);
 }
diff --git a/src/defrag.c b/src/defrag.c
index e729297a5..6e5296632 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -43,7 +43,7 @@
 /* this method was added to jemalloc in order to help us understand which
  * pointers are worthwhile moving and which aren't */
-int je_get_defrag_hint(void* ptr, int *bin_util, int *run_util);
+int je_get_defrag_hint(void* ptr);
 
 /* forward declarations*/
 void defragDictBucketCallback(void *privdata, dictEntry **bucketref);
@@ -55,18 +55,11 @@ dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sd
  * when it returns a non-null value, the old pointer was already released
  * and should NOT be accessed. */
 void* activeDefragAlloc(void *ptr) {
-    int bin_util, run_util;
     size_t size;
     void *newptr;
-    if(!je_get_defrag_hint(ptr, &bin_util, &run_util)) {
-        server.stat_active_defrag_misses++;
-        return NULL;
-    }
-    /* if this run is more utilized than the average utilization in this bin
-     * (or it is full), skip it. This will eventually move all the allocations
-     * from relatively empty runs into relatively full runs. */
-    if (run_util > bin_util || run_util == 1<<16) {
+    if(!je_get_defrag_hint(ptr)) {
         server.stat_active_defrag_misses++;
+        size = zmalloc_size(ptr);
         return NULL;
     }
     /* move this allocation to a new allocation.
diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl
index 777693fdf..390bad192 100644
--- a/tests/unit/memefficiency.tcl
+++ b/tests/unit/memefficiency.tcl
@@ -95,6 +95,10 @@ start_server {tags {"defrag"}} {
             }
             if {$::verbose} {
                 puts "frag $frag"
+                set misses [s active_defrag_misses]
+                set hits [s active_defrag_hits]
+                puts "hits: $hits"
+                puts "misses: $misses"
                 puts "max latency $max_latency"
                 puts [r latency latest]
                 puts [r latency history active-defrag-cycle]
@@ -221,6 +225,10 @@ start_server {tags {"defrag"}} {
             }
             if {$::verbose} {
                 puts "frag $frag"
+                set misses [s active_defrag_misses]
+                set hits [s active_defrag_hits]
+                puts "hits: $hits"
+                puts "misses: $misses"
                 puts "max latency $max_latency"
                 puts [r latency latest]
                 puts [r latency history active-defrag-cycle]
@@ -256,11 +264,12 @@ start_server {tags {"defrag"}} {
         set expected_frag 1.7
         # add a mass of list nodes to two lists (allocations are interlaced)
         set val [string repeat A 100] ;# 5 items of 100 bytes puts us in the 640 bytes bin, which has 32 regs, so high potential for fragmentation
-        for {set j 0} {$j < 500000} {incr j} {
+        set elements 500000
+        for {set j 0} {$j < $elements} {incr j} {
             $rd lpush biglist1 $val
             $rd lpush biglist2 $val
         }
-        for {set j 0} {$j < 500000} {incr j} {
+        for {set j 0} {$j < $elements} {incr j} {
             $rd read ; # Discard replies
             $rd read ; # Discard replies
         }
@@ -302,6 +311,8 @@ start_server {tags {"defrag"}} {
             # test the the fragmentation is lower
             after 120 ;# serverCron only updates the info once in 100ms
+            set misses [s active_defrag_misses]
+            set hits [s active_defrag_hits]
             set frag [s allocator_frag_ratio]
             set max_latency 0
             foreach event [r latency latest] {
@@ -312,6 +323,8 @@ start_server {tags {"defrag"}} {
             }
             if {$::verbose} {
                 puts "frag $frag"
+                puts "misses: $misses"
+                puts "hits: $hits"
                 puts "max latency $max_latency"
                 puts [r latency latest]
                 puts [r latency history active-defrag-cycle]
@@ -320,6 +333,10 @@ start_server {tags {"defrag"}} {
             # due to high fragmentation, 100hz, and active-defrag-cycle-max set to 75,
             # we expect max latency to be not much higher than 7.5ms, but due to rare slowness the threshold is set higher
             assert {$max_latency <= 30}
+
+            # in extreme cases of stagnation, we see over 20m misses before the test aborts with "defrag didn't stop",
+            # in normal cases we only see 100k misses out of 500k elements
+            assert {$misses < $elements}
         }
         # verify the data isn't corrupted or changed
         set newdigest [r debug digest]
@@ -327,6 +344,110 @@ start_server {tags {"defrag"}} {
         r save ;# saving an rdb iterates over all the data / pointers
         r del biglist1 ;# coverage for quicklistBookmarksClear
     } {1}
+
+    test "Active defrag edge case" {
+        # there was an edge case in defrag where all the slabs of a certain bin are exactly the same
+        # % utilization, with the exception of the current slab from which new allocations are made.
+        # if the current slab is lower in utilization, the defragger would have ended up in stagnation,
+        # kept running and never moved any allocation.
+        # this test is more consistent on a fresh server with no history
+        start_server {tags {"defrag"}} {
+            r flushdb
+            r config resetstat
+            r config set save "" ;# prevent bgsave from interfering with save below
+            r config set hz 100
+            r config set activedefrag no
+            r config set active-defrag-max-scan-fields 1000
+            r config set active-defrag-threshold-lower 5
+            r config set active-defrag-cycle-min 65
+            r config set active-defrag-cycle-max 75
+            r config set active-defrag-ignore-bytes 1mb
+            r config set maxmemory 0
+            set expected_frag 1.3
+
+            r debug mallctl-str thread.tcache.flush VOID
+            # fill the first slab, containing 32 regs of 640 bytes.
+            for {set j 0} {$j < 32} {incr j} {
+                r setrange "_$j" 600 x
+                r debug mallctl-str thread.tcache.flush VOID
+            }
+
+            # add a mass of keys with 600 byte values, filling the 640 bytes bin which has 32 regs per slab.
+            set rd [redis_deferring_client]
+            set keys 640000
+            for {set j 0} {$j < $keys} {incr j} {
+                $rd setrange $j 600 x
+            }
+            for {set j 0} {$j < $keys} {incr j} {
+                $rd read ; # Discard replies
+            }
+
+            # create some 50% fragmentation by deleting every other key
+            set sent 0
+            for {set j 0} {$j < $keys} {incr j 1} {
+                $rd del $j
+                incr sent
+                incr j 1
+            }
+            for {set j 0} {$j < $sent} {incr j} {
+                $rd read ; # Discard replies
+            }
+
+            # create higher fragmentation in the first slab
+            for {set j 10} {$j < 32} {incr j} {
+                r del "_$j"
+            }
+
+            # start defrag
+            after 120 ;# serverCron only updates the info once in 100ms
+            set frag [s allocator_frag_ratio]
+            if {$::verbose} {
+                puts "frag $frag"
+            }
+
+            assert {$frag >= $expected_frag}
+
+            set digest [r debug digest]
+            catch {r config set activedefrag yes} e
+            if {![string match {DISABLED*} $e]} {
+                # wait for the active defrag to start working (decision once a second)
+                wait_for_condition 50 100 {
+                    [s active_defrag_running] ne 0
+                } else {
+                    fail "defrag not started."
+                }
+
+                # wait for the active defrag to stop working
+                wait_for_condition 500 100 {
+                    [s active_defrag_running] eq 0
+                } else {
+                    after 120 ;# serverCron only updates the info once in 100ms
+                    puts [r info memory]
+                    puts [r info stats]
+                    puts [r memory malloc-stats]
+                    fail "defrag didn't stop."
+                }
+
+                # test that the fragmentation is lower
+                after 120 ;# serverCron only updates the info once in 100ms
+                set misses [s active_defrag_misses]
+                set hits [s active_defrag_hits]
+                set frag [s allocator_frag_ratio]
+                if {$::verbose} {
+                    puts "frag $frag"
+                    puts "hits: $hits"
+                    puts "misses: $misses"
+                }
+                assert {$frag < 1.1}
+                assert {$misses < 10000000} ;# when defrag doesn't stop we see some 30m misses; when it does, about 2m misses
+            }
+
+            # verify the data isn't corrupted or changed
+            set newdigest [r debug digest]
+            assert {$digest eq $newdigest}
+            r save ;# saving an rdb iterates over all the data / pointers
+        }
+    }
     }
 }
 } ;# run_solo
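
A note on the new hint logic in iget_defrag_hint() above: instead of reporting two 16.16 fixed-point ratios (run utilization and bin utilization) and letting the caller divide and compare them, the hint now answers the question directly by cross-multiplying, so no division is performed and no precision is lost. The standalone C sketch below reproduces just that comparison; the function and parameter names are illustrative only and are not part of jemalloc's internals.

/* Minimal sketch of the comparison performed by the patched iget_defrag_hint().
 * It decides whether the slab holding an allocation is used at or below the
 * bin's average utilization, i.e. whether
 *     used_in_slab / nregs  <=  curregs / (curslabs * nregs)
 * holds, by cross-multiplying instead of dividing. As in the patch, slabcur is
 * assumed to have already been subtracted from curslabs/curregs. */
#include <stdio.h>
#include <stddef.h>

static int should_move(size_t nregs,        /* regions per slab in this bin          */
                       size_t free_in_slab, /* free regions in the slab holding ptr  */
                       size_t curslabs,     /* bin's slab count, excluding slabcur   */
                       size_t curregs)      /* bin's used regions, excluding slabcur */
{
    if (free_in_slab == 0)
        return 0; /* a full slab gains nothing from moving data out of it */
    /* move iff this slab is at or below the bin's average usage */
    return (nregs - free_in_slab) * curslabs <= curregs;
}

int main(void) {
    /* 640-byte bin, 32 regions per slab, 100 other slabs holding 2000 used
     * regions (average ~20/32 used per slab). */
    printf("%d\n", should_move(32, 20, 100, 2000)); /* slab uses 12/32 -> 1 (move) */
    printf("%d\n", should_move(32,  2, 100, 2000)); /* slab uses 30/32 -> 0 (keep) */
    return 0;
}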
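The updated comment in jemalloc.c also restates the caller contract: when get_defrag_hint() says an allocation is worth moving, the caller should re-allocate with MALLOCX_TCACHE_NONE so the freed region does not come straight back from the thread cache. Redis does this in activeDefragAlloc() via zmalloc_no_tcache()/zfree_no_tcache(); the sketch below shows the same pattern written directly against jemalloc's public mallocx/sallocx/dallocx API. It assumes an unprefixed jemalloc build (with Redis' je_ prefix the exported symbol is je_get_defrag_hint, as declared in src/defrag.c), and the helper name is hypothetical.

/* Hypothetical caller of the new one-argument hint, mirroring the shape of
 * activeDefragAlloc() in src/defrag.c but using jemalloc's public API.
 * Returns a new pointer when the allocation was moved (the old pointer is
 * already freed), or NULL when it should be kept where it is. */
#include <string.h>
#include <jemalloc/jemalloc.h>

int get_defrag_hint(void *ptr); /* exported by the patched jemalloc above */

static void *defrag_realloc(void *ptr) {
    if (!get_defrag_hint(ptr))
        return NULL;                           /* not worth moving */
    size_t size = sallocx(ptr, 0);             /* usable size of the region */
    void *newptr = mallocx(size, MALLOCX_TCACHE_NONE);
    if (newptr == NULL)
        return NULL;
    memcpy(newptr, ptr, size);
    dallocx(ptr, MALLOCX_TCACHE_NONE);         /* free, bypassing the tcache */
    return newptr;
}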