From 88d71f479338c1a70fac15ea37f87315f9401f99 Mon Sep 17 00:00:00 2001
From: Oran Agra
Date: Wed, 20 May 2020 14:08:40 +0300
Subject: [PATCH] fix a rare active defrag edge case bug leading to stagnation

There's a rare case which leads to stagnation in the defragger, causing it to
keep scanning the keyspace and do nothing (not moving any allocation). This
happens when all the allocator slabs of a certain bin have the same %
utilization, but the slab from which new allocations are made has a lower
utilization.

This commit fixes it by removing the current slab from the overall average
utilization of the bin, eliminating any precision loss in the utilization
calculation, and moving the defrag decision to reside inside jemalloc.

It also adds a test that consistently reproduces this issue.
---
 .../internal/jemalloc_internal_inlines_c.h |  23 +++++++++++-------
 deps/jemalloc/src/jemalloc.c               |   8 +++----
 src/defrag.c                               |  12 ++--------
 tests/unit/memefficiency.tcl               | 125 +++++++++++++++++++-
 4 files changed, 145 insertions(+), 23 deletions(-)

diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h
index 290e5cf99..2685802b8 100644
--- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h
+++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h
@@ -216,7 +216,7 @@ ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
 }
 
 JEMALLOC_ALWAYS_INLINE int
-iget_defrag_hint(tsdn_t *tsdn, void* ptr, int *bin_util, int *run_util) {
+iget_defrag_hint(tsdn_t *tsdn, void* ptr) {
 	int defrag = 0;
 	rtree_ctx_t rtree_ctx_fallback;
 	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
@@ -232,11 +232,22 @@ iget_defrag_hint(tsdn_t *tsdn, void* ptr, int *bin_util, int *run_util) {
 		malloc_mutex_lock(tsdn, &bin->lock);
 		/* don't bother moving allocations from the slab currently used for new allocations */
 		if (slab != bin->slabcur) {
-			const bin_info_t *bin_info = &bin_infos[binind];
-			size_t availregs = bin_info->nregs * bin->stats.curslabs;
-			*bin_util = ((long long)bin->stats.curregs<<16) / availregs;
-			*run_util = ((long long)(bin_info->nregs - extent_nfree_get(slab))<<16) / bin_info->nregs;
-			defrag = 1;
+			int free_in_slab = extent_nfree_get(slab);
+			if (free_in_slab) {
+				const bin_info_t *bin_info = &bin_infos[binind];
+				unsigned long long curslabs = bin->stats.curslabs;
+				size_t curregs = bin->stats.curregs;
+				if (bin->slabcur) {
+					/* remove slabcur from the overall utilization */
+					curregs -= bin_info->nregs - extent_nfree_get(bin->slabcur);
+					curslabs -= 1;
+				}
+				/* Compare the utilization ratio of the slab in question to the total average.
+				 * To avoid precision loss and division, we do that by extrapolating the usage
+				 * of the slab as if all slabs had the same usage. If this slab is less used
+				 * than the average, we prefer to evict its data into hopefully more used slabs. */
+				defrag = (bin_info->nregs - free_in_slab) * curslabs <= curregs;
+			}
 		}
 		malloc_mutex_unlock(tsdn, &bin->lock);
 	}
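
The heart of this hunk is the division-free comparison on its last added line. A minimal
standalone sketch of the arithmetic (function and variable names here are illustrative,
not jemalloc's): the candidate slab's utilization (nregs - free_in_slab) / nregs is
compared against the bin's average curregs / (nregs * curslabs); multiplying both sides
by nregs * curslabs removes the division, and with it the 16:16 fixed-point rounding the
old code needed.

    #include <assert.h>

    /* Illustrative only: decide whether a slab is at or below the bin's
     * average utilization, without division or fixed-point rounding.
     * nregs        - regions per slab in this bin
     * free_in_slab - free regions in the candidate slab
     * curslabs     - slabs in the bin, excluding the current allocation slab
     * curregs      - used regions in those slabs */
    static int should_defrag(unsigned long long nregs,
                             unsigned long long free_in_slab,
                             unsigned long long curslabs,
                             unsigned long long curregs) {
        /* (nregs - free_in_slab) / nregs <= curregs / (nregs * curslabs),
         * multiplied through by nregs * curslabs, becomes: */
        return (nregs - free_in_slab) * curslabs <= curregs;
    }

    int main(void) {
        /* 4 slabs of 32 regs at 16/32 used each (slabcur excluded):
         * a 16/32 slab is exactly at the average, so it is moved */
        assert(should_defrag(32, 16, 4, 64) == 1);
        /* a 17/32 slab is above the average, so it is kept */
        assert(should_defrag(32, 15, 4, 64) == 0);
        return 0;
    }

Note the <= rather than <: a slab exactly at the average must still be moved, otherwise
the stagnation described in the commit message persists when all slabs are equal.
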
diff --git a/deps/jemalloc/src/jemalloc.c b/deps/jemalloc/src/jemalloc.c
index 5b936cb48..585645a28 100644
--- a/deps/jemalloc/src/jemalloc.c
+++ b/deps/jemalloc/src/jemalloc.c
@@ -3326,12 +3326,10 @@ jemalloc_postfork_child(void) {
 /******************************************************************************/
 /* Helps the application decide if a pointer is worth re-allocating in order to reduce fragmentation.
- * returns 0 if the allocation is in the currently active run,
- * or when it is not causing any frag issue (large or huge bin)
- * returns the bin utilization and run utilization both in fixed point 16:16.
+ * returns 1 if the allocation should be moved, and 0 if it should be kept.
  * If the application decides to re-allocate it should use MALLOCX_TCACHE_NONE when doing so. */
 JEMALLOC_EXPORT int JEMALLOC_NOTHROW
-get_defrag_hint(void* ptr, int *bin_util, int *run_util) {
+get_defrag_hint(void* ptr) {
 	assert(ptr != NULL);
-	return iget_defrag_hint(TSDN_NULL, ptr, bin_util, run_util);
+	return iget_defrag_hint(TSDN_NULL, ptr);
 }
 
diff --git a/src/defrag.c b/src/defrag.c
index e729297a5..6e5296632 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -43,7 +43,7 @@
 
 /* this method was added to jemalloc in order to help us understand which
  * pointers are worthwhile moving and which aren't */
-int je_get_defrag_hint(void* ptr, int *bin_util, int *run_util);
+int je_get_defrag_hint(void* ptr);
 
 /* forward declarations*/
 void defragDictBucketCallback(void *privdata, dictEntry **bucketref);
@@ -55,18 +55,10 @@ dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sd
  * when it returns a non-null value, the old pointer was already released
  * and should NOT be accessed. */
 void* activeDefragAlloc(void *ptr) {
-    int bin_util, run_util;
     size_t size;
     void *newptr;
-    if(!je_get_defrag_hint(ptr, &bin_util, &run_util)) {
-        server.stat_active_defrag_misses++;
-        return NULL;
-    }
-    /* if this run is more utilized than the average utilization in this bin
-     * (or it is full), skip it. This will eventually move all the allocations
-     * from relatively empty runs into relatively full runs. */
-    if (run_util > bin_util || run_util == 1<<16) {
+    if(!je_get_defrag_hint(ptr)) {
         server.stat_active_defrag_misses++;
         return NULL;
     }
     /* move this allocation to a new allocation.
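
The defrag.c hunk ends on the trailing-context comment that introduces the actual move.
For context, the remainder of activeDefragAlloc (unchanged by this patch) follows roughly
the shape below. This is a sketch that assumes Redis's internal headers (server.h,
zmalloc.h, string.h); zmalloc_no_tcache() and zfree_no_tcache() are Redis's wrappers
that honor the MALLOCX_TCACHE_NONE advice from the jemalloc comment above.

    #include <string.h>

    /* Sketch of the hint's consumer; mirrors the tail of activeDefragAlloc()
     * in src/defrag.c, compiled only inside the Redis tree. */
    void *activeDefragAllocSketch(void *ptr) {
        size_t size;
        void *newptr;
        if (!je_get_defrag_hint(ptr)) {
            server.stat_active_defrag_misses++;  /* hint says: leave it alone */
            return NULL;
        }
        /* move this allocation to a new one, bypassing the thread cache */
        size = zmalloc_size(ptr);            /* allocator's real usable size */
        newptr = zmalloc_no_tcache(size);    /* mallocx(.., MALLOCX_TCACHE_NONE) */
        memcpy(newptr, ptr, size);
        zfree_no_tcache(ptr);                /* old region goes back to its slab */
        return newptr;
    }

Bypassing the tcache matters on both sides: the new allocation should come from the
allocator's preferred (fuller) slabs rather than a recycled cached region, and the old
region must be released immediately so its sparse slab can eventually be reclaimed.
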
diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl
index 777693fdf..390bad192 100644
--- a/tests/unit/memefficiency.tcl
+++ b/tests/unit/memefficiency.tcl
@@ -95,6 +95,10 @@ start_server {tags {"defrag"}} {
                 }
                 if {$::verbose} {
                     puts "frag $frag"
+                    set misses [s active_defrag_misses]
+                    set hits [s active_defrag_hits]
+                    puts "hits: $hits"
+                    puts "misses: $misses"
                     puts "max latency $max_latency"
                     puts [r latency latest]
                     puts [r latency history active-defrag-cycle]
@@ -221,6 +225,10 @@ start_server {tags {"defrag"}} {
                 }
                 if {$::verbose} {
                     puts "frag $frag"
+                    set misses [s active_defrag_misses]
+                    set hits [s active_defrag_hits]
+                    puts "hits: $hits"
+                    puts "misses: $misses"
                     puts "max latency $max_latency"
                     puts [r latency latest]
                     puts [r latency history active-defrag-cycle]
@@ -256,11 +264,12 @@ start_server {tags {"defrag"}} {
             set expected_frag 1.7
             # add a mass of list nodes to two lists (allocations are interlaced)
             set val [string repeat A 100] ;# 5 items of 100 bytes puts us in the 640 bytes bin, which has 32 regs, so high potential for fragmentation
-            for {set j 0} {$j < 500000} {incr j} {
+            set elements 500000
+            for {set j 0} {$j < $elements} {incr j} {
                 $rd lpush biglist1 $val
                 $rd lpush biglist2 $val
             }
-            for {set j 0} {$j < 500000} {incr j} {
+            for {set j 0} {$j < $elements} {incr j} {
                 $rd read ; # Discard replies
                 $rd read ; # Discard replies
             }
@@ -302,6 +311,8 @@ start_server {tags {"defrag"}} {
 
                 # test the the fragmentation is lower
                 after 120 ;# serverCron only updates the info once in 100ms
+                set misses [s active_defrag_misses]
+                set hits [s active_defrag_hits]
                 set frag [s allocator_frag_ratio]
                 set max_latency 0
                 foreach event [r latency latest] {
@@ -312,6 +323,8 @@ start_server {tags {"defrag"}} {
                 }
                 if {$::verbose} {
                     puts "frag $frag"
+                    puts "misses: $misses"
+                    puts "hits: $hits"
                     puts "max latency $max_latency"
                     puts [r latency latest]
                     puts [r latency history active-defrag-cycle]
@@ -320,6 +333,10 @@ start_server {tags {"defrag"}} {
                 # due to high fragmentation, 100hz, and active-defrag-cycle-max set to 75,
                 # we expect max latency to be not much higher than 7.5ms but due to rare slowness threshold is set higher
                 assert {$max_latency <= 30}
+
+                # in extreme cases of stagnation, we see over 20m misses before the test aborts with "defrag didn't stop",
+                # in normal cases we only see 100k misses out of 500k elements
+                assert {$misses < $elements}
             }
             # verify the data isn't corrupted or changed
             set newdigest [r debug digest]
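
Before the new test itself, the failure mode is easy to see with concrete numbers. A
standalone sketch (illustrative values, not the patch's code) of a bin whose non-current
slabs all sit at 16/32 utilization while slabcur sits at 10/32: the old fixed-point rule
skipped every slab, because slabcur dragged the bin average below 16/32; the new rule
excludes slabcur and compares with <=, so slabs at the average are moved.

    #include <stdio.h>

    int main(void) {
        /* illustrative bin state: 32 regs per slab, 4 slabs at 16/32 used,
         * plus slabcur at 10/32 used */
        long long nregs = 32, nslabs = 5;                  /* incl. slabcur */
        long long used_per_slab = 16, used_slabcur = 10;
        long long curregs_all = 4 * used_per_slab + used_slabcur;    /* 74 */

        /* old decision: 16:16 fixed point, slabcur included in the average */
        long long bin_util = (curregs_all << 16) / (nregs * nslabs); /* 30310 */
        long long run_util = (used_per_slab << 16) / nregs;          /* 32768 */
        int old_skip = run_util > bin_util || run_util == 1 << 16;
        printf("old rule skips every 16/32 slab: %d\n", old_skip);   /* 1 */

        /* new decision: slabcur excluded, no division, <= comparison */
        long long curslabs = nslabs - 1;                             /* 4  */
        long long curregs = curregs_all - used_slabcur;              /* 64 */
        int new_move = used_per_slab * curslabs <= curregs;
        printf("new rule moves a 16/32 slab: %d\n", new_move);       /* 1 */
        return 0;
    }

Under the old rule nothing ever moves, yet fragmentation stays above the threshold, so
the defrag cycle keeps scanning forever; that is the stagnation the next test reproduces.
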
@@ -327,6 +344,110 @@ start_server {tags {"defrag"}} {
             r save ;# saving an rdb iterates over all the data / pointers
             r del biglist1 ;# coverage for quicklistBookmarksClear
         } {1}
+
+        test "Active defrag edge case" {
+            # there was an edge case in defrag where all the slabs of a certain bin are exactly the same
+            # % utilization, with the exception of the current slab from which new allocations are made.
+            # if the current slab is lower in utilization the defragger would have ended up in stagnation,
+            # kept running and not moving any allocation.
+            # this test is more consistent on a fresh server with no history
+            start_server {tags {"defrag"}} {
+                r flushdb
+                r config resetstat
+                r config set save "" ;# prevent bgsave from interfering with save below
+                r config set hz 100
+                r config set activedefrag no
+                r config set active-defrag-max-scan-fields 1000
+                r config set active-defrag-threshold-lower 5
+                r config set active-defrag-cycle-min 65
+                r config set active-defrag-cycle-max 75
+                r config set active-defrag-ignore-bytes 1mb
+                r config set maxmemory 0
+                set expected_frag 1.3
+
+                r debug mallctl-str thread.tcache.flush VOID
+                # fill the first slab containing 32 regs of 640 bytes.
+                for {set j 0} {$j < 32} {incr j} {
+                    r setrange "_$j" 600 x
+                    r debug mallctl-str thread.tcache.flush VOID
+                }
+
+                # add a mass of keys with 600 byte values, filling the bin of 640 bytes which has 32 regs per slab.
+                set rd [redis_deferring_client]
+                set keys 640000
+                for {set j 0} {$j < $keys} {incr j} {
+                    $rd setrange $j 600 x
+                }
+                for {set j 0} {$j < $keys} {incr j} {
+                    $rd read ; # Discard replies
+                }
+
+                # create some fragmentation of 50%
+                set sent 0
+                for {set j 0} {$j < $keys} {incr j 1} {
+                    $rd del $j
+                    incr sent
+                    incr j 1
+                }
+                for {set j 0} {$j < $sent} {incr j} {
+                    $rd read ; # Discard replies
+                }
+
+                # create higher fragmentation in the first slab
+                for {set j 10} {$j < 32} {incr j} {
+                    r del "_$j"
+                }
+
+                # start defrag
+                after 120 ;# serverCron only updates the info once in 100ms
+                set frag [s allocator_frag_ratio]
+                if {$::verbose} {
+                    puts "frag $frag"
+                }
+
+                assert {$frag >= $expected_frag}
+
+                set digest [r debug digest]
+                catch {r config set activedefrag yes} e
+                if {![string match {DISABLED*} $e]} {
+                    # wait for the active defrag to start working (decision once a second)
+                    wait_for_condition 50 100 {
+                        [s active_defrag_running] ne 0
+                    } else {
+                        fail "defrag not started."
+                    }
+
+                    # wait for the active defrag to stop working
+                    wait_for_condition 500 100 {
+                        [s active_defrag_running] eq 0
+                    } else {
+                        after 120 ;# serverCron only updates the info once in 100ms
+                        puts [r info memory]
+                        puts [r info stats]
+                        puts [r memory malloc-stats]
+                        fail "defrag didn't stop."
+                    }
+
+                    # test that the fragmentation is lower
+                    after 120 ;# serverCron only updates the info once in 100ms
+                    set misses [s active_defrag_misses]
+                    set hits [s active_defrag_hits]
+                    set frag [s allocator_frag_ratio]
+                    if {$::verbose} {
+                        puts "frag $frag"
+                        puts "hits: $hits"
+                        puts "misses: $misses"
+                    }
+                    assert {$frag < 1.1}
+                    assert {$misses < 10000000} ;# when defrag doesn't stop, we have some 30m misses, when it does, we have 2m misses
+                }
+
+                # verify the data isn't corrupted or changed
+                set newdigest [r debug digest]
+                assert {$digest eq $newdigest}
+                r save ;# saving an rdb iterates over all the data / pointers
+            }
+        }
     }
 }
 } ;# run_solo
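
A final note on the test's repeated `r debug mallctl-str thread.tcache.flush VOID`: it
flushes the connection thread's tcache so each SETRANGE really draws from (and each DEL
really frees back to) the slab being staged, keeping the per-slab free counts that
iget_defrag_hint reads accurate. A minimal standalone sketch of the same control from C,
via jemalloc's public mallctl() API (in some builds the symbol carries a je_ prefix,
e.g. je_mallctl):

    #include <stdio.h>
    #include <stdlib.h>
    #include <jemalloc/jemalloc.h>

    int main(void) {
        void *p = malloc(600);  /* lands in the 640-byte bin (32 regs/slab) */
        free(p);                /* the region may be parked in the tcache   */

        /* release everything this thread has cached back to its slabs, so
         * slab utilization stats reflect reality */
        if (mallctl("thread.tcache.flush", NULL, NULL, NULL, 0) != 0) {
            fprintf(stderr, "mallctl(thread.tcache.flush) failed\n");
            return 1;
        }
        return 0;
    }

Without the flush, freed regions linger in the per-thread cache and their slabs still
look fully used, which would make the carefully staged 10/32-versus-16/32 slab layout
unreliable.
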