Add new DEBUG dict-resizing command to disable the dict resize (#13043)

The test fails intermittently:
```
*** [err]: expire scan should skip dictionaries with lot's of empty buckets in tests/unit/expire.tcl
scan didn't handle slot skipping logic.
```

There are two cases:
1. When the test passes: we use a child process to avoid the dict
resize, but that cannot completely prevent it, since in dictDelete we
still have a chance to trigger a resize (by hitting the force ratio).
The reason our test passed before is that the expire dict was still
rehashing, so in dictDelete, dictShrinkIfNeeded could not trigger the
resize.

2. When the test fails: the expire dict has finished rehashing, so on
the last dictDelete, dictShrinkIfNeeded triggers the dict resize
because it hits the force ratio, and the slot-skipping logic fails.

This PR adds a new DEBUG command to disable dict resizing.
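For context, a rough sketch of why a live child process alone cannot freeze shrinking: the child only puts the dict into an "avoid resizing" mode, and an extremely sparse table still passes the force-ratio check. The constants and names below (HASHTABLE_MIN_FILL, DICT_FORCE_RESIZE_RATIO, the policy enum) are illustrative stand-ins, not the exact Redis source:

```c
/* Illustrative sketch only -- simplified from how Redis-style dicts
 * decide to shrink; constants and names here are assumptions. */
#include <stdio.h>

#define HASHTABLE_MIN_FILL      8  /* assumed: shrink below 1/8 fill */
#define DICT_FORCE_RESIZE_RATIO 4  /* assumed: overrides AVOID past this */

typedef enum { RESIZE_ENABLE, RESIZE_AVOID, RESIZE_FORBID } resize_policy;

/* AVOID (a child process is alive) only raises the bar; it does not
 * forbid shrinking outright, which is why the old test was flaky. */
static int should_shrink(resize_policy p, unsigned long used,
                         unsigned long size) {
    if (p == RESIZE_FORBID) return 0;
    if (p == RESIZE_ENABLE)
        return used * HASHTABLE_MIN_FILL < size;
    /* RESIZE_AVOID: shrink anyway once the table is extremely sparse. */
    return used * HASHTABLE_MIN_FILL * DICT_FORCE_RESIZE_RATIO < size;
}

int main(void) {
    /* 1 key left in 128 buckets: 1*8*4 = 32 < 128, so the "avoided"
     * shrink fires anyway -- the resize the test did not expect. */
    printf("%d\n", should_shrink(RESIZE_AVOID, 1, 128)); /* prints 1 */
    return 0;
}
```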
Binbin committed 2024-02-08 22:39:58 +08:00 (via GitHub)
parent 813327b231
commit 493e31e3ad
5 changed files with 24 additions and 30 deletions

src/debug.c

```diff
@@ -496,6 +496,8 @@ void debugCommand(client *c) {
 "    In case RESET is provided the peak reset time will be restored to the default value",
 "REPLYBUFFER RESIZING <0|1>",
 "    Enable or disable the reply buffer resize cron job",
+"DICT-RESIZING <0|1>",
+"    Enable or disable the main dict and expire dict resizing.",
 NULL
 };
 addExtendedReplyHelp(c, help, clusterDebugCommandExtendedHelp());
@@ -1021,6 +1023,9 @@ NULL
             return;
         }
         addReply(c, shared.ok);
+    } else if (!strcasecmp(c->argv[1]->ptr, "dict-resizing") && c->argc == 3) {
+        server.dict_resizing = atoi(c->argv[2]->ptr);
+        addReply(c, shared.ok);
     } else if(!handleDebugClusterCommand(c)) {
         addReplySubcommandSyntaxError(c);
         return;
```
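With this change, the tests can toggle resizing directly: `r debug dict-resizing 0` before the deletions, and `r debug dict-resizing 1` once the assertions are done, as the test diffs below show. Note that the handler parses its argument with atoi(), so 0 disables resizing and any non-zero integer re-enables it.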

src/server.c

```diff
@@ -429,6 +429,9 @@ uint64_t dictEncObjHash(const void *key) {
  * but to guarantee the performance of redis, we still allow dict to expand
  * if dict load factor exceeds HASHTABLE_MAX_LOAD_FACTOR. */
 int dictResizeAllowed(size_t moreMem, double usedRatio) {
+    /* for debug purposes: dict is not allowed to be resized. */
+    if (!server.dict_resizing) return 0;
+
     if (usedRatio <= HASHTABLE_MAX_LOAD_FACTOR) {
         return !overMaxmemoryAfterAlloc(moreMem);
     } else {
@@ -2079,6 +2082,7 @@ void initServerConfig(void) {
     server.next_client_id = 1; /* Client IDs, start from 1 .*/
     server.page_size = sysconf(_SC_PAGESIZE);
     server.pause_cron = 0;
+    server.dict_resizing = 1;
     server.latency_tracking_info_percentiles_len = 3;
     server.latency_tracking_info_percentiles = zmalloc(sizeof(double)*(server.latency_tracking_info_percentiles_len));
```
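To see the guard's effect in isolation, here is a minimal, self-contained sketch of the dictResizeAllowed logic above with the new veto in front; the stand-in server struct, the stubbed memory check, and the 1.618 bound are illustrative assumptions:

```c
/* Minimal sketch of the guard added above: when the debug flag is off,
 * dictResizeAllowed-style logic rejects every resize before any other
 * check. Struct and constant are stand-ins, not the real source. */
#include <stdbool.h>
#include <stdio.h>

#define HASHTABLE_MAX_LOAD_FACTOR 1.618 /* assumed bound, illustrative */

static struct { int dict_resizing; } server = { .dict_resizing = 1 };

static bool over_maxmemory_after_alloc(size_t more_mem) {
    (void)more_mem;
    return false; /* pretend we always have memory to spare */
}

static int dict_resize_allowed(size_t more_mem, double used_ratio) {
    if (!server.dict_resizing) return 0; /* debug veto wins first */
    if (used_ratio <= HASHTABLE_MAX_LOAD_FACTOR)
        return !over_maxmemory_after_alloc(more_mem);
    return 1; /* past the max load factor, expand regardless of memory */
}

int main(void) {
    server.dict_resizing = 0;                     /* DEBUG DICT-RESIZING 0 */
    printf("%d\n", dict_resize_allowed(64, 0.1)); /* 0: resize vetoed */
    server.dict_resizing = 1;                     /* DEBUG DICT-RESIZING 1 */
    printf("%d\n", dict_resize_allowed(64, 0.1)); /* 1: allowed again */
    return 0;
}
```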

src/server.h

```diff
@@ -1754,6 +1754,7 @@ struct redisServer {
     char *proc_title_template; /* Process title template format */
     clientBufferLimitsConfig client_obuf_limits[CLIENT_TYPE_OBUF_COUNT];
     int pause_cron; /* Don't run cron tasks (debug) */
+    int dict_resizing; /* Whether to allow main dict and expired dict to be resized (debug) */
     int latency_tracking_enabled; /* 1 if extended latency tracking is enabled, 0 otherwise. */
     double *latency_tracking_info_percentiles; /* Extended latency tracking info output percentile list configuration. */
     int latency_tracking_info_percentiles_len;
```

tests/unit/expire.tcl

```diff
@@ -853,9 +853,9 @@ start_cluster 1 0 {tags {"expire external:skip cluster slow"}} {
         # hashslot(key) is 12539
         r psetex key 500 val

-        # disable resizing
-        r config set rdb-key-save-delay 10000000
-        r bgsave
+        # disable resizing, the reason for not using slow bgsave is because
+        # it will hit the dict_force_resize_ratio.
+        r debug dict-resizing 0

         # delete data to have lot's (99%) of empty buckets (slot 12182 should be skipped)
         for {set j 1} {$j <= 99} {incr j} {
@@ -872,20 +872,16 @@ start_cluster 1 0 {tags {"expire external:skip cluster slow"}} {
             [r dbsize] eq 1
         } else {
             if {[r dbsize] eq 0} {
+                puts [r debug htstats 0]
                 fail "scan didn't handle slot skipping logic."
             } else {
+                puts [r debug htstats 0]
                 fail "scan didn't process all valid slots."
             }
         }

         # Enable resizing
-        r config set rdb-key-save-delay 0
-        catch {exec kill -9 [get_child_pid 0]}
-        wait_for_condition 1000 10 {
-            [s rdb_bgsave_in_progress] eq 0
-        } else {
-            fail "bgsave did not stop in time."
-        }
+        r debug dict-resizing 1

         # put some data into slot 12182 and trigger the resize
         r psetex "{foo}0" 500 a
```

tests/unit/other.tcl

```diff
@@ -438,9 +438,9 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} {
         }
         assert_match "*table size: 128*" [r debug HTSTATS 0]

-        # disable resizing
-        r config set rdb-key-save-delay 10000000
-        r bgsave
+        # disable resizing, the reason for not using slow bgsave is because
+        # it will hit the dict_force_resize_ratio.
+        r debug dict-resizing 0

         # delete data to have lot's (96%) of empty buckets
         for {set j 1} {$j <= 123} {incr j} {
@@ -449,13 +449,7 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} {

         assert_match "*table size: 128*" [r debug HTSTATS 0]
         # enable resizing
-        r config set rdb-key-save-delay 0
-        catch {exec kill -9 [get_child_pid 0]}
-        wait_for_condition 1000 10 {
-            [s rdb_bgsave_in_progress] eq 0
-        } else {
-            fail "bgsave did not stop in time."
-        }
+        r debug dict-resizing 1

         # waiting for serverCron to resize the tables
         wait_for_condition 1000 10 {
@@ -474,22 +468,16 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} {
             r set "{alice}$j" a
         }

-        # disable resizing
-        r config set rdb-key-save-delay 10000000
-        r bgsave
+        # disable resizing, the reason for not using slow bgsave is because
+        # it will hit the dict_force_resize_ratio.
+        r debug dict-resizing 0

         for {set j 1} {$j <= 123} {incr j} {
             r del "{alice}$j"
         }

         # enable resizing
-        r config set rdb-key-save-delay 0
-        catch {exec kill -9 [get_child_pid 0]}
-        wait_for_condition 1000 10 {
-            [s rdb_bgsave_in_progress] eq 0
-        } else {
-            fail "bgsave did not stop in time."
-        }
+        r debug dict-resizing 1

         # waiting for serverCron to resize the tables
         wait_for_condition 1000 10 {
```