Prevent lua error_reply abuse from causing errorstats to become larger (#13141)

Users who abuse lua error_reply will generate a new error object on each error call, which can make server.errors get bigger and bigger. This will cause the server to block when calling INFO (we also return errorstats by default). To prevent the damage it can cause, when a misuse is detected, we will print a warning log and disable the errorstats to avoid adding more new errors. It can be re-enabled via CONFIG RESETSTAT. Because server.errors may be very large (it may be better now since we have the limit), config resetstat may block for a while. So in resetErrorTableStats, we will try to lazyfree server.errors. See the related discussion at the end of #8217.
2024-03-19 14:18:22 +08:00 · 2024-03-19 14:18:22 +08:00 · e04d41d78d
parent aeada20140
commit e04d41d78d
4 changed files with 87 additions and 1 deletions
--- a/src/lazyfree.c
+++ b/src/lazyfree.c
@ -39,6 +39,15 @@ void lazyFreeTrackingTable(void *args[]) {
    atomicIncr(lazyfreed_objects,len);
 }

+/* Release the error stats rax tree. */
+void lazyFreeErrors(void *args[]) {
+    rax *errors = args[0];
+    size_t len = errors->numele;
+    raxFreeWithCallback(errors, zfree);
+    atomicDecr(lazyfree_objects,len);
+    atomicIncr(lazyfreed_objects,len);
+}
+
 /* Release the lua_scripts dict. */
 void lazyFreeLuaScripts(void *args[]) {
    dict *lua_scripts = args[0];
@ -202,6 +211,18 @@ void freeTrackingRadixTreeAsync(rax *tracking) {
    }
 }

+/* Free the error stats rax tree.
+ * If the rax tree is huge enough, free it in async way. */
+void freeErrorsRadixTreeAsync(rax *errors) {
+    /* Because this rax has only keys and no values so we use numnodes. */
+    if (errors->numnodes > LAZYFREE_THRESHOLD) {
+        atomicIncr(lazyfree_objects,errors->numele);
+        bioCreateLazyFreeJob(lazyFreeErrors,1,errors);
+    } else {
+        raxFreeWithCallback(errors, zfree);
+    }
+}
+
 /* Free lua_scripts dict and lru list, if the dict is huge enough, free them in async way.
 * Close lua interpreter, if there are a lot of lua scripts, close it in async way. */
 void freeLuaScriptsAsync(dict *lua_scripts, list *lua_scripts_lru_list, lua_State *lua) {
--- a/src/server.c
+++ b/src/server.c
@ -2605,6 +2605,7 @@ void initServer(void) {
    server.main_thread_id = pthread_self();
    server.current_client = NULL;
    server.errors = raxNew();
+    server.errors_enabled = 1;
    server.execution_nesting = 0;
    server.clients = listCreate();
    server.clients_index = raxNew();
@ -3109,8 +3110,9 @@ void resetCommandTableStats(dict* commands) {
 }

 void resetErrorTableStats(void) {
-    raxFreeWithCallback(server.errors, zfree);
+    freeErrorsRadixTreeAsync(server.errors);
    server.errors = raxNew();
+    server.errors_enabled = 1;
 }

 /* ========================== Redis OP Array API ============================ */
@ -4201,9 +4203,49 @@ int processCommand(client *c) {

 /* ====================== Error lookup and execution ===================== */

+/* Users who abuse lua error_reply will generate a new error object on each
+ * error call, which can make server.errors get bigger and bigger. This will
+ * cause the server to block when calling INFO (we also return errorstats by
+ * default). To prevent the damage it can cause, when a misuse is detected,
+ * we will print the warning log and disable the errorstats to avoid adding
+ * more new errors. It can be re-enabled via CONFIG RESETSTAT. */
+#define ERROR_STATS_NUMBER 128
 void incrementErrorCount(const char *fullerr, size_t namelen) {
+    /* errorstats is disabled, return ASAP. */
+    if (!server.errors_enabled) return;
+
    void *result;
    if (!raxFind(server.errors,(unsigned char*)fullerr,namelen,&result)) {
+        if (server.errors->numele >= ERROR_STATS_NUMBER) {
+            sds errors = sdsempty();
+            raxIterator ri;
+            raxStart(&ri, server.errors);
+            raxSeek(&ri, "^", NULL, 0);
+            while (raxNext(&ri)) {
+                char *tmpsafe;
+                errors = sdscatlen(errors, getSafeInfoString((char *)ri.key, ri.key_len, &tmpsafe), ri.key_len);
+                errors = sdscatlen(errors, ", ", 2);
+                if (tmpsafe != NULL) zfree(tmpsafe);
+            }
+            sdsrange(errors, 0, -3); /* Remove final ", ". */
+            raxStop(&ri);
+
+            /* Print the warning log and the contents of server.errors to the log. */
+            serverLog(LL_WARNING,
+                      "Errorstats stopped adding new errors because the number of "
+                      "errors reached the limit, may be misuse of lua error_reply, "
+                      "please check INFO ERRORSTATS, this can be re-enabled via "
+                      "CONFIG RESETSTAT.");
+            serverLog(LL_WARNING, "Current errors code list: %s", errors);
+            sdsfree(errors);
+
+            /* Reset the errors and add a single element to indicate that it is disabled. */
+            resetErrorTableStats();
+            incrementErrorCount("ERRORSTATS_DISABLED", 19);
+            server.errors_enabled = 0;
+            return;
+        }
+
        struct redisError *error = zmalloc(sizeof(*error));
        error->count = 1;
        raxInsert(server.errors,(unsigned char*)fullerr,namelen,error,NULL);
--- a/src/server.h
+++ b/src/server.h
@ -1563,6 +1563,7 @@ struct redisServer {
    dict *orig_commands;        /* Command table before command renaming. */
    aeEventLoop *el;
    rax *errors;                /* Errors table */
+    int errors_enabled;         /* If true, errorstats is enabled, and we will add new errors. */
    unsigned int lruclock; /* Clock for LRU eviction */
    volatile sig_atomic_t shutdown_asap; /* Shutdown ordered by signal handler. */
    mstime_t shutdown_mstime;   /* Timestamp to limit graceful shutdown. */
@ -2703,6 +2704,7 @@ void trackingHandlePendingKeyInvalidations(void);
 void trackingInvalidateKeysOnFlush(int async);
 void freeTrackingRadixTree(rax *rt);
 void freeTrackingRadixTreeAsync(rax *rt);
+void freeErrorsRadixTreeAsync(rax *errors);
 void trackingLimitUsedSlots(void);
 uint64_t trackingGetTotalItems(void);
 uint64_t trackingGetTotalKeys(void);
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@ -274,6 +274,27 @@ start_server {tags {"info" "external:skip"}} {
            $rd close
        }

+        test {errorstats: limit errors will not increase indefinitely} {
+            r config resetstat
+            for {set j 1} {$j <= 1100} {incr j} {
+                assert_error "$j my error message" {
+                    r eval {return redis.error_reply(string.format('%s my error message', ARGV[1]))} 0 $j
+                }
+            }
+
+            assert_equal [count_log_message 0 "Errorstats stopped adding new errors"] 1
+            assert_equal [count_log_message 0 "Current errors code list"] 1
+            assert_equal "count=1" [errorstat ERRORSTATS_DISABLED]
+
+            # Since we currently have no metrics exposed for server.errors, we use lazyfree
+            # to verify that we only have 128 errors.
+            wait_for_condition 50 100 {
+                [s lazyfreed_objects] eq 128
+            } else {
+                fail "errorstats resetstat lazyfree error"
+            }
+        }
+
        test {stats: eventloop metrics} {
            set info1 [r info stats]
            set cycle1 [getInfoProperty $info1 eventloop_cycles]