Handle recursive serverAsserts and provide more information for recursive segfaults (#12857)

This change makes two failure modes easier to diagnose:
1. If a serverPanic or serverAssert occurs during the info (or module)
printing, it will recursively panic, which is a lot of fun as it will
just keep recursively printing. It will eventually stack overflow, but
will generate a lot of text in the process.
2. When a segfault happens during the segfault handler, no information
is communicated other than it happened. This can be problematic because
`info` may help diagnose the real issue, but without fixing the
recursive crash it might be hard to get at that info.
This commit is contained in:
Madelyn Olson 2024-01-02 18:20:22 -08:00 committed by GitHub
parent c3f8b542ee
commit 068051e378
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 138 additions and 27 deletions

View File

@ -55,4 +55,5 @@ $TCLSH tests/test_helper.tcl \
--single unit/moduleapi/async_rm_call \
--single unit/moduleapi/moduleauth \
--single unit/moduleapi/rdbloadsave \
--single unit/moduleapi/crash \
"${@}"

View File

@ -72,10 +72,10 @@ static pthread_mutex_t signal_handler_lock;
static pthread_mutexattr_t signal_handler_lock_attr;
static volatile int signal_handler_lock_initialized = 0;
/* Forward declarations */
void bugReportStart(void);
int bugReportStart(void);
void printCrashReport(void);
void bugReportEnd(int killViaSignal, int sig);
void logStackTrace(void *eip, int uplevel);
void logStackTrace(void *eip, int uplevel, int current_thread);
void dbGetStats(char *buf, size_t bufsize, redisDb *db, int full, dbKeyType keyType);
void sigalrmSignalHandler(int sig, siginfo_t *info, void *secret);
@ -1031,15 +1031,17 @@ NULL
__attribute__ ((noinline))
void _serverAssert(const char *estr, const char *file, int line) {
bugReportStart();
serverLog(LL_WARNING,"=== ASSERTION FAILED ===");
int new_report = bugReportStart();
serverLog(LL_WARNING,"=== %sASSERTION FAILED ===", new_report ? "" : "RECURSIVE ");
serverLog(LL_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
if (server.crashlog_enabled) {
#ifdef HAVE_BACKTRACE
logStackTrace(NULL, 1);
logStackTrace(NULL, 1, 0);
#endif
printCrashReport();
/* If this was a recursive assertion, it was most likely generated
 * from printCrashReport. */
if (new_report) printCrashReport();
}
// remove the signal handler so on abort() we will output the crash report.
@ -1127,16 +1129,18 @@ void _serverPanic(const char *file, int line, const char *msg, ...) {
vsnprintf(fmtmsg,sizeof(fmtmsg),msg,ap);
va_end(ap);
bugReportStart();
int new_report = bugReportStart();
serverLog(LL_WARNING,"------------------------------------------------");
serverLog(LL_WARNING,"!!! Software Failure. Press left mouse button to continue");
serverLog(LL_WARNING,"Guru Meditation: %s #%s:%d",fmtmsg,file,line);
if (server.crashlog_enabled) {
#ifdef HAVE_BACKTRACE
logStackTrace(NULL, 1);
logStackTrace(NULL, 1, 0);
#endif
printCrashReport();
/* If this was a recursive panic, it was most likely generated
 * from printCrashReport. */
if (new_report) printCrashReport();
}
// remove the signal handler so on abort() we will output the crash report.
@ -1144,14 +1148,18 @@ void _serverPanic(const char *file, int line, const char *msg, ...) {
bugReportEnd(0, 0);
}
void bugReportStart(void) {
/* Start a bug report, returning 1 if this is the first time this function was called, 0 otherwise. */
int bugReportStart(void) {
    int first_report = 0;
    pthread_mutex_lock(&bug_report_start_mutex);
    if (!bug_report_start) {
        /* First caller owns the report header; later callers (recursive
         * asserts/panics) get 0 so they don't open a second report. */
        serverLogRaw(LL_WARNING|LL_RAW,
        "\n\n=== REDIS BUG REPORT START: Cut & paste starting from here ===\n");
        bug_report_start = 1;
        first_report = 1;
    }
    /* Single unlock point regardless of which branch was taken. */
    pthread_mutex_unlock(&bug_report_start_mutex);
    return first_report;
}
#ifdef HAVE_BACKTRACE
@ -1895,9 +1903,9 @@ static void writeStacktraces(int fd, int uplevel) {
}
#else /* __linux__*/
#endif /* __linux__ */
__attribute__ ((noinline))
static void writeStacktraces(int fd, int uplevel) {
static void writeCurrentThreadsStackTrace(int fd, int uplevel) {
void *trace[BACKTRACE_MAX_SIZE];
int trace_size = backtrace(trace, BACKTRACE_MAX_SIZE);
@ -1906,7 +1914,6 @@ static void writeStacktraces(int fd, int uplevel) {
if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. */};
backtrace_symbols_fd(trace+uplevel, trace_size-uplevel, fd);
}
#endif /* __linux__ */
/* Logs the stack trace using the backtrace() call. This function is designed
* to be called from signal handlers safely.
@ -1916,7 +1923,7 @@ static void writeStacktraces(int fd, int uplevel) {
* __attribute__ ((noinline)) to make sure the compiler won't inline them.
*/
__attribute__ ((noinline))
void logStackTrace(void *eip, int uplevel) {
void logStackTrace(void *eip, int uplevel, int current_thread) {
int fd = openDirectLogFiledes();
char *msg;
uplevel++; /* skip this function */
@ -1935,7 +1942,17 @@ void logStackTrace(void *eip, int uplevel) {
/* Write symbols to log file */
++uplevel;
writeStacktraces(fd, uplevel);
#ifdef __linux__
if (current_thread) {
writeCurrentThreadsStackTrace(fd, uplevel);
} else {
writeStacktraces(fd, uplevel);
}
#else
/* Outside of linux, we only support writing the current thread. */
UNUSED(current_thread);
writeCurrentThreadsStackTrace(fd, uplevel);
#endif
msg = "\n------ STACK TRACE DONE ------\n";
if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. */};
@ -2218,15 +2235,14 @@ __attribute__ ((noinline))
static void sigsegvHandler(int sig, siginfo_t *info, void *secret) {
UNUSED(secret);
UNUSED(info);
int print_full_crash_info = 1;
/* Check if it is safe to enter the signal handler. second thread crashing at the same time will deadlock. */
if(pthread_mutex_lock(&signal_handler_lock) == EDEADLK) {
/* If this thread already owns the lock (meaning we crashed during handling a signal)
* log that the crash report can't be generated. */
/* If this thread already owns the lock (meaning we crashed during handling a signal) switch
* to printing the minimal information about the crash. */
serverLogRawFromHandler(LL_WARNING,
"Crashed running signal handler. Can't continue to generate the crash report");
/* gracefully exit */
bugReportEnd(1, sig);
return;
"Crashed running signal handler. Providing reduced version of recursive crash report.");
print_full_crash_info = 0;
}
bugReportStart();
@ -2260,7 +2276,9 @@ static void sigsegvHandler(int sig, siginfo_t *info, void *secret) {
getAndSetMcontextEip(uc, ptr);
}
logStackTrace(eip, 1);
/* When printing the reduced crash info, just print the current thread
* to avoid race conditions with the multi-threaded stack collector. */
logStackTrace(eip, 1, !print_full_crash_info);
if (eip == info->si_addr) {
/* Restore old eip */
@ -2270,7 +2288,7 @@ static void sigsegvHandler(int sig, siginfo_t *info, void *secret) {
logRegisters(uc);
#endif
printCrashReport();
if (print_full_crash_info) printCrashReport();
#ifdef HAVE_BACKTRACE
if (eip != NULL)
@ -2430,7 +2448,7 @@ void sigalrmSignalHandler(int sig, siginfo_t *info, void *secret) {
serverLogRawFromHandler(LL_WARNING, "\nReceived SIGALRM");
}
#ifdef HAVE_BACKTRACE
logStackTrace(getAndSetMcontextEip(uc, NULL), 1);
logStackTrace(getAndSetMcontextEip(uc, NULL), 1, 0);
#else
serverLogRawFromHandler(LL_WARNING,"Sorry: no support for backtrace().");
#endif

View File

@ -4,10 +4,10 @@ uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
warning_cflags = -W -Wall -Wno-missing-field-initializers
ifeq ($(uname_S),Darwin)
SHOBJ_CFLAGS ?= $(warning_cflags) -dynamic -fno-common -g -ggdb -std=c99 -O2
SHOBJ_CFLAGS ?= $(warning_cflags) -dynamic -fno-common -g -ggdb -std=gnu11 -O2
SHOBJ_LDFLAGS ?= -bundle -undefined dynamic_lookup
else # Linux, others
SHOBJ_CFLAGS ?= $(warning_cflags) -fno-common -g -ggdb -std=c99 -O2
SHOBJ_CFLAGS ?= $(warning_cflags) -fno-common -g -ggdb -std=gnu11 -O2
SHOBJ_LDFLAGS ?= -shared
endif
@ -62,7 +62,8 @@ TEST_MODULES = \
usercall.so \
postnotifications.so \
moduleauthtwo.so \
rdbloadsave.so
rdbloadsave.so \
crash.so
.PHONY: all

39
tests/modules/crash.c Normal file
View File

@ -0,0 +1,39 @@
#include "redismodule.h"
#include <strings.h>
#include <sys/mman.h>
#define UNUSED(V) ((void) V)
/* INFO section callback that always fails its assertion. Per the commit
 * description, the crash report itself prints module INFO sections, so this
 * callback asserts once when INFO is requested and again while the crash
 * report is being generated — exercising the RECURSIVE-assert handling. */
void assertCrash(RedisModuleInfoCtx *ctx, int for_crash_report) {
UNUSED(ctx);
UNUSED(for_crash_report);
RedisModule_Assert(0);
}
/* INFO section callback that deliberately triggers a segmentation fault by
 * writing through a read-only anonymous mapping. */
void segfaultCrash(RedisModuleInfoCtx *ctx, int for_crash_report) {
UNUSED(ctx);
UNUSED(for_crash_report);
/* Compiler gives warnings about writing to a random address
 * e.g "*((char*)-1) = 'x';". As a workaround, we map a read-only area
 * and try to write there to trigger segmentation fault. */
char *p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
/* NOTE(review): if mmap fails it returns MAP_FAILED ((void *)-1); writing
 * through that address also faults, which is the desired outcome here, so
 * the return value is intentionally unchecked. */
*p = 'x';
}
/* Module entry point. Expects exactly one load-time argument choosing the
 * crash mode: "segfault" registers an INFO callback that dereferences a
 * read-only mapping; "assert" registers one that fails an assertion.
 * Returns REDISMODULE_ERR for any other argument, refusing to load.
 * Fix: the mode string was fetched twice via RedisModule_StringPtrLen; it is
 * now read once into a local. */
int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
    REDISMODULE_NOT_USED(argv);
    REDISMODULE_NOT_USED(argc);
    if (RedisModule_Init(ctx,"infocrash",1,REDISMODULE_APIVER_1)
        == REDISMODULE_ERR) return REDISMODULE_ERR;
    /* The test harness always loads this module with one argument. */
    RedisModule_Assert(argc == 1);
    /* Read the mode string once instead of once per comparison. */
    const char *mode = RedisModule_StringPtrLen(argv[0], NULL);
    if (!strcasecmp(mode, "segfault")) {
        if (RedisModule_RegisterInfoFunc(ctx, segfaultCrash) == REDISMODULE_ERR) return REDISMODULE_ERR;
    } else if (!strcasecmp(mode, "assert")) {
        if (RedisModule_RegisterInfoFunc(ctx, assertCrash) == REDISMODULE_ERR) return REDISMODULE_ERR;
    } else {
        /* Unknown crash mode: fail the load. */
        return REDISMODULE_ERR;
    }
    return REDISMODULE_OK;
}

View File

@ -0,0 +1,52 @@
# This file is used to test certain crash edge cases to make sure they produce
# correct stack traces for debugging.
set testmodule [file normalize tests/modules/crash.so]
# Valgrind will complain that the process terminated by a signal, skip it.
if {!$::valgrind} {
start_server {tags {"modules"}} {
# "assert" mode: the module's INFO callback fails RedisModule_Assert, and
# fails it again while the crash report regenerates the INFO output.
r module load $testmodule assert
test {Test module crash when info crashes with an assertion } {
catch {r 0 info infocrash}
set res [wait_for_log_messages 0 {"*=== REDIS BUG REPORT START: Cut & paste starting from here ===*"} 0 10 1000]
set loglines [lindex $res 1]
set res [wait_for_log_messages 0 {"*ASSERTION FAILED*"} $loglines 10 1000]
set loglines [lindex $res 1]
# The nested assertion must be labeled RECURSIVE rather than looping.
set res [wait_for_log_messages 0 {"*RECURSIVE ASSERTION FAILED*"} $loglines 10 1000]
set loglines [lindex $res 1]
wait_for_log_messages 0 {"*=== REDIS BUG REPORT END. Make sure to include from START to END. ===*"} $loglines 10 1000
# Exactly one report START/END pair even though two assertions fired.
assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT END. Make sure to include from START to END. ==="]
assert_equal 2 [count_log_message 0 "ASSERTION FAILED"]
# There will be 3 crash assertions, 1 in the first stack trace and 2 in the second
assert_equal 3 [count_log_message 0 "assertCrash"]
assert_equal 1 [count_log_message 0 "RECURSIVE ASSERTION FAILED"]
assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT START: Cut & paste starting from here ==="]
}
}
start_server {tags {"modules"}} {
# "segfault" mode: the module's INFO callback segfaults, so a second fault
# happens inside the signal handler while the crash report prints INFO.
r module load $testmodule segfault
test {Test module crash when info crashes with a segfault} {
catch {r 0 info infocrash}
set res [wait_for_log_messages 0 {"*=== REDIS BUG REPORT START: Cut & paste starting from here ===*"} 0 10 1000]
set loglines [lindex $res 1]
set res [wait_for_log_messages 0 {"*Crashed running the instruction at*"} $loglines 10 1000]
set loglines [lindex $res 1]
# The nested fault must downgrade to the reduced crash report instead of
# terminating with no information.
set res [wait_for_log_messages 0 {"*Crashed running signal handler. Providing reduced version of recursive crash report*"} $loglines 10 1000]
set loglines [lindex $res 1]
set res [wait_for_log_messages 0 {"*Crashed running the instruction at*"} $loglines 10 1000]
set loglines [lindex $res 1]
wait_for_log_messages 0 {"*=== REDIS BUG REPORT END. Make sure to include from START to END. ===*"} $loglines 10 1000
# Exactly one report, two faults, one reduced-report notice.
assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT END. Make sure to include from START to END. ==="]
assert_equal 1 [count_log_message 0 "Crashed running signal handler. Providing reduced version of recursive crash report"]
assert_equal 2 [count_log_message 0 "Crashed running the instruction at"]
assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT START: Cut & paste starting from here ==="]
}
}
}