diff --git a/src/networking.c b/src/networking.c index d3b0cd241..7ac8ed1c4 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1063,6 +1063,25 @@ void disconnectSlaves(void) { } } +/* Check if there is any other slave waiting dumping RDB finished expect me. + * This function is useful to judge current dumping RDB can be used for full + * synchronization or not. */ +int anyOtherSlaveWaitRdb(client *except_me) { + listIter li; + listNode *ln; + + listRewind(server.slaves, &li); + while((ln = listNext(&li))) { + client *slave = ln->value; + if (slave != except_me && + slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) + { + return 1; + } + } + return 0; +} + /* Remove the specified client from global lists where the client could * be referenced, not including the Pub/Sub channels. * This is used by freeClient() and replicationCacheMaster(). */ @@ -1213,6 +1232,21 @@ void freeClient(client *c) { /* Master/slave cleanup Case 1: * we lost the connection with a slave. */ if (c->flags & CLIENT_SLAVE) { + /* If there is no any other slave waiting dumping RDB finished, the + * current child process need not continue to dump RDB, then we kill it. + * So child process won't use more memory, and we also can fork a new + * child process asap to dump rdb for next full synchronization or bgsave. + * But we also need to check if users enable 'save' RDB, if enable, we + * should not remove directly since that means RDB is important for users + * to keep data safe and we may delay configured 'save' for full sync. */ + if (server.saveparamslen == 0 && + c->replstate == SLAVE_STATE_WAIT_BGSAVE_END && + server.rdb_child_pid != -1 && + server.rdb_child_type == RDB_CHILD_TYPE_DISK && + anyOtherSlaveWaitRdb(c) == 0) + { + killRDBChild(); + } if (c->replstate == SLAVE_STATE_SEND_BULK) { if (c->repldbfd != -1) close(c->repldbfd); if (c->replpreamble) sdsfree(c->replpreamble); diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index 5438b0eba..d2c8b8e56 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -752,3 +752,64 @@ test {replicaof right after disconnection} { } } } + +test {Kill rdb child process if its dumping RDB is not useful} { + start_server {tags {"repl"}} { + set slave1 [srv 0 client] + start_server {} { + set slave2 [srv 0 client] + start_server {} { + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + for {set i 0} {$i < 10} {incr i} { + $master set $i $i + } + # Generating RDB will cost 10s(10 * 1s) + $master config set rdb-key-save-delay 1000000 + $master config set repl-diskless-sync no + $master config set save "" + + $slave1 slaveof $master_host $master_port + $slave2 slaveof $master_host $master_port + + # Wait for starting child + wait_for_condition 50 100 { + [s 0 rdb_bgsave_in_progress] == 1 + } else { + fail "rdb child didn't start" + } + + # Slave1 disconnect with master + $slave1 slaveof no one + # Shouldn't kill child since another slave wait for rdb + after 100 + assert {[s 0 rdb_bgsave_in_progress] == 1} + + # Slave2 disconnect with master + $slave2 slaveof no one + # Should kill child + wait_for_condition 20 10 { + [s 0 rdb_bgsave_in_progress] eq 0 + } else { + fail "can't kill rdb child" + } + + # If have save parameters, won't kill child + $master config set save "900 1" + $slave1 slaveof $master_host $master_port + $slave2 slaveof $master_host $master_port + wait_for_condition 50 100 { + [s 0 rdb_bgsave_in_progress] == 1 + } else { + fail "rdb child didn't start" + } + $slave1 slaveof no one + $slave2 slaveof no one + after 200 + assert {[s 0 rdb_bgsave_in_progress] == 1} + catch {$master shutdown nosave} + } + } + } +}