Initial changes to issue #2427.

2015-07-22 12:45:14 +02:00 · 2015-07-22 12:45:14 +02:00 · 81428a24a8
parent eb706b4202
commit 81428a24a8
3 changed files with 55 additions and 11 deletions
--- a/redis.conf
+++ b/redis.conf
@ -318,8 +318,34 @@ repl-diskless-sync no
 # it entirely just set it to 0 seconds and the transfer will start ASAP.
 repl-diskless-sync-delay 5

-# Enable diskless replication on slave side. 
-# Load RDB directly from the socket rather than saving it to disk first.
+# Enable diskless replication on slave side.
+#
+# When this option is on, the slave loads the RDB directly from the socket
+# rather than saving it to disk first. However there are data loss risks
+# associated with this feature, so make sure to read the following WARNING
+# section.
+#
+# WARNING: Note that this means that the dataset in the slave gets flushed
+# before the slave is actually sure the RDB transfer is complete, so if the
+# replication link is disconnected after the slave already flushed away its
+# dataset, but before successfully loading the new one, the slave will
+# remain empty (for all the time needed to attempt a new synchronization with
+# the master).
+#
+# This means that you should carefully consider the effects of this feature
+# on slaves that may be promoted to masters:
+#
+# 1) Sentinel checks the disconnection time and the offset of slaves before
+#    promotion. However it is possible that after the check, the slave
+#    attempts to connect with the master again and flushes its dataset.
+#    In order to run Sentinel safely in this setup, make sure to enable
+#    the "slave-protected-restart" option.
+#
+# 2) Redis Cluster slaves will refuse to try to be promoted to masters if
+#    the dataset was flushed, so this is safe in the context of Redis Cluster.
+#
+# 3) If you are using your own HA setup, make sure to enable the
+#    "slave-protected-restart" option.
 repl-diskless-load no

 # Slaves send PINGs to server in a predefined interval. It's possible to change
--- a/src/replication.c
+++ b/src/replication.c
@ -441,9 +441,14 @@ need_full_resync:
 * socket target depending on the configuration, and making sure that
 * the script cache is flushed before to start.
 *
- * Returns REDIS_OK on success or REDIS_ERR otherwise. */
-int startBgsaveForReplication(int use_eof) {
+ * Returns REDIS_OK on success or REDIS_ERR otherwise.
+ *
+ * The caller should pass '1' as the function argument if all the slaves
+ * currently waiting for a BGSAVE claimed to support the EOF-style
+ * streaming format for RDB transfer. Otherwise it should be '0'. */
+int startBgsaveForReplication(int all_slaves_supprot_eof) {
    int retval;
+    int use_eof = all_slaves_support_eof && server.repl_diskless_sync;

    redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC with target: %s",
        use_eof ? "slaves sockets" : "disk");
@ -808,7 +813,8 @@ void updateSlavesWaitingBgsave(int bgsaveerr, int type) {
        }
    }
    if (slaves_waiting_eof || slaves_waiting_noneof) {
-        /* if there is at least one slave that doesn't support EOF, we'll start an non-eof replication */
+        /* if there is at least one slave that doesn't support EOF, we'll
+         * start a non-eof replication */
        if (startBgsaveForReplication(slaves_waiting_noneof==0) != REDIS_OK) {
            listIter li;

@ -1054,6 +1060,17 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
            redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
            replicationAbortSyncTransfer();
            rioFreeFd(&rdb, NULL);
+            /* Remove the half-loaded data, and load back the old dataset
+             * if we have persistence turned on.
+             *
+             * TODO:
+             * 1) Actually allow rdbLoadRio() to fail without calling exit().
+             * 2) Load RDB / AOF.
+             *
+             * Right now this code path is not entered when the connection
+             * breaks between master and slave AFAIK.
+             */
+            emptyDb(NULL);
            return;
        }
        if (usemark) {
@ -1379,7 +1396,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
        }
        sdsfree(err);
    }
-    
+
    /* Inform the master that this slave supports EOF marker of diskless-sync */
    {
        err = sendSynchronousCommand(fd,"REPLCONF","eof-supported","yes",
@ -2174,9 +2191,10 @@ void replicationCron(void) {

        if ((slaves_waiting_eof || slaves_waiting_noneof) && max_idle > server.repl_diskless_sync_delay) {
            /* Start a BGSAVE. Usually with socket target, or with disk target
-             * if there was a recent socket -> disk config change. 
-             * if there is at least one slave that doesn't support EOF, we'll start an non-eof replication */
-            if (startBgsaveForReplication(slaves_waiting_noneof==0) == REDIS_OK) {
+             * if there was a recent socket -> disk config change.
+             * If there is at least one slave that doesn't support EOF, we'll
+             * start a non-eof replication */
+            if (startBgsaveForReplication(slaves_waiting_noneof==0) == REDIS_OK){
                /* It started! We need to change the state of slaves
                 * from WAIT_BGSAVE_START to WAIT_BGSAVE_END in case
                 * the current target is disk. Otherwise it was already done
--- a/src/rio.c
+++ b/src/rio.c
@ -173,13 +173,13 @@ static size_t rioFdRead(rio *r, void *buf, size_t len) {
    /* if the buffer is too small for the entire request: realloc */
    if (sdslen(r->io.fd.buf) + sdsavail(r->io.fd.buf) < len)
        r->io.fd.buf = sdsMakeRoomFor(r->io.fd.buf, len - sdslen(r->io.fd.buf));
-        
+
    /* if the remaining unused buffer is not large enough: memmove so that we can read the rest */
    if (len > avail && sdsavail(r->io.fd.buf) < len - avail) {
        sdsrange(r->io.fd.buf, r->io.fd.pos, -1);
        r->io.fd.pos = 0;
    }
-    
+
    /* if we don't already have all the data in the sds, read more */
    while (len > sdslen(r->io.fd.buf) - r->io.fd.pos) {
        size_t toread = len - (sdslen(r->io.fd.buf) - r->io.fd.pos);