From 11436b144980037b75d1c3184d412377dd2e71ab Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 21 Jan 2016 16:57:35 +0100 Subject: [PATCH] Cluster announce ip / port initial implementation. --- src/cluster.c | 117 +++++++++++++++++++++++++++++++++++++------------- src/cluster.h | 15 ++++--- src/config.c | 24 +++++++++++ src/server.c | 5 ++- src/server.h | 8 +++- 5 files changed, 127 insertions(+), 42 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 51cd3e216..7fa120e87 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -670,6 +670,7 @@ clusterNode *createClusterNode(char *nodename, int flags) { node->link = NULL; memset(node->ip,0,sizeof(node->ip)); node->port = 0; + node->cport = 0; node->fail_reports = listCreate(); node->voted_time = 0; node->orphaned_time = 0; @@ -1212,7 +1213,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) { /* Return true if we already have a node in HANDSHAKE state matching the * specified ip address and port number. This function is used in order to * avoid adding a new handshake node for the same address multiple times. */ -int clusterHandshakeInProgress(char *ip, int port) { +int clusterHandshakeInProgress(char *ip, int port, int cport) { dictIterator *di; dictEntry *de; @@ -1221,7 +1222,9 @@ int clusterHandshakeInProgress(char *ip, int port) { clusterNode *node = dictGetVal(de); if (!nodeInHandshake(node)) continue; - if (!strcasecmp(node->ip,ip) && node->port == port) break; + if (!strcasecmp(node->ip,ip) && + node->port == port && + node->cport == cport) break; } dictReleaseIterator(di); return de != NULL; @@ -1234,7 +1237,7 @@ int clusterHandshakeInProgress(char *ip, int port) { * * EAGAIN - There is already an handshake in progress for this address. * EINVAL - IP or port are not valid. */ -int clusterStartHandshake(char *ip, int port) { +int clusterStartHandshake(char *ip, int port, int cport) { clusterNode *n; char norm_ip[NET_IP_STR_LEN]; struct sockaddr_storage sa; @@ -1254,7 +1257,7 @@ int clusterStartHandshake(char *ip, int port) { } /* Port sanity check */ - if (port <= 0 || port > (65535-CLUSTER_PORT_INCR)) { + if (port <= 0 || port > 65535 || cport <= 0 || cport > 65535) { errno = EINVAL; return 0; } @@ -1271,7 +1274,7 @@ int clusterStartHandshake(char *ip, int port) { (void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr), norm_ip,NET_IP_STR_LEN); - if (clusterHandshakeInProgress(norm_ip,port)) { + if (clusterHandshakeInProgress(norm_ip,port,cport)) { errno = EAGAIN; return 0; } @@ -1282,6 +1285,7 @@ int clusterStartHandshake(char *ip, int port) { n = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_MEET); memcpy(n->ip,norm_ip,sizeof(n->ip)); n->port = port; + n->cport = cport; clusterAddNode(n); return 1; } @@ -1301,10 +1305,11 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { sds ci; ci = representClusterNodeFlags(sdsempty(), flags); - serverLog(LL_DEBUG,"GOSSIP %.40s %s:%d %s", + serverLog(LL_DEBUG,"GOSSIP %.40s %s:%d:%d %s", g->nodename, g->ip, ntohs(g->port), + ntohs(g->cport), ci); sdsfree(ci); @@ -1338,11 +1343,14 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { if (node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL) && !(flags & CLUSTER_NODE_NOADDR) && !(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) && - (strcasecmp(node->ip,g->ip) || node->port != ntohs(g->port))) + (strcasecmp(node->ip,g->ip) || + node->port != ntohs(g->port) || + node->cport != ntohs(g->cport))) { if (node->link) freeClusterLink(node->link); memcpy(node->ip,g->ip,NET_IP_STR_LEN); node->port = ntohs(g->port); + node->cport = ntohs(g->cport); node->flags &= ~CLUSTER_NODE_NOADDR; } } else { @@ -1356,7 +1364,7 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { !(flags & CLUSTER_NODE_NOADDR) && !clusterBlacklistExists(g->nodename)) { - clusterStartHandshake(g->ip,ntohs(g->port)); + clusterStartHandshake(g->ip,ntohs(g->port),ntohs(g->cport)); } } @@ -1365,23 +1373,36 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { } } -/* IP -> string conversion. 'buf' is supposed to at least be 46 bytes. */ -void nodeIp2String(char *buf, clusterLink *link) { - anetPeerToString(link->fd, buf, NET_IP_STR_LEN, NULL); +/* IP -> string conversion. 'buf' is supposed to at least be 46 bytes. + * If 'announced_ip' length is non-zero, it is used instead of extracting + * the IP from the socket peer address. */ +void nodeIp2String(char *buf, clusterLink *link, char *announced_ip) { + if (announced_ip[0] != '\0') { + memcpy(buf,announced_ip,NET_IP_STR_LEN); + buf[NET_IP_STR_LEN-1] = '\0'; /* We are not sure the input is sane. */ + } else { + anetPeerToString(link->fd, buf, NET_IP_STR_LEN, NULL); + } } /* Update the node address to the IP address that can be extracted - * from link->fd, and at the specified port. - * Also disconnect the node link so that we'll connect again to the new - * address. + * from link->fd, or if hdr->myip is non empty, to the address the node + * is announcing us. The port is taken from the packet header as well. + * + * If the address or port changed, disconnect the node link so that we'll + * connect again to the new address. * * If the ip/port pair are already correct no operation is performed at * all. * * The function returns 0 if the node address is still the same, * otherwise 1 is returned. */ -int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, int port) { +int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, + clusterMsg *hdr) +{ char ip[NET_IP_STR_LEN] = {0}; + int port = ntohs(hdr->port); + int cport = ntohs(hdr->cport); /* We don't proceed if the link is the same as the sender link, as this * function is designed to see if the node link is consistent with the @@ -1391,12 +1412,14 @@ int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, int port) { * it is safe to call during packet processing. */ if (link == node->link) return 0; - nodeIp2String(ip,link); - if (node->port == port && strcmp(ip,node->ip) == 0) return 0; + nodeIp2String(ip,link,hdr->myip); + if (node->port == port && node->cport == cport && + strcmp(ip,node->ip) == 0) return 0; /* IP / port is different, update it. */ memcpy(node->ip,ip,sizeof(ip)); node->port = port; + node->cport = cport; if (node->link) freeClusterLink(node->link); node->flags &= ~CLUSTER_NODE_NOADDR; serverLog(LL_WARNING,"Address updated for node %.40s, now %s:%d", @@ -1647,7 +1670,10 @@ int clusterProcessPacket(clusterLink *link) { if (type == CLUSTERMSG_TYPE_MEET || myself->ip[0] == '\0') { char ip[NET_IP_STR_LEN]; - if (anetSockName(link->fd,ip,sizeof(ip),NULL) != -1 && + if (server.cluster_announce_ip) { + strncpy(myself->ip,server.cluster_announce_ip,NET_IP_STR_LEN); + myself->ip[NET_IP_STR_LEN-1] = '\0'; + } else if (anetSockName(link->fd,ip,sizeof(ip),NULL) != -1 && strcmp(ip,myself->ip)) { memcpy(myself->ip,ip,NET_IP_STR_LEN); @@ -1665,8 +1691,9 @@ int clusterProcessPacket(clusterLink *link) { clusterNode *node; node = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE); - nodeIp2String(node->ip,link); + nodeIp2String(node->ip,link,hdr->myip); node->port = ntohs(hdr->port); + node->cport = ntohs(hdr->cport); clusterAddNode(node); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); } @@ -1696,7 +1723,7 @@ int clusterProcessPacket(clusterLink *link) { serverLog(LL_VERBOSE, "Handshake: we already know node %.40s, " "updating the address if needed.", sender->name); - if (nodeUpdateAddressIfNeeded(sender,link,ntohs(hdr->port))) + if (nodeUpdateAddressIfNeeded(sender,link,hdr)) { clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| CLUSTER_TODO_UPDATE_STATE); @@ -1728,6 +1755,7 @@ int clusterProcessPacket(clusterLink *link) { link->node->flags |= CLUSTER_NODE_NOADDR; link->node->ip[0] = '\0'; link->node->port = 0; + link->node->cport = 0; freeClusterLink(link); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); return 0; @@ -1737,7 +1765,7 @@ int clusterProcessPacket(clusterLink *link) { /* Update the node address if it changed. */ if (sender && type == CLUSTERMSG_TYPE_PING && !nodeInHandshake(sender) && - nodeUpdateAddressIfNeeded(sender,link,ntohs(hdr->port))) + nodeUpdateAddressIfNeeded(sender,link,hdr)) { clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| CLUSTER_TODO_UPDATE_STATE); @@ -2134,11 +2162,28 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { hdr->type = htons(type); memcpy(hdr->sender,myself->name,CLUSTER_NAMELEN); + /* If cluster-announce-ip option is enabled, force the receivers of our + * packets to use the specified address for this node. Otherwise if the + * first byte is zero, they'll do auto discovery. */ + memset(hdr->myip,0,NET_IP_STR_LEN); + if (server.cluster_announce_ip) { + strncpy(hdr->myip,server.cluster_announce_ip,NET_IP_STR_LEN); + hdr->myip[NET_IP_STR_LEN-1] = '\0'; + } + + /* Handle cluster-announce-port as well. */ + int announced_port = server.cluster_announce_port ? + server.cluster_announce_port : server.port; + int announced_cport = server.cluster_announce_bus_port ? + server.cluster_announce_bus_port : + (server.port + CLUSTER_PORT_INCR); + memcpy(hdr->myslots,master->slots,sizeof(hdr->myslots)); memset(hdr->slaveof,0,CLUSTER_NAMELEN); if (myself->slaveof != NULL) memcpy(hdr->slaveof,myself->slaveof->name, CLUSTER_NAMELEN); - hdr->port = htons(server.port); + hdr->port = htons(announced_port); + hdr->cport = htons(announced_cport); hdr->flags = htons(myself->flags); hdr->state = server.cluster->state; @@ -2274,9 +2319,9 @@ void clusterSendPing(clusterLink *link, int type) { gossip->pong_received = htonl(this->pong_received); memcpy(gossip->ip,this->ip,sizeof(this->ip)); gossip->port = htons(this->port); + gossip->cport = htons(this->cport); gossip->flags = htons(this->flags); gossip->notused1 = 0; - gossip->notused2 = 0; gossipcount++; } @@ -3100,7 +3145,7 @@ void clusterCron(void) { clusterLink *link; fd = anetTcpNonBlockBindConnect(server.neterr, node->ip, - node->port+CLUSTER_PORT_INCR, NET_FIRST_BIND_ADDR); + node->cport, NET_FIRST_BIND_ADDR); if (fd == -1) { /* We got a synchronous error from connect before * clusterSendPing() had a chance to be called. @@ -3110,8 +3155,7 @@ void clusterCron(void) { if (node->ping_sent == 0) node->ping_sent = mstime(); serverLog(LL_DEBUG, "Unable to connect to " "Cluster Node [%s]:%d -> %s", node->ip, - node->port+CLUSTER_PORT_INCR, - server.neterr); + node->cport, server.neterr); continue; } link = createClusterLink(node); @@ -3142,7 +3186,7 @@ void clusterCron(void) { node->flags &= ~CLUSTER_NODE_MEET; serverLog(LL_DEBUG,"Connecting with Node %.40s at %s:%d", - node->name, node->ip, node->port+CLUSTER_PORT_INCR); + node->name, node->ip, node->cport); } } dictReleaseIterator(di); @@ -3845,16 +3889,27 @@ void clusterCommand(client *c) { return; } - if (!strcasecmp(c->argv[1]->ptr,"meet") && c->argc == 4) { - long long port; + if (!strcasecmp(c->argv[1]->ptr,"meet") && (c->argc == 4 || c->argc == 5)) { + /* CLUSTER MEET [cport] */ + long long port, cport; if (getLongLongFromObject(c->argv[3], &port) != C_OK) { - addReplyErrorFormat(c,"Invalid TCP port specified: %s", + addReplyErrorFormat(c,"Invalid TCP base port specified: %s", (char*)c->argv[3]->ptr); return; } - if (clusterStartHandshake(c->argv[2]->ptr,port) == 0 && + if (c->argc == 5) { + if (getLongLongFromObject(c->argv[4], &cport) != C_OK) { + addReplyErrorFormat(c,"Invalid TCP bus port specified: %s", + (char*)c->argv[4]->ptr); + return; + } + } else { + cport = port + CLUSTER_PORT_INCR; + } + + if (clusterStartHandshake(c->argv[2]->ptr,port,cport) == 0 && errno == EINVAL) { addReplyErrorFormat(c,"Invalid node address specified: %s:%s", diff --git a/src/cluster.h b/src/cluster.h index b861dc14d..6dd69a01b 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -100,7 +100,8 @@ typedef struct clusterNode { mstime_t orphaned_time; /* Starting time of orphaned master condition */ long long repl_offset; /* Last known repl offset for this node. */ char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ - int port; /* Latest known port of this node */ + int port; /* Latest known clients port of this node */ + int cport; /* Latest known cluster port of this node. */ clusterLink *link; /* TCP/IP link with this node */ list *fail_reports; /* List of nodes signaling this as failing */ } clusterNode; @@ -171,10 +172,10 @@ typedef struct { uint32_t ping_sent; uint32_t pong_received; char ip[NET_IP_STR_LEN]; /* IP address last time it was seen */ - uint16_t port; /* port last time it was seen */ + uint16_t port; /* base port last time it was seen */ + uint16_t cport; /* cluster port last time it was seen */ uint16_t flags; /* node->flags copy */ - uint16_t notused1; /* Some room for future improvements. */ - uint32_t notused2; + uint32_t notused1; } clusterMsgDataGossip; typedef struct { @@ -225,7 +226,7 @@ typedef struct { char sig[4]; /* Siganture "RCmb" (Redis Cluster message bus). */ uint32_t totlen; /* Total length of this message */ uint16_t ver; /* Protocol version, currently set to 0. */ - uint16_t notused0; /* 2 bytes not used. */ + uint16_t port; /* TCP base port number. */ uint16_t type; /* Message type */ uint16_t count; /* Only used for some kind of messages. */ uint64_t currentEpoch; /* The epoch accordingly to the sending node. */ @@ -239,8 +240,8 @@ typedef struct { char slaveof[CLUSTER_NAMELEN]; char myip[NET_IP_STR_LEN]; /* Sender IP, if not all zeroed. */ char notused1[34]; /* 34 bytes reserved for future usage. */ - uint16_t port; /* Sender TCP base port */ - uint16_t flags; /* Sender node flags */ + uint16_t cport; /* Sender TCP cluster bus port */ + uint16_t flags; /* Sender node flags */ unsigned char state; /* Cluster state from the POV of the sender */ unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */ union clusterMsgData data; diff --git a/src/config.c b/src/config.c index 520d23d86..e5ee5dd66 100644 --- a/src/config.c +++ b/src/config.c @@ -517,6 +517,22 @@ void loadServerConfigFromString(char *config) { } else if (!strcasecmp(argv[0],"cluster-announce-ip") && argc == 2) { zfree(server.cluster_announce_ip); server.cluster_announce_ip = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"cluster-announce-port") && argc == 2) { + server.cluster_announce_port = atoi(argv[1]); + if (server.cluster_announce_port < 0 || + server.cluster_announce_port > 65535) + { + err = "Invalid port"; goto loaderr; + } + } else if (!strcasecmp(argv[0],"cluster-announce-bus-port") && + argc == 2) + { + server.cluster_announce_bus_port = atoi(argv[1]); + if (server.cluster_announce_bus_port < 0 || + server.cluster_announce_bus_port > 65535) + { + err = "Invalid port"; goto loaderr; + } } else if (!strcasecmp(argv[0],"cluster-require-full-coverage") && argc == 2) { @@ -969,6 +985,10 @@ void configSetCommand(client *c) { refreshGoodSlavesCount(); } config_set_numerical_field( "cluster-node-timeout",server.cluster_node_timeout,0,LLONG_MAX) { + } config_set_numerical_field( + "cluster-announce-port",server.cluster_announce_port,0,65535) { + } config_set_numerical_field( + "cluster-announce-bus-port",server.cluster_announce_bus_port,0,65535) { } config_set_numerical_field( "cluster-migration-barrier",server.cluster_migration_barrier,0,LLONG_MAX){ } config_set_numerical_field( @@ -1110,6 +1130,8 @@ void configGetCommand(client *c) { config_get_numerical_field("slowlog-max-len", server.slowlog_max_len); config_get_numerical_field("port",server.port); + config_get_numerical_field("cluster-announce-port",server.cluster_announce_port); + config_get_numerical_field("cluster-announce-bus-port",server.cluster_announce_bus_port); config_get_numerical_field("tcp-backlog",server.tcp_backlog); config_get_numerical_field("databases",server.dbnum); config_get_numerical_field("repl-ping-slave-period",server.repl_ping_slave_period); @@ -1799,6 +1821,8 @@ int rewriteConfig(char *path) { rewriteConfigYesNoOption(state,"daemonize",server.daemonize,0); rewriteConfigStringOption(state,"pidfile",server.pidfile,CONFIG_DEFAULT_PID_FILE); rewriteConfigNumericalOption(state,"port",server.port,CONFIG_DEFAULT_SERVER_PORT); + rewriteConfigNumericalOption(state,"cluster-announce-port",server.cluster_announce_port,CONFIG_DEFAULT_CLUSTER_ANNOUNCE_PORT); + rewriteConfigNumericalOption(state,"cluster-announce-bus-port",server.cluster_announce_bus_port,CONFIG_DEFAULT_CLUSTER_ANNOUNCE_BUS_PORT); rewriteConfigNumericalOption(state,"tcp-backlog",server.tcp_backlog,CONFIG_DEFAULT_TCP_BACKLOG); rewriteConfigBindOption(state); rewriteConfigStringOption(state,"unixsocket",server.unixsocket,NULL); diff --git a/src/server.c b/src/server.c index 429dfc230..5ca784c55 100644 --- a/src/server.c +++ b/src/server.c @@ -1523,8 +1523,9 @@ void initServerConfig(void) { server.cluster_slave_validity_factor = CLUSTER_DEFAULT_SLAVE_VALIDITY; server.cluster_require_full_coverage = CLUSTER_DEFAULT_REQUIRE_FULL_COVERAGE; server.cluster_configfile = zstrdup(CONFIG_DEFAULT_CLUSTER_CONFIG_FILE); - server.cluster_announce_ip = NULL; - server.cluster_announce_port = 0; + server.cluster_announce_ip = CONFIG_DEFAULT_CLUSTER_ANNOUNCE_IP; + server.cluster_announce_port = CONFIG_DEFAULT_CLUSTER_ANNOUNCE_PORT; + server.cluster_announce_bus_port = CONFIG_DEFAULT_CLUSTER_ANNOUNCE_BUS_PORT; server.migrate_cached_sockets = dictCreate(&migrateCacheDictType,NULL); server.next_client_id = 1; /* Client IDs, start from 1 .*/ server.loading_process_events_interval_bytes = (1024*1024*2); diff --git a/src/server.h b/src/server.h index 879368daf..65de6d319 100644 --- a/src/server.h +++ b/src/server.h @@ -108,6 +108,9 @@ typedef long long mstime_t; /* millisecond time type. */ #define CONFIG_DEFAULT_PID_FILE "/var/run/redis.pid" #define CONFIG_DEFAULT_SYSLOG_IDENT "redis" #define CONFIG_DEFAULT_CLUSTER_CONFIG_FILE "nodes.conf" +#define CONFIG_DEFAULT_CLUSTER_ANNOUNCE_IP NULL /* Auto detect. */ +#define CONFIG_DEFAULT_CLUSTER_ANNOUNCE_PORT 0 /* Use server.port */ +#define CONFIG_DEFAULT_CLUSTER_ANNOUNCE_BUS_PORT 0 /* Use +10000 offset. */ #define CONFIG_DEFAULT_DAEMONIZE 0 #define CONFIG_DEFAULT_UNIX_SOCKET_PERM 0 #define CONFIG_DEFAULT_TCP_KEEPALIVE 0 @@ -947,8 +950,9 @@ struct redisServer { int cluster_slave_validity_factor; /* Slave max data age for failover. */ int cluster_require_full_coverage; /* If true, put the cluster down if there is at least an uncovered slot.*/ - char *cluster_announce_ip; /* cluster-announce-ip option. */ - int cluster_announce_port; /* cluster-announce-port option. */ + char *cluster_announce_ip; /* IP address to announce on cluster bus. */ + int cluster_announce_port; /* base port to announce on cluster bus. */ + int cluster_announce_bus_port; /* bus port to announce on cluster bus. */ /* Scripting */ lua_State *lua; /* The Lua interpreter. We use just one for all clients */ client *lua_client; /* The "fake client" to query Redis from Lua */