From 266949c7fcfab9d10f81314fd7480a00638ced80 Mon Sep 17 00:00:00 2001 From: Greg Femec Date: Wed, 23 Dec 2020 05:52:07 -0800 Subject: [PATCH] Fix random element selection for large hash tables. (#8133) When a database on a 64 bit build grows past 2^31 keys, the underlying hash table expands to 2^32 buckets. After this point, the algorithms for selecting random elements only return elements from half of the available buckets because they use random() which has a range of 0 to 2^31 - 1. This causes problems for eviction policies which use dictGetSomeKeys or dictGetRandomKey. Over time they cause the hash table to become unbalanced because, while new keys are spread out evenly across all buckets, evictions come from only half of the available buckets. Eventually this half of the table starts to run out of keys and it takes longer and longer to find candidates for eviction. This continues until no more evictions can happen. This solution addresses this by using a 64 bit PRNG instead of libc random(). Co-authored-by: Greg Femec --- src/Makefile | 8 +- src/dict.c | 15 +++- src/dict.h | 14 +++- src/mt19937-64.c | 187 ++++++++++++++++++++++++++++++++++++++++++ src/mt19937-64.h | 87 ++++++++++++++++++++ src/redis-benchmark.c | 2 + src/redis-check-rdb.c | 9 ++ src/redis-cli.c | 5 ++ src/server.c | 2 + 9 files changed, 319 insertions(+), 10 deletions(-) create mode 100644 src/mt19937-64.c create mode 100644 src/mt19937-64.h diff --git a/src/Makefile b/src/Makefile index 068dbc9da..3bc9f11c0 100644 --- a/src/Makefile +++ b/src/Makefile @@ -270,11 +270,11 @@ endif REDIS_SERVER_NAME=redis-server$(PROG_SUFFIX) REDIS_SENTINEL_NAME=redis-sentinel$(PROG_SUFFIX) -REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o gopher.o tracking.o connection.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o +REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o gopher.o tracking.o connection.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o REDIS_CLI_NAME=redis-cli$(PROG_SUFFIX) -REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o +REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o REDIS_BENCHMARK_NAME=redis-benchmark$(PROG_SUFFIX) -REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o dict.o zmalloc.o release.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o +REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o dict.o zmalloc.o release.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o REDIS_CHECK_RDB_NAME=redis-check-rdb$(PROG_SUFFIX) REDIS_CHECK_AOF_NAME=redis-check-aof$(PROG_SUFFIX) @@ -346,7 +346,7 @@ $(REDIS_CLI_NAME): $(REDIS_CLI_OBJ) $(REDIS_BENCHMARK_NAME): $(REDIS_BENCHMARK_OBJ) $(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/hdr_histogram/hdr_histogram.o $(FINAL_LIBS) -dict-benchmark: dict.c zmalloc.c sds.c siphash.c +dict-benchmark: dict.c zmalloc.c sds.c siphash.c mt19937-64.c $(REDIS_CC) $(FINAL_CFLAGS) $^ -D DICT_BENCHMARK_MAIN -o $@ $(FINAL_LIBS) DEP = $(REDIS_SERVER_OBJ:%.o=%.d) $(REDIS_CLI_OBJ:%.o=%.d) $(REDIS_BENCHMARK_OBJ:%.o=%.d) diff --git a/src/dict.c b/src/dict.c index 4f0916a27..6c203b850 100644 --- a/src/dict.c +++ b/src/dict.c @@ -646,13 +646,13 @@ dictEntry *dictGetRandomKey(dict *d) do { /* We are sure there are no elements in indexes from 0 * to rehashidx-1 */ - h = d->rehashidx + (random() % (dictSlots(d) - d->rehashidx)); + h = d->rehashidx + (randomULong() % (dictSlots(d) - d->rehashidx)); he = (h >= d->ht[0].size) ? d->ht[1].table[h - d->ht[0].size] : d->ht[0].table[h]; } while(he == NULL); } else { do { - h = random() & d->ht[0].sizemask; + h = randomULong() & d->ht[0].sizemask; he = d->ht[0].table[h]; } while(he == NULL); } @@ -718,7 +718,7 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) { maxsizemask = d->ht[1].sizemask; /* Pick a random point inside the larger table. */ - unsigned long i = random() & maxsizemask; + unsigned long i = randomULong() & maxsizemask; unsigned long emptylen = 0; /* Continuous empty entries so far. */ while(stored < count && maxsteps--) { for (j = 0; j < tables; j++) { @@ -743,7 +743,7 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) { if (he == NULL) { emptylen++; if (emptylen >= 5 && emptylen > count) { - i = random() & maxsizemask; + i = randomULong() & maxsizemask; emptylen = 0; } } else { @@ -1270,6 +1270,13 @@ int main(int argc, char **argv) { } end_benchmark("Random access of existing elements"); + start_benchmark(); + for (j = 0; j < count; j++) { + dictEntry *de = dictGetRandomKey(dict); + assert(de != NULL); + } + end_benchmark("Accessing random keys"); + start_benchmark(); for (j = 0; j < count; j++) { sds key = sdsfromlonglong(rand() % count); diff --git a/src/dict.h b/src/dict.h index f7515e905..d96c3148f 100644 --- a/src/dict.h +++ b/src/dict.h @@ -33,11 +33,14 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include - #ifndef __DICT_H #define __DICT_H +#include "mt19937-64.h" +#include +#include +#include + #define DICT_OK 0 #define DICT_ERR 1 @@ -148,6 +151,13 @@ typedef void (dictScanBucketFunction)(void *privdata, dictEntry **bucketref); #define dictSize(d) ((d)->ht[0].used+(d)->ht[1].used) #define dictIsRehashing(d) ((d)->rehashidx != -1) +/* If our unsigned long type can store a 64 bit number, use a 64 bit PRNG. */ +#if ULONG_MAX >= 0xffffffffffffffff +#define randomULong() ((unsigned long) genrand64_int64()) +#else +#define randomULong() random() +#endif + /* API */ dict *dictCreate(dictType *type, void *privDataPtr); int dictExpand(dict *d, unsigned long size); diff --git a/src/mt19937-64.c b/src/mt19937-64.c new file mode 100644 index 000000000..a0c897ff6 --- /dev/null +++ b/src/mt19937-64.c @@ -0,0 +1,187 @@ +/* + A C-program for MT19937-64 (2004/9/29 version). + Coded by Takuji Nishimura and Makoto Matsumoto. + + This is a 64-bit version of Mersenne Twister pseudorandom number + generator. + + Before using, initialize the state by using init_genrand64(seed) + or init_by_array64(init_key, key_length). + + Copyright (C) 2004, Makoto Matsumoto and Takuji Nishimura, + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. The names of its contributors may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + References: + T. Nishimura, ``Tables of 64-bit Mersenne Twisters'' + ACM Transactions on Modeling and + Computer Simulation 10. (2000) 348--357. + M. Matsumoto and T. Nishimura, + ``Mersenne Twister: a 623-dimensionally equidistributed + uniform pseudorandom number generator'' + ACM Transactions on Modeling and + Computer Simulation 8. (Jan. 1998) 3--30. + + Any feedback is very welcome. + http://www.math.hiroshima-u.ac.jp/~m-mat/MT/emt.html + email: m-mat @ math.sci.hiroshima-u.ac.jp (remove spaces) +*/ + + +#include "mt19937-64.h" +#include + +#define NN 312 +#define MM 156 +#define MATRIX_A 0xB5026F5AA96619E9ULL +#define UM 0xFFFFFFFF80000000ULL /* Most significant 33 bits */ +#define LM 0x7FFFFFFFULL /* Least significant 31 bits */ + + +/* The array for the state vector */ +static unsigned long long mt[NN]; +/* mti==NN+1 means mt[NN] is not initialized */ +static int mti=NN+1; + +/* initializes mt[NN] with a seed */ +void init_genrand64(unsigned long long seed) +{ + mt[0] = seed; + for (mti=1; mti> 62)) + mti); +} + +/* initialize by an array with array-length */ +/* init_key is the array for initializing keys */ +/* key_length is its length */ +void init_by_array64(unsigned long long init_key[], + unsigned long long key_length) +{ + unsigned long long i, j, k; + init_genrand64(19650218ULL); + i=1; j=0; + k = (NN>key_length ? NN : key_length); + for (; k; k--) { + mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 62)) * 3935559000370003845ULL)) + + init_key[j] + j; /* non linear */ + i++; j++; + if (i>=NN) { mt[0] = mt[NN-1]; i=1; } + if (j>=key_length) j=0; + } + for (k=NN-1; k; k--) { + mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 62)) * 2862933555777941757ULL)) + - i; /* non linear */ + i++; + if (i>=NN) { mt[0] = mt[NN-1]; i=1; } + } + + mt[0] = 1ULL << 63; /* MSB is 1; assuring non-zero initial array */ +} + +/* generates a random number on [0, 2^64-1]-interval */ +unsigned long long genrand64_int64(void) +{ + int i; + unsigned long long x; + static unsigned long long mag01[2]={0ULL, MATRIX_A}; + + if (mti >= NN) { /* generate NN words at one time */ + + /* if init_genrand64() has not been called, */ + /* a default initial seed is used */ + if (mti == NN+1) + init_genrand64(5489ULL); + + for (i=0;i>1) ^ mag01[(int)(x&1ULL)]; + } + for (;i>1) ^ mag01[(int)(x&1ULL)]; + } + x = (mt[NN-1]&UM)|(mt[0]&LM); + mt[NN-1] = mt[MM-1] ^ (x>>1) ^ mag01[(int)(x&1ULL)]; + + mti = 0; + } + + x = mt[mti++]; + + x ^= (x >> 29) & 0x5555555555555555ULL; + x ^= (x << 17) & 0x71D67FFFEDA60000ULL; + x ^= (x << 37) & 0xFFF7EEE000000000ULL; + x ^= (x >> 43); + + return x; +} + +/* generates a random number on [0, 2^63-1]-interval */ +long long genrand64_int63(void) +{ + return (long long)(genrand64_int64() >> 1); +} + +/* generates a random number on [0,1]-real-interval */ +double genrand64_real1(void) +{ + return (genrand64_int64() >> 11) * (1.0/9007199254740991.0); +} + +/* generates a random number on [0,1)-real-interval */ +double genrand64_real2(void) +{ + return (genrand64_int64() >> 11) * (1.0/9007199254740992.0); +} + +/* generates a random number on (0,1)-real-interval */ +double genrand64_real3(void) +{ + return ((genrand64_int64() >> 12) + 0.5) * (1.0/4503599627370496.0); +} + +#ifdef MT19937_64_MAIN +int main(void) +{ + int i; + unsigned long long init[4]={0x12345ULL, 0x23456ULL, 0x34567ULL, 0x45678ULL}, length=4; + init_by_array64(init, length); + printf("1000 outputs of genrand64_int64()\n"); + for (i=0; i<1000; i++) { + printf("%20llu ", genrand64_int64()); + if (i%5==4) printf("\n"); + } + printf("\n1000 outputs of genrand64_real2()\n"); + for (i=0; i<1000; i++) { + printf("%10.8f ", genrand64_real2()); + if (i%5==4) printf("\n"); + } + return 0; +} +#endif diff --git a/src/mt19937-64.h b/src/mt19937-64.h new file mode 100644 index 000000000..b98348fd4 --- /dev/null +++ b/src/mt19937-64.h @@ -0,0 +1,87 @@ +/* + A C-program for MT19937-64 (2004/9/29 version). + Coded by Takuji Nishimura and Makoto Matsumoto. + + This is a 64-bit version of Mersenne Twister pseudorandom number + generator. + + Before using, initialize the state by using init_genrand64(seed) + or init_by_array64(init_key, key_length). + + Copyright (C) 2004, Makoto Matsumoto and Takuji Nishimura, + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. The names of its contributors may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + References: + T. Nishimura, ``Tables of 64-bit Mersenne Twisters'' + ACM Transactions on Modeling and + Computer Simulation 10. (2000) 348--357. + M. Matsumoto and T. Nishimura, + ``Mersenne Twister: a 623-dimensionally equidistributed + uniform pseudorandom number generator'' + ACM Transactions on Modeling and + Computer Simulation 8. (Jan. 1998) 3--30. + + Any feedback is very welcome. + http://www.math.hiroshima-u.ac.jp/~m-mat/MT/emt.html + email: m-mat @ math.sci.hiroshima-u.ac.jp (remove spaces) +*/ + +#ifndef __MT19937_64_H +#define __MT19937_64_H + +/* initializes mt[NN] with a seed */ +void init_genrand64(unsigned long long seed); + +/* initialize by an array with array-length */ +/* init_key is the array for initializing keys */ +/* key_length is its length */ +void init_by_array64(unsigned long long init_key[], + unsigned long long key_length); + +/* generates a random number on [0, 2^64-1]-interval */ +unsigned long long genrand64_int64(void); + + +/* generates a random number on [0, 2^63-1]-interval */ +long long genrand64_int63(void); + +/* generates a random number on [0,1]-real-interval */ +double genrand64_real1(void); + +/* generates a random number on [0,1)-real-interval */ +double genrand64_real2(void); + +/* generates a random number on (0,1)-real-interval */ +double genrand64_real3(void); + +/* generates a random number on (0,1]-real-interval */ +double genrand64_real4(void); + +#endif diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index 18e19b0e0..a955c0d4c 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -59,6 +59,7 @@ #include "crc16_slottable.h" #include "hdr_histogram.h" #include "cli_common.h" +#include "mt19937-64.h" #define UNUSED(V) ((void) V) #define RANDPTR_INITIAL_SIZE 8 @@ -1677,6 +1678,7 @@ int main(int argc, const char **argv) { client c; srandom(time(NULL) ^ getpid()); + init_genrand64(ustime() ^ getpid()); signal(SIGHUP, SIG_IGN); signal(SIGPIPE, SIG_IGN); diff --git a/src/redis-check-rdb.c b/src/redis-check-rdb.c index 79dbf3229..335e35189 100644 --- a/src/redis-check-rdb.c +++ b/src/redis-check-rdb.c @@ -27,10 +27,13 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include "mt19937-64.h" #include "server.h" #include "rdb.h" #include +#include +#include void createSharedObjects(void); void rdbLoadProgressCallback(rio *r, const void *buf, size_t len); @@ -362,10 +365,16 @@ err: * Otherwise if called with a non NULL fp, the function returns C_OK or * C_ERR depending on the success or failure. */ int redis_check_rdb_main(int argc, char **argv, FILE *fp) { + struct timeval tv; + if (argc != 2 && fp == NULL) { fprintf(stderr, "Usage: %s \n", argv[0]); exit(1); } + + gettimeofday(&tv, NULL); + init_genrand64(((long long) tv.tv_sec * 1000000 + tv.tv_usec) ^ getpid()); + /* In order to call the loading functions we need to create the shared * integer objects, however since this function may be called from * an already initialized Redis instance, check if we really need to. */ diff --git a/src/redis-cli.c b/src/redis-cli.c index 5310a1b21..ff29de748 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -62,6 +62,7 @@ #include "anet.h" #include "ae.h" #include "cli_common.h" +#include "mt19937-64.h" #define UNUSED(V) ((void) V) @@ -8123,6 +8124,7 @@ static sds askPassword(const char *msg) { int main(int argc, char **argv) { int firstarg; + struct timeval tv; config.hostip = sdsnew("127.0.0.1"); config.hostport = 6379; @@ -8219,6 +8221,9 @@ int main(int argc, char **argv) { } #endif + gettimeofday(&tv, NULL); + init_genrand64(((long long) tv.tv_sec * 1000000 + tv.tv_usec) ^ getpid()); + /* Cluster Manager mode */ if (CLUSTER_MANAGER_MODE()) { clusterManagerCommandProc *proc = validateClusterManagerCommand(); diff --git a/src/server.c b/src/server.c index 4e9bb88eb..611935a6c 100644 --- a/src/server.c +++ b/src/server.c @@ -34,6 +34,7 @@ #include "bio.h" #include "latency.h" #include "atomicvar.h" +#include "mt19937-64.h" #include #include @@ -5452,6 +5453,7 @@ int main(int argc, char **argv) { srand(time(NULL)^getpid()); srandom(time(NULL)^getpid()); gettimeofday(&tv,NULL); + init_genrand64(((long long) tv.tv_sec * 1000000 + tv.tv_usec) ^ getpid()); crc64_init(); uint8_t hashseed[16];