Fix random element selection for large hash tables. (#8133)

When a database on a 64 bit build grows past 2^31 keys, the underlying hash table expands to 2^32 buckets. After this point, the algorithms for selecting random elements only return elements from half of the available buckets because they use random() which has a range of 0 to 2^31 - 1. This causes problems for eviction policies which use dictGetSomeKeys or dictGetRandomKey. Over time they cause the hash table to become unbalanced because, while new keys are spread out evenly across all buckets, evictions come from only half of the available buckets. Eventually this half of the table starts to run out of keys and it takes longer and longer to find candidates for eviction. This continues until no more evictions can happen.

This solution addresses this by using a 64 bit PRNG instead of libc random().

Co-authored-by: Greg Femec <gfemec@google.com>
This commit is contained in:
Greg Femec 2020-12-23 05:52:07 -08:00 committed by GitHub
parent 2426aaa099
commit 266949c7fc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 319 additions and 10 deletions

View File

@ -270,11 +270,11 @@ endif
REDIS_SERVER_NAME=redis-server$(PROG_SUFFIX)
REDIS_SENTINEL_NAME=redis-sentinel$(PROG_SUFFIX)
REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o gopher.o tracking.o connection.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o
REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crcspeed.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o gopher.o tracking.o connection.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o
REDIS_CLI_NAME=redis-cli$(PROG_SUFFIX)
REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o
REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o
REDIS_BENCHMARK_NAME=redis-benchmark$(PROG_SUFFIX)
REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o dict.o zmalloc.o release.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o
REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o adlist.o dict.o zmalloc.o release.o crcspeed.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o
REDIS_CHECK_RDB_NAME=redis-check-rdb$(PROG_SUFFIX)
REDIS_CHECK_AOF_NAME=redis-check-aof$(PROG_SUFFIX)
@ -346,7 +346,7 @@ $(REDIS_CLI_NAME): $(REDIS_CLI_OBJ)
$(REDIS_BENCHMARK_NAME): $(REDIS_BENCHMARK_OBJ)
$(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/hdr_histogram/hdr_histogram.o $(FINAL_LIBS)
dict-benchmark: dict.c zmalloc.c sds.c siphash.c
dict-benchmark: dict.c zmalloc.c sds.c siphash.c mt19937-64.c
$(REDIS_CC) $(FINAL_CFLAGS) $^ -D DICT_BENCHMARK_MAIN -o $@ $(FINAL_LIBS)
DEP = $(REDIS_SERVER_OBJ:%.o=%.d) $(REDIS_CLI_OBJ:%.o=%.d) $(REDIS_BENCHMARK_OBJ:%.o=%.d)

View File

@ -646,13 +646,13 @@ dictEntry *dictGetRandomKey(dict *d)
do {
/* We are sure there are no elements in indexes from 0
* to rehashidx-1 */
h = d->rehashidx + (random() % (dictSlots(d) - d->rehashidx));
h = d->rehashidx + (randomULong() % (dictSlots(d) - d->rehashidx));
he = (h >= d->ht[0].size) ? d->ht[1].table[h - d->ht[0].size] :
d->ht[0].table[h];
} while(he == NULL);
} else {
do {
h = random() & d->ht[0].sizemask;
h = randomULong() & d->ht[0].sizemask;
he = d->ht[0].table[h];
} while(he == NULL);
}
@ -718,7 +718,7 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) {
maxsizemask = d->ht[1].sizemask;
/* Pick a random point inside the larger table. */
unsigned long i = random() & maxsizemask;
unsigned long i = randomULong() & maxsizemask;
unsigned long emptylen = 0; /* Continuous empty entries so far. */
while(stored < count && maxsteps--) {
for (j = 0; j < tables; j++) {
@ -743,7 +743,7 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) {
if (he == NULL) {
emptylen++;
if (emptylen >= 5 && emptylen > count) {
i = random() & maxsizemask;
i = randomULong() & maxsizemask;
emptylen = 0;
}
} else {
@ -1270,6 +1270,13 @@ int main(int argc, char **argv) {
}
end_benchmark("Random access of existing elements");
start_benchmark();
for (j = 0; j < count; j++) {
dictEntry *de = dictGetRandomKey(dict);
assert(de != NULL);
}
end_benchmark("Accessing random keys");
start_benchmark();
for (j = 0; j < count; j++) {
sds key = sdsfromlonglong(rand() % count);

View File

@ -33,11 +33,14 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdint.h>
#ifndef __DICT_H
#define __DICT_H
#include "mt19937-64.h"
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#define DICT_OK 0
#define DICT_ERR 1
@ -148,6 +151,13 @@ typedef void (dictScanBucketFunction)(void *privdata, dictEntry **bucketref);
#define dictSize(d) ((d)->ht[0].used+(d)->ht[1].used)
#define dictIsRehashing(d) ((d)->rehashidx != -1)
/* If our unsigned long type can store a 64 bit number, use a 64 bit PRNG. */
#if ULONG_MAX >= 0xffffffffffffffff
#define randomULong() ((unsigned long) genrand64_int64())
#else
#define randomULong() random()
#endif
/* API */
dict *dictCreate(dictType *type, void *privDataPtr);
int dictExpand(dict *d, unsigned long size);

187
src/mt19937-64.c Normal file
View File

@ -0,0 +1,187 @@
/*
A C-program for MT19937-64 (2004/9/29 version).
Coded by Takuji Nishimura and Makoto Matsumoto.
This is a 64-bit version of Mersenne Twister pseudorandom number
generator.
Before using, initialize the state by using init_genrand64(seed)
or init_by_array64(init_key, key_length).
Copyright (C) 2004, Makoto Matsumoto and Takuji Nishimura,
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The names of its contributors may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
References:
T. Nishimura, ``Tables of 64-bit Mersenne Twisters''
ACM Transactions on Modeling and
Computer Simulation 10. (2000) 348--357.
M. Matsumoto and T. Nishimura,
``Mersenne Twister: a 623-dimensionally equidistributed
uniform pseudorandom number generator''
ACM Transactions on Modeling and
Computer Simulation 8. (Jan. 1998) 3--30.
Any feedback is very welcome.
http://www.math.hiroshima-u.ac.jp/~m-mat/MT/emt.html
email: m-mat @ math.sci.hiroshima-u.ac.jp (remove spaces)
*/
#include "mt19937-64.h"
#include <stdio.h>
#define NN 312
#define MM 156
#define MATRIX_A 0xB5026F5AA96619E9ULL
#define UM 0xFFFFFFFF80000000ULL /* Most significant 33 bits */
#define LM 0x7FFFFFFFULL /* Least significant 31 bits */
/* The array for the state vector */
static unsigned long long mt[NN];
/* mti==NN+1 means mt[NN] is not initialized */
static int mti=NN+1;
/* initializes mt[NN] with a seed */
void init_genrand64(unsigned long long seed)
{
mt[0] = seed;
for (mti=1; mti<NN; mti++)
mt[mti] = (6364136223846793005ULL * (mt[mti-1] ^ (mt[mti-1] >> 62)) + mti);
}
/* initialize by an array with array-length */
/* init_key is the array for initializing keys */
/* key_length is its length */
void init_by_array64(unsigned long long init_key[],
unsigned long long key_length)
{
unsigned long long i, j, k;
init_genrand64(19650218ULL);
i=1; j=0;
k = (NN>key_length ? NN : key_length);
for (; k; k--) {
mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 62)) * 3935559000370003845ULL))
+ init_key[j] + j; /* non linear */
i++; j++;
if (i>=NN) { mt[0] = mt[NN-1]; i=1; }
if (j>=key_length) j=0;
}
for (k=NN-1; k; k--) {
mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 62)) * 2862933555777941757ULL))
- i; /* non linear */
i++;
if (i>=NN) { mt[0] = mt[NN-1]; i=1; }
}
mt[0] = 1ULL << 63; /* MSB is 1; assuring non-zero initial array */
}
/* generates a random number on [0, 2^64-1]-interval */
unsigned long long genrand64_int64(void)
{
int i;
unsigned long long x;
static unsigned long long mag01[2]={0ULL, MATRIX_A};
if (mti >= NN) { /* generate NN words at one time */
/* if init_genrand64() has not been called, */
/* a default initial seed is used */
if (mti == NN+1)
init_genrand64(5489ULL);
for (i=0;i<NN-MM;i++) {
x = (mt[i]&UM)|(mt[i+1]&LM);
mt[i] = mt[i+MM] ^ (x>>1) ^ mag01[(int)(x&1ULL)];
}
for (;i<NN-1;i++) {
x = (mt[i]&UM)|(mt[i+1]&LM);
mt[i] = mt[i+(MM-NN)] ^ (x>>1) ^ mag01[(int)(x&1ULL)];
}
x = (mt[NN-1]&UM)|(mt[0]&LM);
mt[NN-1] = mt[MM-1] ^ (x>>1) ^ mag01[(int)(x&1ULL)];
mti = 0;
}
x = mt[mti++];
x ^= (x >> 29) & 0x5555555555555555ULL;
x ^= (x << 17) & 0x71D67FFFEDA60000ULL;
x ^= (x << 37) & 0xFFF7EEE000000000ULL;
x ^= (x >> 43);
return x;
}
/* generates a random number on [0, 2^63-1]-interval */
long long genrand64_int63(void)
{
return (long long)(genrand64_int64() >> 1);
}
/* generates a random number on [0,1]-real-interval */
double genrand64_real1(void)
{
return (genrand64_int64() >> 11) * (1.0/9007199254740991.0);
}
/* generates a random number on [0,1)-real-interval */
double genrand64_real2(void)
{
return (genrand64_int64() >> 11) * (1.0/9007199254740992.0);
}
/* generates a random number on (0,1)-real-interval */
double genrand64_real3(void)
{
return ((genrand64_int64() >> 12) + 0.5) * (1.0/4503599627370496.0);
}
#ifdef MT19937_64_MAIN
int main(void)
{
int i;
unsigned long long init[4]={0x12345ULL, 0x23456ULL, 0x34567ULL, 0x45678ULL}, length=4;
init_by_array64(init, length);
printf("1000 outputs of genrand64_int64()\n");
for (i=0; i<1000; i++) {
printf("%20llu ", genrand64_int64());
if (i%5==4) printf("\n");
}
printf("\n1000 outputs of genrand64_real2()\n");
for (i=0; i<1000; i++) {
printf("%10.8f ", genrand64_real2());
if (i%5==4) printf("\n");
}
return 0;
}
#endif

87
src/mt19937-64.h Normal file
View File

@ -0,0 +1,87 @@
/*
A C-program for MT19937-64 (2004/9/29 version).
Coded by Takuji Nishimura and Makoto Matsumoto.
This is a 64-bit version of Mersenne Twister pseudorandom number
generator.
Before using, initialize the state by using init_genrand64(seed)
or init_by_array64(init_key, key_length).
Copyright (C) 2004, Makoto Matsumoto and Takuji Nishimura,
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The names of its contributors may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
References:
T. Nishimura, ``Tables of 64-bit Mersenne Twisters''
ACM Transactions on Modeling and
Computer Simulation 10. (2000) 348--357.
M. Matsumoto and T. Nishimura,
``Mersenne Twister: a 623-dimensionally equidistributed
uniform pseudorandom number generator''
ACM Transactions on Modeling and
Computer Simulation 8. (Jan. 1998) 3--30.
Any feedback is very welcome.
http://www.math.hiroshima-u.ac.jp/~m-mat/MT/emt.html
email: m-mat @ math.sci.hiroshima-u.ac.jp (remove spaces)
*/
#ifndef __MT19937_64_H
#define __MT19937_64_H
/* initializes mt[NN] with a seed */
void init_genrand64(unsigned long long seed);
/* initialize by an array with array-length */
/* init_key is the array for initializing keys */
/* key_length is its length */
void init_by_array64(unsigned long long init_key[],
unsigned long long key_length);
/* generates a random number on [0, 2^64-1]-interval */
unsigned long long genrand64_int64(void);
/* generates a random number on [0, 2^63-1]-interval */
long long genrand64_int63(void);
/* generates a random number on [0,1]-real-interval */
double genrand64_real1(void);
/* generates a random number on [0,1)-real-interval */
double genrand64_real2(void);
/* generates a random number on (0,1)-real-interval */
double genrand64_real3(void);
/* generates a random number on (0,1]-real-interval */
double genrand64_real4(void);
#endif

View File

@ -59,6 +59,7 @@
#include "crc16_slottable.h"
#include "hdr_histogram.h"
#include "cli_common.h"
#include "mt19937-64.h"
#define UNUSED(V) ((void) V)
#define RANDPTR_INITIAL_SIZE 8
@ -1677,6 +1678,7 @@ int main(int argc, const char **argv) {
client c;
srandom(time(NULL) ^ getpid());
init_genrand64(ustime() ^ getpid());
signal(SIGHUP, SIG_IGN);
signal(SIGPIPE, SIG_IGN);

View File

@ -27,10 +27,13 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "mt19937-64.h"
#include "server.h"
#include "rdb.h"
#include <stdarg.h>
#include <sys/time.h>
#include <unistd.h>
void createSharedObjects(void);
void rdbLoadProgressCallback(rio *r, const void *buf, size_t len);
@ -362,10 +365,16 @@ err:
* Otherwise if called with a non NULL fp, the function returns C_OK or
* C_ERR depending on the success or failure. */
int redis_check_rdb_main(int argc, char **argv, FILE *fp) {
struct timeval tv;
if (argc != 2 && fp == NULL) {
fprintf(stderr, "Usage: %s <rdb-file-name>\n", argv[0]);
exit(1);
}
gettimeofday(&tv, NULL);
init_genrand64(((long long) tv.tv_sec * 1000000 + tv.tv_usec) ^ getpid());
/* In order to call the loading functions we need to create the shared
* integer objects, however since this function may be called from
* an already initialized Redis instance, check if we really need to. */

View File

@ -62,6 +62,7 @@
#include "anet.h"
#include "ae.h"
#include "cli_common.h"
#include "mt19937-64.h"
#define UNUSED(V) ((void) V)
@ -8123,6 +8124,7 @@ static sds askPassword(const char *msg) {
int main(int argc, char **argv) {
int firstarg;
struct timeval tv;
config.hostip = sdsnew("127.0.0.1");
config.hostport = 6379;
@ -8219,6 +8221,9 @@ int main(int argc, char **argv) {
}
#endif
gettimeofday(&tv, NULL);
init_genrand64(((long long) tv.tv_sec * 1000000 + tv.tv_usec) ^ getpid());
/* Cluster Manager mode */
if (CLUSTER_MANAGER_MODE()) {
clusterManagerCommandProc *proc = validateClusterManagerCommand();

View File

@ -34,6 +34,7 @@
#include "bio.h"
#include "latency.h"
#include "atomicvar.h"
#include "mt19937-64.h"
#include <time.h>
#include <signal.h>
@ -5452,6 +5453,7 @@ int main(int argc, char **argv) {
srand(time(NULL)^getpid());
srandom(time(NULL)^getpid());
gettimeofday(&tv,NULL);
init_genrand64(((long long) tv.tv_sec * 1000000 + tv.tv_usec) ^ getpid());
crc64_init();
uint8_t hashseed[16];