Added INFO LATENCYSTATS section: latency by percentile distribution/latency by cumulative distribution of latencies (#9462)

# Short description

The Redis extended latency stats track per command latencies and enables:
- exporting the per-command percentile distribution via the `INFO LATENCYSTATS` command.
  **( percentile distribution is not mergeable between cluster nodes ).**
- exporting the per-command cumulative latency distributions via the `LATENCY HISTOGRAM` command.
  Using the cumulative distribution of latencies we can merge several stats from different cluster nodes
  to calculate aggregate metrics .

By default, the extended latency monitoring is enabled since the overhead of keeping track of the
command latency is very small.
 
If you don't want to track extended latency metrics, you can easily disable it at runtime using the command:
 - `CONFIG SET latency-tracking no`

By default, the exported latency percentiles are the p50, p99, and p999.
You can alter them at runtime using the command:
- `CONFIG SET latency-tracking-info-percentiles "0.0 50.0 100.0"`


## Some details:
- The total size per histogram should sit around 40 KiB. We only allocate those 40KiB when a command
  was called for the first time.
- With regards to the WRITE overhead As seen below, there is no measurable overhead on the achievable
  ops/sec or full latency spectrum on the client. Including also the measured redis-benchmark for unstable
  vs this branch. 
- We track from 1 nanosecond to 1 second ( everything above 1 second is considered +Inf )

## `INFO LATENCYSTATS` exposition format

   - Format: `latency_percentiles_usec_<CMDNAME>:p0=XX,p50....` 

## `LATENCY HISTOGRAM [command ...]` exposition format

Return a cumulative distribution of latencies in the format of a histogram for the specified command names.

The histogram is composed of a map of time buckets:
- Each representing a latency range, between 1 nanosecond and roughly 1 second.
- Each bucket covers twice the previous bucket's range.
- Empty buckets are not printed.
- Everything above 1 sec is considered +Inf.
- At max there will be log2(1000000000)=30 buckets

We reply a map for each command in the format:
`<command name> : { `calls`: <total command calls> , `histogram` : { <bucket 1> : latency , < bucket 2> : latency, ...  } }`

Co-authored-by: Oran Agra <oran@redislabs.com>
This commit is contained in:
filipe oliveira 2022-01-05 12:01:05 +00:00 committed by GitHub
parent 4d3c4cfac7
commit 5dd15443ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 568 additions and 6 deletions

1
.gitignore vendored
View File

@ -28,6 +28,7 @@ src/nodes.conf
deps/lua/src/lua
deps/lua/src/luac
deps/lua/src/liblua.a
deps/hdr_histogram/libhdrhistogram.a
tests/tls/*
.make-*
.prerequisites

View File

@ -9,12 +9,21 @@ DEBUG= -g
R_CC=$(CC) $(R_CFLAGS)
R_LD=$(CC) $(R_LDFLAGS)
hdr_histogram.o: hdr_histogram.h hdr_histogram.c
AR= ar
ARFLAGS= rcs
libhdrhistogram.a: hdr_histogram.o hdr_alloc.o
$(AR) $(ARFLAGS) $@ $+
hdr_alloc.o: hdr_alloc.h hdr_alloc.c
hdr_histogram.o: hdr_alloc.o hdr_histogram.h hdr_histogram.c
.c.o:
$(R_CC) -c $<
clean:
rm -f *.o
rm -f *.a

34
deps/hdr_histogram/hdr_alloc.c vendored Normal file
View File

@ -0,0 +1,34 @@
/**
* hdr_alloc.c
* Written by Filipe Oliveira and released to the public domain,
* as explained at http://creativecommons.org/publicdomain/zero/1.0/
*/
#include "hdr_alloc.h"
#include <stdlib.h>
hdrAllocFuncs hdrAllocFns = {
.mallocFn = malloc,
.callocFn = calloc,
.reallocFn = realloc,
.freeFn = free,
};
/* Override hdr' allocators with ones supplied by the user */
hdrAllocFuncs hdrSetAllocators(hdrAllocFuncs *override) {
hdrAllocFuncs orig = hdrAllocFns;
hdrAllocFns = *override;
return orig;
}
/* Reset allocators to use build time defaults */
void hdrResetAllocators(void) {
hdrAllocFns = (hdrAllocFuncs){
.mallocFn = malloc,
.callocFn = calloc,
.reallocFn = realloc,
.freeFn = free,
};
}

47
deps/hdr_histogram/hdr_alloc.h vendored Normal file
View File

@ -0,0 +1,47 @@
/**
* hdr_alloc.h
* Written by Filipe Oliveira and released to the public domain,
* as explained at http://creativecommons.org/publicdomain/zero/1.0/
*
* Allocator selection.
*
* This file is used in order to change the HdrHistogram allocator at run
* time. */
#ifndef HDR_ALLOC_H
#define HDR_ALLOC_H
#include <stddef.h> /* for size_t */
#include <stdint.h>
/* Structure pointing to our actually configured allocators */
typedef struct hdrAllocFuncs {
void *(*mallocFn)(size_t);
void *(*callocFn)(size_t, size_t);
void *(*reallocFn)(void *, size_t);
void (*freeFn)(void *);
} hdrAllocFuncs;
/* hdr' configured allocator function pointer struct */
extern hdrAllocFuncs hdrAllocFns;
hdrAllocFuncs hdrSetAllocators(hdrAllocFuncs *ha);
void hdrResetAllocators(void);
static inline void *hdr_malloc(size_t size) {
return hdrAllocFns.mallocFn(size);
}
static inline void *hdr_calloc(size_t nmemb, size_t size) {
return hdrAllocFns.callocFn(nmemb, size);
}
static inline void *hdr_realloc(void *ptr, size_t size) {
return hdrAllocFns.reallocFn(ptr, size);
}
static inline void hdr_free(void *ptr) {
hdrAllocFns.freeFn(ptr);
}
#endif /* HDR_ALLOC_H */

View File

@ -15,6 +15,12 @@
#include "hdr_histogram.h"
#include "hdr_atomic.h"
#include "hdr_alloc.h"
#define malloc hdr_malloc
#define calloc hdr_calloc
#define free hdr_free
#define realloc hdr_realloc
/* ###### ####### ## ## ## ## ######## ###### */
/* ## ## ## ## ## ## ### ## ## ## ## */

View File

@ -13,7 +13,6 @@
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
struct hdr_histogram
{
int64_t lowest_trackable_value;

View File

@ -1769,6 +1769,20 @@ slowlog-max-len 128
# "CONFIG SET latency-monitor-threshold <milliseconds>" if needed.
latency-monitor-threshold 0
################################ LATENCY TRACKING ##############################
# The Redis extended latency monitoring tracks the per command latencies and enables
# exporting the percentile distribution via the INFO latencystats command,
# and cumulative latency distributions (histograms) via the LATENCY command.
#
# By default, the extended latency monitoring is enabled since the overhead
# of keeping track of the command latency is very small.
# latency-tracking yes
# By default the exported latency percentiles via the INFO latencystats command
# are the p50, p99, and p999.
# latency-tracking-info-percentiles 50.0 99.0 99.9
############################# EVENT NOTIFICATION ##############################
# Redis can notify Pub/Sub clients about events happening in the key space.

View File

@ -365,7 +365,7 @@ endif
# redis-server
$(REDIS_SERVER_NAME): $(REDIS_SERVER_OBJ)
$(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/lua/src/liblua.a $(FINAL_LIBS)
$(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/lua/src/liblua.a ../deps/hdr_histogram/libhdrhistogram.a $(FINAL_LIBS)
# redis-sentinel
$(REDIS_SENTINEL_NAME): $(REDIS_SERVER_NAME)
@ -385,7 +385,7 @@ $(REDIS_CLI_NAME): $(REDIS_CLI_OBJ)
# redis-benchmark
$(REDIS_BENCHMARK_NAME): $(REDIS_BENCHMARK_OBJ)
$(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/hdr_histogram/hdr_histogram.o $(FINAL_LIBS)
$(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/hdr_histogram/libhdrhistogram.a $(FINAL_LIBS)
DEP = $(REDIS_SERVER_OBJ:%.o=%.d) $(REDIS_CLI_OBJ:%.o=%.d) $(REDIS_BENCHMARK_OBJ:%.o=%.d)
-include $(DEP)

View File

@ -111,7 +111,8 @@ void blockClient(client *c, int btype) {
void updateStatsOnUnblock(client *c, long blocked_us, long reply_us){
const ustime_t total_cmd_duration = c->duration + blocked_us + reply_us;
c->lastcmd->microseconds += total_cmd_duration;
if (server.latency_tracking_enabled)
updateCommandLatencyHistogram(&(c->lastcmd->latency_histogram), total_cmd_duration*1000);
/* Log the command into the Slow log if needed. */
slowlogPushCurrentCommand(c, c->lastcmd, total_cmd_duration);
/* Log the reply duration event. */

View File

@ -4105,6 +4105,20 @@ struct redisCommandArg LATENCY_GRAPH_Args[] = {
/* LATENCY HELP hints */
#define LATENCY_HELP_Hints NULL
/********** LATENCY HISTOGRAM ********************/
/* LATENCY HISTOGRAM history */
#define LATENCY_HISTOGRAM_History NULL
/* LATENCY HISTOGRAM hints */
#define LATENCY_HISTOGRAM_Hints NULL
/* LATENCY HISTOGRAM argument table */
struct redisCommandArg LATENCY_HISTOGRAM_Args[] = {
{"command",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL|CMD_ARG_MULTIPLE},
{0}
};
/********** LATENCY HISTORY ********************/
/* LATENCY HISTORY history */
@ -4146,6 +4160,7 @@ struct redisCommand LATENCY_Subcommands[] = {
{"doctor","Return a human readable latency analysis report.","O(1)","2.8.13",CMD_DOC_NONE,NULL,NULL,COMMAND_GROUP_SERVER,LATENCY_DOCTOR_History,LATENCY_DOCTOR_Hints,latencyCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0},
{"graph","Return a latency graph for the event.","O(1)","2.8.13",CMD_DOC_NONE,NULL,NULL,COMMAND_GROUP_SERVER,LATENCY_GRAPH_History,LATENCY_GRAPH_Hints,latencyCommand,3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0,.args=LATENCY_GRAPH_Args},
{"help","Show helpful text about the different subcommands.","O(1)","2.8.13",CMD_DOC_NONE,NULL,NULL,COMMAND_GROUP_SERVER,LATENCY_HELP_History,LATENCY_HELP_Hints,latencyCommand,2,CMD_LOADING|CMD_STALE,0},
{"histogram","Return the cumulative distribution of latencies of a subset of commands or all.","O(N) where N is the number of commands with latency information being retrieved.","7.0.0",CMD_DOC_NONE,NULL,NULL,COMMAND_GROUP_SERVER,LATENCY_HISTOGRAM_History,LATENCY_HISTOGRAM_Hints,latencyCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0,.args=LATENCY_HISTOGRAM_Args},
{"history","Return timestamp-latency samples for the event.","O(1)","2.8.13",CMD_DOC_NONE,NULL,NULL,COMMAND_GROUP_SERVER,LATENCY_HISTORY_History,LATENCY_HISTORY_Hints,latencyCommand,3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0,.args=LATENCY_HISTORY_Args},
{"latest","Return the latest latency samples for all events.","O(1)","2.8.13",CMD_DOC_NONE,NULL,NULL,COMMAND_GROUP_SERVER,LATENCY_LATEST_History,LATENCY_LATEST_Hints,latencyCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0},
{"reset","Reset latency data for one or more events.","O(1)","2.8.13",CMD_DOC_NONE,NULL,NULL,COMMAND_GROUP_SERVER,LATENCY_RESET_History,LATENCY_RESET_Hints,latencyCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,0,.args=LATENCY_RESET_Args},

View File

@ -0,0 +1,25 @@
{
"HISTOGRAM": {
"summary": "Return the cumulative distribution of latencies of a subset of commands or all.",
"complexity": "O(N) where N is the number of commands with latency information being retrieved.",
"group": "server",
"since": "7.0.0",
"arity": -2,
"container": "LATENCY",
"function": "latencyCommand",
"command_flags": [
"ADMIN",
"NOSCRIPT",
"LOADING",
"STALE"
],
"arguments": [
{
"name": "COMMAND",
"type": "string",
"optional": true,
"multiple": true
}
]
}
}

View File

@ -2660,6 +2660,71 @@ int allowProtectedAction(int config, client *c) {
(config == PROTECTED_ACTION_ALLOWED_LOCAL && islocalClient(c));
}
static int setConfigLatencyTrackingInfoPercentilesOutputOption(typeData data, sds *argv, int argc, const char **err) {
UNUSED(data);
zfree(server.latency_tracking_info_percentiles);
server.latency_tracking_info_percentiles = NULL;
server.latency_tracking_info_percentiles_len = argc;
/* Special case: treat single arg "" as zero args indicating empty percentile configuration */
if (argc == 1 && sdslen(argv[0]) == 0)
server.latency_tracking_info_percentiles_len = 0;
else
server.latency_tracking_info_percentiles = zmalloc(sizeof(double)*argc);
for (int j = 0; j < server.latency_tracking_info_percentiles_len; j++) {
double percentile;
if (!string2d(argv[j], sdslen(argv[j]), &percentile)) {
*err = "Invalid latency-tracking-info-percentiles parameters";
goto configerr;
}
if (percentile > 100.0 || percentile < 0.0) {
*err = "latency-tracking-info-percentiles parameters should sit between [0.0,100.0]";
goto configerr;
}
server.latency_tracking_info_percentiles[j] = percentile;
}
return 1;
configerr:
zfree(server.latency_tracking_info_percentiles);
server.latency_tracking_info_percentiles = NULL;
server.latency_tracking_info_percentiles_len = 0;
return 0;
}
static sds getConfigLatencyTrackingInfoPercentilesOutputOption(typeData data) {
UNUSED(data);
sds buf = sdsempty();
for (int j = 0; j < server.latency_tracking_info_percentiles_len; j++) {
buf = sdscatprintf(buf,"%f",
server.latency_tracking_info_percentiles[j]);
if (j != server.latency_tracking_info_percentiles_len-1)
buf = sdscatlen(buf," ",1);
}
return buf;
}
/* Rewrite the latency-tracking-info-percentiles option. */
void rewriteConfigLatencyTrackingInfoPercentilesOutputOption(typeData data, const char *name, struct rewriteConfigState *state) {
UNUSED(data);
sds line = sdsnew(name);
/* Rewrite latency-tracking-info-percentiles parameters,
* or an empty 'latency-tracking-info-percentiles ""' line to avoid the
* defaults from being used.
*/
if (!server.latency_tracking_info_percentiles_len) {
line = sdscat(line," \"\"");
} else {
for (int j = 0; j < server.latency_tracking_info_percentiles_len; j++) {
line = sdscatprintf(line," %f",
server.latency_tracking_info_percentiles[j]);
}
}
rewriteConfigRewriteLine(state,name,line,1);
}
standardConfig configs[] = {
/* Bool configs */
createBoolConfig("rdbchecksum", NULL, IMMUTABLE_CONFIG, server.rdb_checksum, 1, NULL, NULL),
@ -2705,6 +2770,7 @@ standardConfig configs[] = {
createBoolConfig("disable-thp", NULL, MODIFIABLE_CONFIG, server.disable_thp, 1, NULL, NULL),
createBoolConfig("cluster-allow-replica-migration", NULL, MODIFIABLE_CONFIG, server.cluster_allow_replica_migration, 1, NULL, NULL),
createBoolConfig("replica-announced", NULL, MODIFIABLE_CONFIG, server.replica_announced, 1, NULL, NULL),
createBoolConfig("latency-tracking", NULL, MODIFIABLE_CONFIG, server.latency_tracking_enabled, 1, NULL, NULL),
createBoolConfig("aof-disable-auto-gc", NULL, MODIFIABLE_CONFIG, server.aof_disable_auto_gc, 0, NULL, updateAofAutoGCEnabled),
/* String Configs */
@ -2857,6 +2923,7 @@ standardConfig configs[] = {
createSpecialConfig("notify-keyspace-events", NULL, MODIFIABLE_CONFIG, setConfigNotifyKeyspaceEventsOption, getConfigNotifyKeyspaceEventsOption, rewriteConfigNotifyKeyspaceEventsOption, NULL),
createSpecialConfig("bind", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigBindOption, getConfigBindOption, rewriteConfigBindOption, applyBind),
createSpecialConfig("replicaof", "slaveof", IMMUTABLE_CONFIG | MULTI_ARG_CONFIG, setConfigReplicaOfOption, getConfigReplicaOfOption, rewriteConfigReplicaOfOption, NULL),
createSpecialConfig("latency-tracking-info-percentiles", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigLatencyTrackingInfoPercentilesOutputOption, getConfigLatencyTrackingInfoPercentilesOutputOption, rewriteConfigLatencyTrackingInfoPercentilesOutputOption, NULL),
/* NULL Terminator */
{NULL}

View File

@ -490,6 +490,78 @@ sds createLatencyReport(void) {
/* ---------------------- Latency command implementation -------------------- */
/* latencyCommand() helper to produce a map of time buckets,
* each representing a latency range,
* between 1 nanosecond and roughly 1 second.
* Each bucket covers twice the previous bucket's range.
* Empty buckets are not printed.
* Everything above 1 sec is considered +Inf.
* At max there will be log2(1000000000)=30 buckets */
void fillCommandCDF(client *c, struct hdr_histogram* histogram) {
addReplyMapLen(c,2);
addReplyBulkCString(c,"calls");
addReplyLongLong(c,(long long) histogram->total_count);
addReplyBulkCString(c,"histogram_usec");
void *replylen = addReplyDeferredLen(c);
int samples = 0;
struct hdr_iter iter;
hdr_iter_log_init(&iter,histogram,1024,2);
int64_t previous_count = 0;
while (hdr_iter_next(&iter)) {
const int64_t micros = iter.highest_equivalent_value / 1000;
const int64_t cumulative_count = iter.cumulative_count;
if(cumulative_count > previous_count){
addReplyLongLong(c,(long long) micros);
addReplyLongLong(c,(long long) cumulative_count);
samples++;
}
previous_count = cumulative_count;
}
setDeferredMapLen(c,replylen,samples);
}
/* latencyCommand() helper to produce for all commands,
* a per command cumulative distribution of latencies. */
void latencyAllCommandsFillCDF(client *c) {
dictIterator *di = dictGetSafeIterator(server.commands);
dictEntry *de;
struct redisCommand *cmd;
void *replylen = addReplyDeferredLen(c);
int command_with_data = 0;
while((de = dictNext(di)) != NULL) {
cmd = (struct redisCommand *) dictGetVal(de);
if (!cmd->latency_histogram)
continue;
addReplyBulkCString(c,cmd->name);
fillCommandCDF(c, cmd->latency_histogram);
command_with_data++;
}
dictReleaseIterator(di);
setDeferredMapLen(c,replylen,command_with_data);
}
/* latencyCommand() helper to produce for a specific command set,
* a per command cumulative distribution of latencies. */
void latencySpecificCommandsFillCDF(client *c) {
void *replylen = addReplyDeferredLen(c);
int command_with_data = 0;
for (int j = 2; j < c->argc; j++){
struct redisCommand *cmd = dictFetchValue(server.commands, c->argv[j]->ptr);
/* If the command does not exist we skip the reply */
if (cmd == NULL) {
continue;
}
/* If no latency info we reply with the same format as non empty histograms */
if (!cmd->latency_histogram) {
continue;
}
addReplyBulkCString(c,c->argv[j]->ptr);
fillCommandCDF(c, cmd->latency_histogram);
command_with_data++;
}
setDeferredMapLen(c,replylen,command_with_data);
}
/* latencyCommand() helper to produce a time-delay reply for all the samples
* in memory for the specified time series. */
void latencyCommandReplyWithSamples(client *c, struct latencyTimeSeries *ts) {
@ -582,6 +654,7 @@ sds latencyCommandGenSparkeline(char *event, struct latencyTimeSeries *ts) {
* LATENCY DOCTOR: returns a human readable analysis of instance latency.
* LATENCY GRAPH: provide an ASCII graph of the latency of the specified event.
* LATENCY RESET: reset data of a specified event or all the data if no event provided.
* LATENCY HISTOGRAM: return a cumulative distribution of latencies in the format of an histogram for the specified command names.
*/
void latencyCommand(client *c) {
struct latencyTimeSeries *ts;
@ -628,6 +701,13 @@ void latencyCommand(client *c) {
resets += latencyResetEvent(c->argv[j]->ptr);
addReplyLongLong(c,resets);
}
} else if (!strcasecmp(c->argv[1]->ptr,"histogram") && c->argc >= 2) {
/* LATENCY HISTOGRAM*/
if (c->argc == 2) {
latencyAllCommandsFillCDF(c);
} else {
latencySpecificCommandsFillCDF(c);
}
} else if (!strcasecmp(c->argv[1]->ptr,"help") && c->argc == 2) {
const char *help[] = {
"DOCTOR",
@ -641,6 +721,9 @@ void latencyCommand(client *c) {
"RESET [<event> ...]",
" Reset latency data of one or more <event> classes.",
" (default: reset all data for all event classes)",
"HISTOGRAM [COMMAND ...]",
" Return a cumulative distribution of latencies in the format of a histogram for the specified command names.",
" If no commands are specified then all histograms are replied.",
NULL
};
addReplyHelp(c, help);

View File

@ -9589,6 +9589,10 @@ void moduleUnregisterCommands(struct RedisModule *module) {
sdsfree((sds)cmd->summary);
sdsfree((sds)cmd->since);
sdsfree((sds)cmd->complexity);
if (cmd->latency_histogram) {
hdr_close(cmd->latency_histogram);
cmd->latency_histogram = NULL;
}
zfree(cmd->args);
zfree(cmd);
zfree(cp);

View File

@ -36,6 +36,7 @@
#include "atomicvar.h"
#include "mt19937-64.h"
#include "functions.h"
#include "hdr_alloc.h"
#include <time.h>
#include <signal.h>
@ -1781,6 +1782,13 @@ void initServerConfig(void) {
server.page_size = sysconf(_SC_PAGESIZE);
server.pause_cron = 0;
server.latency_tracking_enabled = 1;
server.latency_tracking_info_percentiles_len = 3;
server.latency_tracking_info_percentiles = zmalloc(sizeof(double)*(server.latency_tracking_info_percentiles_len));
server.latency_tracking_info_percentiles[0] = 50.0; /* p50 */
server.latency_tracking_info_percentiles[1] = 99.0; /* p99 */
server.latency_tracking_info_percentiles[2] = 99.9; /* p999 */
unsigned int lruclock = getLRUClock();
atomicSet(server.lruclock,lruclock);
resetServerSaveParams();
@ -1789,6 +1797,15 @@ void initServerConfig(void) {
appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
/* Specify the allocation function for the hdr histogram */
hdrAllocFuncs hdrallocfn = {
.mallocFn = zmalloc,
.callocFn = zcalloc_num,
.reallocFn = zrealloc,
.freeFn = zfree,
};
hdrSetAllocators(&hdrallocfn);
/* Replication related */
server.masterhost = NULL;
server.masterport = 6379;
@ -2610,6 +2627,10 @@ void populateCommandStructure(struct redisCommand *c) {
c->key_specs = c->key_specs_static;
c->key_specs_max = STATIC_KEY_SPECS_NUM;
/* We start with an unallocated histogram and only allocate memory when a command
* has been issued for the first time */
c->latency_histogram = NULL;
for (int i = 0; i < STATIC_KEY_SPECS_NUM; i++) {
if (c->key_specs[i].begin_search_type == KSPEC_BS_INVALID)
break;
@ -2683,6 +2704,10 @@ void resetCommandTableStats(dict* commands) {
c->calls = 0;
c->rejected_calls = 0;
c->failed_calls = 0;
if(c->latency_histogram) {
hdr_close(c->latency_histogram);
c->latency_histogram = NULL;
}
if (c->subcommands_dict)
resetCommandTableStats(c->subcommands_dict);
}
@ -2918,6 +2943,19 @@ void slowlogPushCurrentCommand(client *c, struct redisCommand *cmd, ustime_t dur
slowlogPushEntryIfNeeded(c,argv,argc,duration);
}
/* This function is called in order to update the total command histogram duration.
* The latency unit is nano-seconds.
* If needed it will allocate the histogram memory and trim the duration to the upper/lower tracking limits*/
void updateCommandLatencyHistogram(struct hdr_histogram **latency_histogram, int64_t duration_hist){
if (duration_hist < LATENCY_HISTOGRAM_MIN_VALUE)
duration_hist=LATENCY_HISTOGRAM_MIN_VALUE;
if (duration_hist>LATENCY_HISTOGRAM_MAX_VALUE)
duration_hist=LATENCY_HISTOGRAM_MAX_VALUE;
if (*latency_histogram==NULL)
hdr_init(LATENCY_HISTOGRAM_MIN_VALUE,LATENCY_HISTOGRAM_MAX_VALUE,LATENCY_HISTOGRAM_PRECISION,latency_histogram);
hdr_record_value(*latency_histogram,duration_hist);
}
/* Handle the alsoPropagate() API to handle commands that want to propagate
* multiple separated commands. Note that alsoPropagate() is not affected
* by CLIENT_PREVENT_PROP flag. */
@ -3104,6 +3142,9 @@ void call(client *c, int flags) {
if (flags & CMD_CALL_STATS) {
real_cmd->microseconds += duration;
real_cmd->calls++;
/* If the client is blocked we will handle latency stats when it is unblocked. */
if (server.latency_tracking_enabled && !(c->flags & CLIENT_BLOCKED))
updateCommandLatencyHistogram(&(real_cmd->latency_histogram), duration*1000);
}
/* Propagate the command into the AOF and replication link.
@ -4573,6 +4614,19 @@ void bytesToHuman(char *s, unsigned long long n) {
}
}
/* Fill percentile distribution of latencies. */
sds fillPercentileDistributionLatencies(sds info, const char* histogram_name, struct hdr_histogram* histogram) {
info = sdscatfmt(info,"latency_percentiles_usec_%s:",histogram_name);
for (int j = 0; j < server.latency_tracking_info_percentiles_len; j++) {
info = sdscatprintf(info,"p%f=%.3f", server.latency_tracking_info_percentiles[j],
((double)hdr_value_at_percentile(histogram,server.latency_tracking_info_percentiles[j]))/1000.0f);
if (j != server.latency_tracking_info_percentiles_len-1)
info = sdscatlen(info,",",1);
}
info = sdscatprintf(info,"\r\n");
return info;
}
sds getFullCommandName(struct redisCommand *cmd) {
if (!cmd->parent) {
return sdsnew(cmd->name);
@ -5346,6 +5400,27 @@ sds genRedisInfoString(const char *section) {
raxStop(&ri);
}
/* Latency by percentile distribution per command */
if (allsections || !strcasecmp(section,"latencystats")) {
if (sections++) info = sdscat(info,"\r\n");
info = sdscatprintf(info, "# Latencystats\r\n");
if (server.latency_tracking_enabled) {
struct redisCommand *c;
dictEntry *de;
dictIterator *di;
di = dictGetSafeIterator(server.commands);
while((de = dictNext(di)) != NULL) {
char *tmpsafe;
c = (struct redisCommand *) dictGetVal(de);
if (!c->latency_histogram)
continue;
info = fillPercentileDistributionLatencies(info,getSafeInfoString(c->name, strlen(c->name), &tmpsafe),c->latency_histogram);
if (tmpsafe != NULL) zfree(tmpsafe);
}
dictReleaseIterator(di);
}
}
/* Cluster */
if (allsections || defsections || !strcasecmp(section,"cluster")) {
if (sections++) info = sdscat(info,"\r\n");

View File

@ -50,6 +50,7 @@
#include <sys/socket.h>
#include <lua.h>
#include <signal.h>
#include "hdr_histogram.h"
#ifdef HAVE_LIBSYSTEMD
#include <systemd/sd-daemon.h>
@ -571,6 +572,13 @@ typedef enum {
#define serverAssert(_e) ((_e)?(void)0 : (_serverAssert(#_e,__FILE__,__LINE__),redis_unreachable()))
#define serverPanic(...) _serverPanic(__FILE__,__LINE__,__VA_ARGS__),redis_unreachable()
/* latency histogram per command init settings */
#define LATENCY_HISTOGRAM_MIN_VALUE 1L /* >= 1 nanosec */
#define LATENCY_HISTOGRAM_MAX_VALUE 1000000000L /* <= 1 secs */
#define LATENCY_HISTOGRAM_PRECISION 2 /* Maintain a value precision of 2 significant digits across LATENCY_HISTOGRAM_MIN_VALUE and LATENCY_HISTOGRAM_MAX_VALUE range.
* Value quantization within the range will thus be no larger than 1/100th (or 1%) of any value.
* The total size per histogram should sit around 40 KiB Bytes. */
/*-----------------------------------------------------------------------------
* Data types
*----------------------------------------------------------------------------*/
@ -1583,6 +1591,9 @@ struct redisServer {
char *proc_title_template; /* Process title template format */
clientBufferLimitsConfig client_obuf_limits[CLIENT_TYPE_OBUF_COUNT];
int pause_cron; /* Don't run cron tasks (debug) */
int latency_tracking_enabled; /* 1 if extended latency tracking is enabled, 0 otherwise. */
double *latency_tracking_info_percentiles; /* Extended latency tracking info output percentile list configuration. */
int latency_tracking_info_percentiles_len;
/* AOF persistence */
int aof_enabled; /* AOF configuration */
int aof_state; /* AOF_(ON|OFF|WAIT_REWRITE) */
@ -2175,6 +2186,7 @@ struct redisCommand {
ACLs. A connection is able to execute a given command if
the user associated to the connection has this command
bit set in the bitmap of allowed commands. */
struct hdr_histogram* latency_histogram; /*points to the command latency command histogram (unit of time nanosecond) */
keySpec *key_specs;
keySpec legacy_range_key_spec; /* The legacy (first,last,step) key spec is
* still maintained (if applicable) so that
@ -2782,6 +2794,7 @@ void preventCommandPropagation(client *c);
void preventCommandAOF(client *c);
void preventCommandReplication(client *c);
void slowlogPushCurrentCommand(client *c, struct redisCommand *cmd, ustime_t duration);
void updateCommandLatencyHistogram(struct hdr_histogram** latency_histogram, int64_t duration_hist);
int prepareForShutdown(int flags);
void replyToClientsBlockedOnShutdown(void);
int abortShutdown(void);

View File

@ -181,6 +181,20 @@ void *ztrycalloc_usable(size_t size, size_t *usable) {
#endif
}
/* Allocate memory and zero it or panic.
* We need this wrapper to have a calloc compatible signature */
void *zcalloc_num(size_t num, size_t size) {
/* Ensure that the arguments to calloc(), when multiplied, do not wrap.
* Division operations are susceptible to divide-by-zero errors so we also check it. */
if ((size == 0) || (num > SIZE_MAX/size)) {
zmalloc_oom_handler(SIZE_MAX);
return NULL;
}
void *ptr = ztrycalloc_usable(num*size, NULL);
if (!ptr) zmalloc_oom_handler(num*size);
return ptr;
}
/* Allocate memory and zero it or panic */
void *zcalloc(size_t size) {
void *ptr = ztrycalloc_usable(size, NULL);

View File

@ -98,6 +98,7 @@
void *zmalloc(size_t size);
void *zcalloc(size_t size);
void *zcalloc_num(size_t num, size_t size);
void *zrealloc(void *ptr, size_t size);
void *ztrymalloc(size_t size);
void *ztrycalloc(size_t size);

View File

@ -631,6 +631,12 @@ proc errorrstat {cmd r} {
}
}
proc latencyrstat_percentiles {cmd r} {
if {[regexp "\r\nlatency_percentiles_usec_$cmd:(.*?)\r\n" [$r info latencystats] _ value]} {
set _ $value
}
}
proc generate_fuzzy_traffic_on_key {key duration} {
# Commands per type, blocking commands removed
# TODO: extract these from help.h or elsewhere, and improve to include other types

View File

@ -6,9 +6,90 @@ proc errorstat {cmd} {
return [errorrstat $cmd r]
}
proc latency_percentiles_usec {cmd} {
return [latencyrstat_percentiles $cmd r]
}
start_server {tags {"info" "external:skip"}} {
start_server {} {
test {latencystats: disable/enable} {
r config resetstat
r CONFIG SET latency-tracking no
r set a b
assert_match {} [latency_percentiles_usec set]
r CONFIG SET latency-tracking yes
r set a b
assert_match {*p50.000000=*,p99.000000=*,p99.900000=*} [latency_percentiles_usec set]
r config resetstat
assert_match {} [latency_percentiles_usec set]
}
test {latencystats: configure percentiles} {
r config resetstat
assert_match {} [latency_percentiles_usec set]
r CONFIG SET latency-tracking yes
r SET a b
r GET a
assert_match {*p50.000000=*,p99.000000=*,p99.900000=*} [latency_percentiles_usec set]
assert_match {*p50.000000=*,p99.000000=*,p99.900000=*} [latency_percentiles_usec get]
r CONFIG SET latency-tracking-info-percentiles "0.0 50.0 100.0"
assert_match [r config get latency-tracking-info-percentiles] {latency-tracking-info-percentiles {0.000000 50.000000 100.000000}}
assert_match {*p0.000000=*,p50.000000=*,p100.000000=*} [latency_percentiles_usec set]
assert_match {*p0.000000=*,p50.000000=*,p100.000000=*} [latency_percentiles_usec get]
r config resetstat
assert_match {} [latency_percentiles_usec set]
}
test {latencystats: bad configure percentiles} {
r config resetstat
set configlatencyline [r config get latency-tracking-info-percentiles]
catch {r CONFIG SET latency-tracking-info-percentiles "10.0 50.0 a"} e
assert_match {ERR CONFIG SET failed*} $e
assert_equal [s total_error_replies] 1
assert_match [r config get latency-tracking-info-percentiles] $configlatencyline
catch {r CONFIG SET latency-tracking-info-percentiles "10.0 50.0 101.0"} e
assert_match {ERR CONFIG SET failed*} $e
assert_equal [s total_error_replies] 2
assert_match [r config get latency-tracking-info-percentiles] $configlatencyline
r config resetstat
assert_match {} [errorstat ERR]
}
test {latencystats: blocking commands} {
r config resetstat
r CONFIG SET latency-tracking yes
r CONFIG SET latency-tracking-info-percentiles "50.0 99.0 99.9"
set rd [redis_deferring_client]
r del list1{t}
$rd blpop list1{t} 0
wait_for_blocked_client
r lpush list1{t} a
assert_equal [$rd read] {list1{t} a}
$rd blpop list1{t} 0
wait_for_blocked_client
r lpush list1{t} b
assert_equal [$rd read] {list1{t} b}
assert_match {*p50.000000=*,p99.000000=*,p99.900000=*} [latency_percentiles_usec blpop]
$rd close
}
test {latencystats: measure latency} {
r config resetstat
r CONFIG SET latency-tracking yes
r CONFIG SET latency-tracking-info-percentiles "50.0"
r DEBUG sleep 0.05
r SET k v
set latencystatline_debug [latency_percentiles_usec debug]
set latencystatline_set [latency_percentiles_usec set]
regexp "p50.000000=(.+\..+)" $latencystatline_debug -> p50_debug
regexp "p50.000000=(.+\..+)" $latencystatline_set -> p50_set
assert {$p50_debug >= 50000}
assert {$p50_set >= 0}
assert {$p50_debug >= $p50_set}
} {} {needs:debug}
test {errorstats: failed call authentication error} {
r config resetstat
assert_match {} [errorstat ERR]

View File

@ -1,8 +1,59 @@
proc latency_histogram {cmd} {
return [lindex [r latency histogram $cmd] 1]
}
start_server {tags {"latency-monitor needs:latency"}} {
# Set a threshold high enough to avoid spurious latency events.
r config set latency-monitor-threshold 200
r latency reset
test {LATENCY HISTOGRAM with empty histogram} {
r config resetstat
assert_match {} [latency_histogram set]
assert {[llength [r latency histogram]] == 0}
}
test {LATENCY HISTOGRAM all commands} {
r config resetstat
r set a b
r set c d
assert_match {calls 2 histogram_usec *} [latency_histogram set]
}
test {LATENCY HISTOGRAM with a subset of commands} {
r config resetstat
r set a b
r set c d
r get a
r hset f k v
r hgetall f
assert_match {calls 2 histogram_usec *} [latency_histogram set]
assert_match {calls 1 histogram_usec *} [latency_histogram hset]
assert_match {calls 1 histogram_usec *} [latency_histogram hgetall]
assert_match {calls 1 histogram_usec *} [latency_histogram get]
assert {[llength [r latency histogram]] == 8}
assert {[llength [r latency histogram set get]] == 4}
}
test {LATENCY HISTOGRAM command} {
r config resetstat
r set a b
r get a
assert {[llength [r latency histogram]] == 4}
assert {[llength [r latency histogram set get]] == 4}
}
test {LATENCY HISTOGRAM with wrong command name skips the invalid one} {
r config resetstat
assert {[llength [r latency histogram blabla]] == 0}
assert {[llength [r latency histogram blabla blabla2 set get]] == 0}
r set a b
r get a
assert_match {calls 1 histogram_usec *} [lindex [r latency histogram blabla blabla2 set get] 1]
assert_match {calls 1 histogram_usec *} [lindex [r latency histogram blabla blabla2 set get] 3]
assert {[string length [r latency histogram blabla set get]] > 0}
}
test {Test latency events logging} {
r debug sleep 0.3
after 1100

View File

@ -326,6 +326,10 @@ proc test_slave_buffers {test_name cmd_count payload_len limit_memory pipeline}
$master config set client-output-buffer-limit "replica 100000000 100000000 300"
$master config set repl-backlog-size [expr {10*1024}]
# disable latency tracking
$master config set latency-tracking no
$slave config set latency-tracking no
$slave slaveof $master_host $master_port
wait_for_condition 50 100 {
[s 0 master_link_status] eq {up}
@ -412,6 +416,7 @@ test_slave_buffers "replica buffer don't induce eviction" 100000 100 1 0
start_server {tags {"maxmemory external:skip"}} {
test {Don't rehash if used memory exceeds maxmemory after rehash} {
r config set latency-tracking no
r config set maxmemory 0
r config set maxmemory-policy allkeys-random
@ -432,6 +437,7 @@ start_server {tags {"maxmemory external:skip"}} {
start_server {tags {"maxmemory external:skip"}} {
test {client tracking don't cause eviction feedback loop} {
r config set latency-tracking no
r config set maxmemory 0
r config set maxmemory-policy allkeys-lru
r config set maxmemory-eviction-tenacity 100

View File

@ -2,6 +2,10 @@ set testmodule [file normalize tests/modules/blockonbackground.so]
source tests/support/util.tcl
proc latency_percentiles_usec {cmd} {
return [latencyrstat_percentiles $cmd r]
}
start_server {tags {"modules"}} {
r module load $testmodule
@ -16,16 +20,21 @@ start_server {tags {"modules"}} {
assert_equal [r slowlog len] 0
}
r config resetstat
r config set latency-tracking yes
r config set latency-tracking-info-percentiles "50.0"
r block.debug 200 10000
if {!$::no_latency} {
assert_equal [r slowlog len] 1
}
set cmdstatline [cmdrstat block.debug r]
set latencystatline_debug [latency_percentiles_usec block.debug]
regexp "calls=1,usec=(.*?),usec_per_call=(.*?),rejected_calls=0,failed_calls=0" $cmdstatline usec usec_per_call
regexp "calls=1,usec=(.*?),usec_per_call=(.*?),rejected_calls=0,failed_calls=0" $cmdstatline -> usec usec_per_call
regexp "p50.000000=(.+\..+)" $latencystatline_debug -> p50
assert {$usec >= 100000}
assert {$usec_per_call >= 100000}
assert {$p50 >= 100000}
}
test { blocked clients time tracking - check blocked command that uses RedisModule_BlockedClientMeasureTimeStart() is tracking background time even in timeout } {

View File

@ -115,6 +115,7 @@ start_server {tags {"obuf-limits external:skip"}} {
}
test {No response for single command if client output buffer hard limit is enforced} {
r config set latency-tracking no
r config set client-output-buffer-limit {normal 100000 0 0}
# Total size of all items must be more than 100k
set item [string repeat "x" 1000]