redis/src/rdb.h

/*
* Copyright (c) 2009-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2) or the Server Side Public License v1 (SSPLv1).
*/
#ifndef __RDB_H
#define __RDB_H
#include <stdio.h>
#include "rio.h"
/* TBD: include only necessary headers. */
#include "server.h"
/* The current RDB version. When the format changes in a way that is no longer
* backward compatible this number gets incremented. */
#define RDB_VERSION 12
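
/* Illustrative sketch (not part of this header): one way a loader can honor
 * RDB_VERSION, assuming the version is taken from the four digit suffix of the
 * "REDIS00xx" magic string at the start of the file (needs <string.h> and
 * <stdlib.h>). Files written by a newer, incompatible format are refused,
 * while older ones are accepted.
 *
 *   int rdbVersionOkExample(const char *magic9) {
 *       char buf[5];
 *       memcpy(buf, magic9 + 5, 4);   // the digits after "REDIS"
 *       buf[4] = '\0';
 *       int rdbver = atoi(buf);
 *       return rdbver >= 1 && rdbver <= RDB_VERSION;
 *   }
 */
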
/* Defines related to the dump file format. To store 32 bit lengths for short
 * keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpret the length:
 *
 * 00|XXXXXX => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|XXXXXX XXXXXXXX => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => A full 32 bit len in net byte order will follow
 * 10|000001 [64 bit integer] => A full 64 bit len in net byte order will follow
 * 11|OBKIND this means: specially encoded object will follow. The six-bit
 *           number specifies the kind of object that follows.
 *           See the RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside. */
#define RDB_6BITLEN 0
#define RDB_14BITLEN 1
#define RDB_32BITLEN 0x80
#define RDB_64BITLEN 0x81
#define RDB_ENCVAL 3
#define RDB_LENERR UINT64_MAX
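
/* Illustrative sketch (the canonical implementation is rdbLoadLenByRef(),
 * declared below): decoding a length prefix according to the two most
 * significant bits of its first byte. rioRead() comes from rio.h; ntohu64() is
 * assumed to be a 64 bit network-to-host byte swap helper.
 *
 *   uint64_t loadLenExample(rio *rdb, int *isencoded) {
 *       unsigned char buf[2];
 *       if (isencoded) *isencoded = 0;
 *       if (rioRead(rdb, buf, 1) == 0) return RDB_LENERR;
 *       int type = (buf[0] & 0xC0) >> 6;
 *       if (type == RDB_ENCVAL) {              // 11|OBKIND: special encoding
 *           if (isencoded) *isencoded = 1;
 *           return buf[0] & 0x3F;
 *       } else if (type == RDB_6BITLEN) {      // 00|XXXXXX
 *           return buf[0] & 0x3F;
 *       } else if (type == RDB_14BITLEN) {     // 01|XXXXXX XXXXXXXX
 *           if (rioRead(rdb, buf + 1, 1) == 0) return RDB_LENERR;
 *           return ((buf[0] & 0x3F) << 8) | buf[1];
 *       } else if (buf[0] == RDB_32BITLEN) {   // 10|000000 + 32 bit big endian
 *           uint32_t len;
 *           if (rioRead(rdb, &len, 4) == 0) return RDB_LENERR;
 *           return ntohl(len);
 *       } else if (buf[0] == RDB_64BITLEN) {   // 10|000001 + 64 bit big endian
 *           uint64_t len;
 *           if (rioRead(rdb, &len, 8) == 0) return RDB_LENERR;
 *           return ntohu64(len);
 *       }
 *       return RDB_LENERR;                     // unknown length prefix
 *   }
 */
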
/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
#define RDB_ENC_INT8 0 /* 8 bit signed integer */
#define RDB_ENC_INT16 1 /* 16 bit signed integer */
#define RDB_ENC_INT32 2 /* 32 bit signed integer */
#define RDB_ENC_LZF 3 /* string compressed with LZF */
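
/* Illustrative sketch (not the canonical encoder): choosing the smallest
 * RDB_ENC_INT* form for a string value that is a valid integer. The integer
 * payload is stored little endian; values that do not fit in 32 bits are
 * written as a plain (or LZF compressed) string instead. Needs <stdint.h> for
 * the INT*_MIN/MAX limits.
 *
 *   int encodeIntegerExample(long long value, unsigned char enc[5]) {
 *       if (value >= INT8_MIN && value <= INT8_MAX) {
 *           enc[0] = (RDB_ENCVAL << 6) | RDB_ENC_INT8;
 *           enc[1] = value & 0xFF;
 *           return 2;
 *       } else if (value >= INT16_MIN && value <= INT16_MAX) {
 *           enc[0] = (RDB_ENCVAL << 6) | RDB_ENC_INT16;
 *           enc[1] = value & 0xFF;
 *           enc[2] = (value >> 8) & 0xFF;
 *           return 3;
 *       } else if (value >= INT32_MIN && value <= INT32_MAX) {
 *           enc[0] = (RDB_ENCVAL << 6) | RDB_ENC_INT32;
 *           enc[1] = value & 0xFF;
 *           enc[2] = (value >> 8) & 0xFF;
 *           enc[3] = (value >> 16) & 0xFF;
 *           enc[4] = (value >> 24) & 0xFF;
 *           return 5;
 *       }
 *       return 0;   // does not fit: store uncompressed or LZF compressed
 *   }
 */
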
/* Map object types to RDB object types. Macros starting with OBJ_ are for
* memory storage and may change. Instead RDB types must be fixed because
* we store them on disk. */
#define RDB_TYPE_STRING 0
#define RDB_TYPE_LIST 1
#define RDB_TYPE_SET 2
#define RDB_TYPE_ZSET 3
#define RDB_TYPE_HASH 4
#define RDB_TYPE_ZSET_2 5 /* ZSET version 2 with doubles stored in binary. */
#define RDB_TYPE_MODULE_PRE_GA 6 /* Used in 4.0 release candidates */
#define RDB_TYPE_MODULE_2 7 /* Module value with annotations for parsing without
the generating module being loaded. */
#define RDB_TYPE_HASH_ZIPMAP 9
#define RDB_TYPE_LIST_ZIPLIST 10
#define RDB_TYPE_SET_INTSET 11
#define RDB_TYPE_ZSET_ZIPLIST 12
#define RDB_TYPE_HASH_ZIPLIST 13
#define RDB_TYPE_LIST_QUICKLIST 14
#define RDB_TYPE_STREAM_LISTPACKS 15
#define RDB_TYPE_HASH_LISTPACK 16
#define RDB_TYPE_ZSET_LISTPACK 17
#define RDB_TYPE_LIST_QUICKLIST_2 18
#define RDB_TYPE_STREAM_LISTPACKS_2 19
#define RDB_TYPE_SET_LISTPACK 20
#define RDB_TYPE_STREAM_LISTPACKS_3 21
/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType(), and rdb_type_string[] */
/* Test if a type is an object type. */
#define rdbIsObjectType(t) (((t) >= 0 && (t) <= 7) || ((t) >= 9 && (t) <= 21))
/* Special RDB opcodes (saved/loaded with rdbSaveType/rdbLoadType). */
#define RDB_OPCODE_SLOT_INFO 244 /* Individual slot info, such as slot id and size (cluster mode only). */
#define RDB_OPCODE_FUNCTION2 245 /* function library data */
#define RDB_OPCODE_FUNCTION_PRE_GA 246 /* old function library data for 7.0 rc1 and rc2 */
#define RDB_OPCODE_MODULE_AUX 247 /* Module auxiliary data. */
#define RDB_OPCODE_IDLE 248 /* LRU idle time. */
#define RDB_OPCODE_FREQ 249 /* LFU frequency. */
#define RDB_OPCODE_AUX 250 /* RDB aux field. */
#define RDB_OPCODE_RESIZEDB 251 /* Hash table resize hint. */
#define RDB_OPCODE_EXPIRETIME_MS 252 /* Expire time in milliseconds. */
#define RDB_OPCODE_EXPIRETIME 253 /* Old expire time in seconds. */
#define RDB_OPCODE_SELECTDB 254 /* DB number of the following keys. */
#define RDB_OPCODE_EOF 255 /* End of the RDB file. */
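
/* Illustrative sketch (a heavily simplified view of what rdbLoadRio() does):
 * the top level load loop reads one type byte at a time. Special opcodes are
 * handled in place; anything else must satisfy rdbIsObjectType() and is
 * followed by a key and a serialized value. Most opcodes and all error
 * handling and bookkeeping are omitted; rdbver is the version read from the
 * file header.
 *
 *   int loadLoopExample(rio *rdb, int rdbver) {
 *       long long expiretime = -1;
 *       while (1) {
 *           int type = rdbLoadType(rdb);
 *           if (type == -1) return C_ERR;
 *           if (type == RDB_OPCODE_EOF) return C_OK;
 *           if (type == RDB_OPCODE_SELECTDB) {
 *               uint64_t dbid = rdbLoadLen(rdb, NULL);   // switch database
 *               (void)dbid;
 *               continue;
 *           }
 *           if (type == RDB_OPCODE_EXPIRETIME_MS) {
 *               expiretime = rdbLoadMillisecondTime(rdb, rdbver);
 *               continue;                     // applies to the next key
 *           }
 *           if (!rdbIsObjectType(type)) return C_ERR;    // corrupt payload
 *           robj *key = rdbLoadStringObject(rdb);
 *           robj *val = rdbLoadObject(type, rdb, key->ptr, 0, NULL);
 *           // ... add key/val to the keyspace, honoring expiretime ...
 *           expiretime = -1;
 *       }
 *   }
 */
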
/* Module serialized values sub opcodes */
#define RDB_MODULE_OPCODE_EOF 0 /* End of module value. */
#define RDB_MODULE_OPCODE_SINT 1 /* Signed integer. */
#define RDB_MODULE_OPCODE_UINT 2 /* Unsigned integer. */
#define RDB_MODULE_OPCODE_FLOAT 3 /* Float. */
#define RDB_MODULE_OPCODE_DOUBLE 4 /* Double. */
#define RDB_MODULE_OPCODE_STRING 5 /* String. */
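
/* Illustrative sketch (in the spirit of rdbLoadCheckModuleValue(), declared
 * below): because every part of a MODULE_2 value is prefixed with one of the
 * sub opcodes above and terminated by an EOF sub opcode, the value can be
 * walked and skipped even when the module that produced it is not loaded.
 *
 *   int skipModuleValueExample(rio *rdb) {
 *       while (1) {
 *           uint64_t opcode = rdbLoadLen(rdb, NULL);
 *           if (opcode == RDB_LENERR) return C_ERR;
 *           if (opcode == RDB_MODULE_OPCODE_EOF) return C_OK;
 *           if (opcode == RDB_MODULE_OPCODE_SINT ||
 *               opcode == RDB_MODULE_OPCODE_UINT) {
 *               if (rdbLoadLen(rdb, NULL) == RDB_LENERR) return C_ERR;
 *           } else if (opcode == RDB_MODULE_OPCODE_FLOAT) {
 *               float f;
 *               if (rdbLoadBinaryFloatValue(rdb, &f) == -1) return C_ERR;
 *           } else if (opcode == RDB_MODULE_OPCODE_DOUBLE) {
 *               double d;
 *               if (rdbLoadBinaryDoubleValue(rdb, &d) == -1) return C_ERR;
 *           } else if (opcode == RDB_MODULE_OPCODE_STRING) {
 *               sds s = rdbGenericLoadStringObject(rdb, RDB_LOAD_SDS, NULL);
 *               if (s == NULL) return C_ERR;
 *               sdsfree(s);
 *           } else {
 *               return C_ERR;   // unknown sub opcode
 *           }
 *       }
 *   }
 */
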
/* rdbLoad...() functions flags. */
#define RDB_LOAD_NONE 0
#define RDB_LOAD_ENC (1<<0)
#define RDB_LOAD_PLAIN (1<<1)
#define RDB_LOAD_SDS (1<<2)
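
/* Illustrative usage sketch: the flags select what rdbGenericLoadStringObject()
 * returns (this mirrors how the loader is assumed to use them; the exact
 * semantics are documented in rdb.c). RDB_LOAD_NONE and RDB_LOAD_ENC yield a
 * robj (ENC keeping a memory efficient encoding), RDB_LOAD_PLAIN a zmalloc()'d
 * buffer, and RDB_LOAD_SDS an sds string; for PLAIN/SDS the length is returned
 * through the lenptr argument.
 *
 *   size_t len;
 *   robj *o = rdbGenericLoadStringObject(rdb, RDB_LOAD_ENC, NULL);
 *   char *p = rdbGenericLoadStringObject(rdb, RDB_LOAD_PLAIN, &len);
 *   sds s   = rdbGenericLoadStringObject(rdb, RDB_LOAD_SDS, &len);
 */
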
/* Flags for the purpose of RDB save or load. */
#define RDBFLAGS_NONE 0 /* No special RDB loading or saving. */
#define RDBFLAGS_AOF_PREAMBLE (1<<0) /* Load/save the RDB as AOF preamble. */
#define RDBFLAGS_REPLICATION (1<<1) /* Load/save for SYNC. */
#define RDBFLAGS_ALLOW_DUP (1<<2) /* Allow duplicated keys when loading.*/
#define RDBFLAGS_FEED_REPL (1<<3) /* Feed replication stream when loading.*/
#define RDBFLAGS_KEEP_CACHE (1<<4) /* Don't reclaim cache after rdb file is generated */
/* When rdbLoadObject() returns NULL, the err flag is
* set to hold the type of error that occurred */
#define RDB_LOAD_ERR_EMPTY_KEY 1 /* Error of empty key */
#define RDB_LOAD_ERR_OTHER 2 /* Any other errors */
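
/* Illustrative sketch of the intended pattern when rdbLoadObject() returns
 * NULL (type, key and dbid are assumed to be in scope): an empty-key error may
 * be tolerated by simply dropping the key, while any other error should abort
 * the load as corruption.
 *
 *   int error;
 *   robj *val = rdbLoadObject(type, rdb, key, dbid, &error);
 *   if (val == NULL) {
 *       if (error == RDB_LOAD_ERR_EMPTY_KEY) {
 *           // tolerate: the serialized key turned out to have no elements
 *       } else {
 *           return C_ERR;       // RDB_LOAD_ERR_OTHER: corrupt payload
 *       }
 *   }
 */
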
ssize_t rdbWriteRaw(rio *rdb, void *p, size_t len);
int rdbSaveType(rio *rdb, unsigned char type);
int rdbLoadType(rio *rdb);
time_t rdbLoadTime(rio *rdb);
int rdbSaveLen(rio *rdb, uint64_t len);
ssize_t rdbSaveMillisecondTime(rio *rdb, long long t);
long long rdbLoadMillisecondTime(rio *rdb, int rdbver);
uint64_t rdbLoadLen(rio *rdb, int *isencoded);
int rdbLoadLenByRef(rio *rdb, int *isencoded, uint64_t *lenptr);
int rdbSaveObjectType(rio *rdb, robj *o);
int rdbLoadObjectType(rio *rdb);
int rdbLoad(char *filename, rdbSaveInfo *rsi, int rdbflags);
int rdbSaveBackground(int req, char *filename, rdbSaveInfo *rsi, int rdbflags);
int rdbSaveToSlavesSockets(int req, rdbSaveInfo *rsi);
void rdbRemoveTempFile(pid_t childpid, int from_signal);
int rdbSaveToFile(const char *filename);
int rdbSave(int req, char *filename, rdbSaveInfo *rsi, int rdbflags);
ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid);
size_t rdbSavedObjectLen(robj *o, robj *key, int dbid);
robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error);
void backgroundSaveDoneHandler(int exitcode, int bysignal);
int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, long long expiretime, int dbid);
ssize_t rdbSaveSingleModuleAux(rio *rdb, int when, moduleType *mt);
robj *rdbLoadCheckModuleValue(rio *rdb, char *modulename);
robj *rdbLoadStringObject(rio *rdb);
ssize_t rdbSaveStringObject(rio *rdb, robj *obj);
ssize_t rdbSaveRawString(rio *rdb, unsigned char *s, size_t len);
void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr);
int rdbSaveBinaryDoubleValue(rio *rdb, double val);
int rdbLoadBinaryDoubleValue(rio *rdb, double *val);
int rdbSaveBinaryFloatValue(rio *rdb, float val);
int rdbLoadBinaryFloatValue(rio *rdb, float *val);
int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi);
int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx);
int rdbFunctionLoad(rio *rdb, int ver, functionsLibCtx* lib_ctx, int rdbflags, sds *err);
int rdbSaveRio(int req, rio *rdb, int *error, int rdbflags, rdbSaveInfo *rsi);
ssize_t rdbSaveFunctions(rio *rdb);
rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi);
#endif